server/src/app/Console/Commands/ImportCocoonRDF.php
changeset 508 2cb514f10a72
parent 506 8a5bb4b48b85
child 513 dad9471f0d63
equal deleted inserted replaced
507:a56a807f5d8e 508:2cb514f10a72
    31     /**
    31     /**
    32     * The name and signature of the console command.
    32     * The name and signature of the console command.
    33     *
    33     *
    34     * @var string
    34     * @var string
    35     */
    35     */
    36     protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';
    36     protected $signature = "corpus-parole:importRDF
       
    37         {--skip=0 : Number of record to skip}
       
    38         {--no-raw : Do not record raw queries}
       
    39         {--no-raw-clear : Do not clear raw repository}
       
    40         {--clear : Clear repository}
       
    41         {--force-import : Overwrite document from import event if the repo version is more recent}
       
    42         {--keep-repo-doc : Keep the existing doc in repo (default is replace document)}
       
    43     ";
       
    44     //protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
    37 
    45 
    38     /**
    46     /**
    39      * Create a new command instance.
    47      * Create a new command instance.
    40      */
    48      */
    41     public function __construct() {
    49     public function __construct() {
    64             }
    72             }
    65         }
    73         }
    66         return $docTypes;
    74         return $docTypes;
    67     }
    75     }
    68 
    76 
       
    /**
     * Map one raw document into its output graphs.
     *
     * Resolves the document type through getDocTypes(), instantiates the
     * mapper registered for that type in MAPPER_CLASS_MAP, runs it and updates
     * the counters kept in $this->documentCount.
     *
     * @param mixed  $doc    the raw document, handed unchanged to getDocTypes() and to the mapper
     * @param string $docUri URI of the document; also used to identify it in log messages
     *
     * @return array two-element array [$docType, $outputGraphes]. $outputGraphes is an
     *               empty array when no mapper is registered for the document type
     *               ($docType is then null or the unmapped type).
     */
    public function mapDoc($doc, $docUri) {
        $inputDocTypes = $this->getDocTypes($doc, $docUri);

        // Only the first declared type is used to select the mapper.
        $docType = count($inputDocTypes) > 0 ? $inputDocTypes[0] : null;

        if (is_null($docType) || !array_key_exists($docType, ImportCocoonRDF::MAPPER_CLASS_MAP)) {
            $this->error("\nError processing $docUri : $docType unknown mapper");
            Log::error("Error processing $docUri : $docType unknown mapper");
            $this->documentCount['unknown'] += 1;
            // This method is not a loop body: hand back an empty graph list so
            // that the [$docType, $graphs] shape expected by callers is kept.
            return [$docType, []];
        }

        $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
        $mapper = new $mapperClass($doc, $docUri);

        try {
            $mapper->mapGraph();
        } catch (\Exception $e) {
            // A mapping failure is logged and counted; whatever the mapper
            // produced so far is still returned below.
            Log::error("Error processing $docUri : error mapping graph : $e");
            $this->documentCount['error'] += 1;
        }

        $this->documentCount['all'] += 1;
        $this->documentCount[$docType] = isset($this->documentCount[$docType])
            ? $this->documentCount[$docType] + 1
            : 1;

        return [$docType, $mapper->getOutputGraphes()];
    }
       
   107 
       
   108     public function mergeDocs($docType, $outputGraphes) {
       
   109 
       
   110         foreach ($outputGraphes as $mappedGraphKey => $mappedGraph) {
       
   111 
       
   112             $mappedGraphUri = $mappedGraph->getUri();
       
   113             try {
       
   114                 $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
       
   115             } catch (\Exception $e) {
       
   116                 $this->error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage() . "\n");
       
   117                 Log::error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage());
       
   118                 exit;
       
   119             }
       
   120 
       
   121             $mergedGraph = null;
       
   122             $doDelete = true;
       
   123 
       
   124             if($resDocs->isEmpty()) {
       
   125                 $mergedGraph = $mappedGraph;
       
   126                 $doDelete = false;
       
   127             } else {
       
   128                 $doDelete = true;
       
   129                 $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);
       
   130                 $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri);
       
   131 
       
   132                 if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
       
   133                     $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
       
   134                     $baseGraph = $resDocs;
       
   135                     $sourceGraph = $mappedGraph;
       
   136                 }
       
   137                 elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
       
   138                     $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
       
   139                     $baseGraph = $resDocs;
       
   140                     $sourceGraph = $mappedGraph;
       
   141                 }
       
   142                 else {
       
   143                     $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
       
   144                     $baseGraph = $mappedGraph;
       
   145                     $sourceGraph = $resDocs;
       
   146                 }
       
   147                 $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
       
   148                 if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
       
   149                     //graph are isomorphic no need to go farther for this graph
       
   150                     Log::info("Graph are isomorphic for $mappedGraphUri, skipping");
       
   151                     continue;
       
   152                 }
       
   153             }
       
   154 
       
   155             try {
       
   156                 if($doDelete) {
       
   157                     $this->gs->clear($mappedGraphUri);
       
   158                 }
       
   159                 $this->gs->insert($mergedGraph, $mappedGraphUri);
       
   160             }
       
   161             catch(\Exception $e) {
       
   162                 // just log not much we can do here...
       
   163                 $this->error("\nError on insert $mappedGraphUri : $e");
       
   164                 Log::error("Error on insert $mappedGraphUri : $e");
       
   165                 $code = $e->getCode();
       
   166                 $message = $e->getMessage();
       
   167                 if($e instanceof EasyRdf\Exception && stripos($message, 'timed out')>=0 && $insertTimeout<= ImportCocoonRDF::INSERT_TIMEOUT_RETRY) {
       
   168                     $this->info("\nThis is a timeout, we continue.");
       
   169                     Log::info("This is a timeout, we continue.");
       
   170                     $insertTimeouts++;
       
   171                     continue;
       
   172                 }
       
   173                 throw $e;
       
   174             }
       
   175         }
       
   176     }
       
   177 
       
   178     function getModified($graph) {
       
   179         // get first element of array
       
   180         $providedCHORes = $graph->allOfType('http://www.europeana.eu/schemas/edm/ProvidedCHO');
       
   181         $providedCHO = reset($providedCHORes);
       
   182         if($providedCHO === false) {
       
   183             $date = new \DateTime();
       
   184             $date->setTimestamp(0);
       
   185             return $date;
       
   186         }
       
   187         $modified = $providedCHO->getLiteral("<http://purl.org/dc/terms/modified>");
       
   188         if(is_null($modified)) {
       
   189             $date = new \DateTime();
       
   190             $date->setTimestamp(0);
       
   191             return $date;
       
   192         }
       
   193         return \DateTime::createFromFormat(\DateTime::W3C, $modified->getValue());
       
   194     }
       
   195 
    69 
   196 
    70     /**
   197     /**
    71      * Execute the console command.
   198      * Execute the console command.
    72      *
   199      *
    73      * @return mixed
   200      * @return mixed
    75     public function fire() {
   202     public function fire() {
    76 
   203 
    77         libxml_use_internal_errors(true);
   204         libxml_use_internal_errors(true);
    78 
   205 
    79         $skip = (int)$this->option('skip');
   206         $skip = (int)$this->option('skip');
    80         $raw = $this->option('raw');
   207         $raw = !$this->option('no-raw');
       
   208         $rawClear = !$this->option('no-raw-clear');
       
   209         $clear = $this->option('clear');
       
   210         $forceImport = $this->option('force-import');
       
   211         $keepRepoDoc = $this->option('keep-repo-doc');
    81 
   212 
    82         $this->comment("Skipping $skip records");
   213         $this->comment("Skipping $skip records");
    83         $this->comment("Recording raw queries: ".($raw?'TRUE':'FALSE'));
   214         $this->comment("Querying Cocoon: ".($raw?'TRUE':'FALSE'));
    84 
   215         $this->comment("Clear raw repository: ".($rawClear?'TRUE':'FALSE'));
    85         $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
   216         $this->comment("Clear repository: ".($clear?'TRUE':'FALSE'));
    86         $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));
   217         $this->comment("Keep existing document into repository: ".($keepRepoDoc?'TRUE':'FALSE'));
    87 
   218         $this->comment("Overwrite more recent document:".($forceImport?'TRUE':'FALSE'));
    88 
   219 
    89         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
   220         $this->gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
    90         $endpoint = new Endpoint($client);
   221         $this->gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));
    91 
   222 
    92         $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
   223         $this->documentCount = [
    93 
   224             'all' => 0,
    94         //TODO : treat timeout exceptions
   225             'unknown' => 0,
    95         $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
   226             'error' => 0,
       
   227             'raw_duplicates' => 0,
       
   228             'modified' => 0,
       
   229             'replaced' => 0
       
   230         ];
       
   231 
       
   232         if($raw) {
       
   233             $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
       
   234             $endpoint = new Endpoint($client);
       
   235 
       
   236             $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
       
   237 
       
   238             $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
       
   239             $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
       
   240 
       
   241             $insertTimeouts = 0;
       
   242 
       
   243             //Clear raw repository if asked
       
   244             if($rawClear) {
       
   245                 $this->gs_raw->clear("all");
       
   246             }
       
   247 
       
   248             foreach ($recs as $item) {
       
   249                 $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
       
   250                 $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];
       
   251 
       
   252                 $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
       
   253                 $message = "$identifier : $docRdfUrl";
       
   254                 if($recs->getNumRetrieved() <= $skip) {
       
   255                     $progressBar->setMessage("$message - Skipping");
       
   256                     $progressBar->advance();
       
   257                     continue;
       
   258                 }
       
   259                 $progressBar->setMessage($message);
       
   260                 $progressBar->advance();
       
   261 
       
   262                 $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
       
   263 
       
   264                 $docLoaded = false;
       
   265                 $loadRetry = 0;
       
   266                 $doc = null;
       
   267                 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
       
   268                     $loadRetry++;
       
   269                     try {
       
   270                         $doc = new \EasyRdf\Graph($docRdfUrl);
       
   271                         $doc->load();
       
   272                         $docLoaded = true;
       
   273                     }
       
   274                     //TODO: catch network exception - add error to database
       
   275                     catch(\Exception $e) {
       
   276                         $code = $e->getCode();
       
   277                         $message = $e->getMessage();
       
   278                         $this->info("\nError processing $identifier. code : $code, message: $message");
       
   279                         Log::debug("Error processing $identifier. code : $code, message: $message");
       
   280                         if($code == 400 || ($code == 0 && stripos($message, 'timed out')>=0) ) {
       
   281                             $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
   282                             Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
   283                             continue;
       
   284                         }
       
   285                         else {
       
   286                             $this->error("\nError processing $identifier ($docRdfUrl) : $e");
       
   287                             Log::error("Error processing $identifier ($docRdfUrl) : $e");
       
   288                             break;
       
   289                         }
       
   290                         //$this->error(print_r($e->getTraceAsString(),true));
       
   291                     }
       
   292                 }
       
   293                 if(!$docLoaded) {
       
   294                     $this->documentCount['error'] += 1;
       
   295                     continue;
       
   296                 }
       
   297 
       
   298                 $resDocsRaw = $this->gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
       
   299                 if($resDocsRaw->getBoolean()) {
       
   300                     $this->gs_raw->clear($docUri);
       
   301                     $this->documentCount['raw_duplicates'] += 1;
       
   302                 }
       
   303                 $this->gs_raw->insert($doc, $docUri);
       
   304             }
       
   305             $progressBar->setMessage("finished raw import");
       
   306             $progressBar->finish();
       
   307         }
       
   308 
       
   309         // $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
       
   310         //     GRAPH ?uri {
       
   311         //         ?s ?p ?o.
       
   312         //         ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>.
       
   313         //         FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>))
       
   314         //     }
       
   315         // }");
       
   316 
       
   317         if($clear) {
       
   318             $this->gs->clear("all");
       
   319         }
       
   320 
       
   321         $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
       
   322             GRAPH ?uri {
       
   323                 ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>.
       
   324             }
       
   325         }");
       
   326 
       
   327         $collectionCount = count($collectionDocsUris);
       
   328         $this->info("\nImporting $collectionCount Collections from raw repository");
       
   329         $progressBar = $this->output->createProgressBar($collectionCount);
    96         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   330         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
    97 
   331 
    98         $insertTimeouts = 0;
   332 
    99 
   333         foreach($collectionDocsUris as $docUriRes) {
   100         $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0, 'raw_duplicates' => 0];
   334             $docUri = $docUriRes->uri->getUri();
   101 
   335 
   102         foreach ($recs as $item) {
   336             $progressBar->setMessage("Importing collection $docUri.");
   103             $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
       
   104             $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];
       
   105 
       
   106             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
       
   107             $message = "$identifier : $docRdfUrl";
       
   108             if($recs->getNumRetrieved() <= $skip) {
       
   109                 $progressBar->setMessage("$message - Skipping");
       
   110                 $progressBar->advance();
       
   111                 continue;
       
   112             }
       
   113             $progressBar->setMessage($message);
       
   114             $progressBar->advance();
   337             $progressBar->advance();
   115 
   338 
   116             $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
   339             $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");
   117 
   340 
   118             $docLoaded = false;
   341             //map the doc
   119             $loadRetry = 0;
   342             list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
   120             $doc = null;
   343 
   121             while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
   344             //merge the result docs
   122                 $loadRetry++;
   345             $this->mergeDocs($docType, $mappedGraphes);
   123                 try {
   346 
   124                     $doc = new \EasyRdf\Graph($docRdfUrl);
   347         }
   125                     $doc->load();
   348 
   126                     $docLoaded = true;
   349         $progressBar->setMessage("finished raw import for collections.");
   127                 }
   350         $progressBar->finish();
   128                 //TODO: catch network exception - add error to database
   351 
   129                 catch(\Exception $e) {
   352         // list the existing documents
   130                     $code = $e->getCode();
   353         $providedCHODocsUris = [];
   131                     $message = $e->getMessage();
   354         $providedCHODocsUrisRes = $this->gs->query("SELECT distinct ?uri WHERE {
   132                     $this->info("\nError processing $identifier. code : $code, message: $message");
   355             GRAPH ?uri {
   133                     Log::debug("Error processing $identifier. code : $code, message: $message");
   356                 ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.europeana.eu/schemas/edm/ProvidedCHO>.
   134                     if($code == 400 || ($code == 0 && stripos($message, 'timed out')>=0) ) {
   357             }
   135                         $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
   358         }");
   136                         Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
   359 
   137                         continue;
   360         foreach($providedCHODocsUrisRes as $docUriRes) {
   138                     }
   361             array_push($providedCHODocsUris, $docUriRes->uri->getUri());
   139                     else {
   362         }
   140                         $this->error("\nError processing $identifier ($docRdfUrl) : $e");
   363 
   141                         Log::error("Error processing $identifier ($docRdfUrl) : $e");
   364         $this->info("\n\nWe have ".count($providedCHODocsUris)." providedCHO in database.\n");
   142                         break;
   365 
   143                     }
   366         $soundDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
   144                     //$this->error(print_r($e->getTraceAsString(),true));
   367             GRAPH ?uri {
   145                 }
   368                 ?s <http://purl.org/dc/elements/1.1/type> ?o.
   146             }
   369                 FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>))
   147             if(!$docLoaded) {
   370             }
   148                 $documentCounts['error'] += 1;
   371         }");
   149                 continue;
   372 
   150             }
   373         $soundCount = count($soundDocsUris);
   151 
   374         $this->info("\nImporting $soundCount Sound (or Moving Image) from raw repository\n");
   152             //insert raw
   375         $progressBar = $this->output->createProgressBar($soundCount);
   153             if($raw) {
   376         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   154                 $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
   377 
   155                 if($resDocsRaw->getBoolean()) {
   378 
   156                     $gs_raw->clear($docUri);
   379         foreach($soundDocsUris as $docUriRes) {
   157 
   380             $docUri = $docUriRes->uri->getUri();
   158                 }
   381 
   159                 $gs_raw->insert($doc, $docUri);
   382             $progressBar->setMessage("Importing Sound (or Moving Image) $docUri.");
   160             }
   383             $progressBar->advance();
   161 
   384 
   162             //map doc
   385             $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");
   163             $inputDocTypes = $this->getDocTypes($doc, $docUri);
   386 
   164 
   387             //map the doc
   165             $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;
   388             list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
   166 
   389             $firstGraph = reset($mappedGraphes); // first graph is main graph
   167             if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) {
   390             // remove it from list of existing graphes in repository
   168                 $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
   391             $firstGraphUri = $firstGraph->getUri();
   169                 Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
   392             if(($key = array_search($firstGraphUri, $providedCHODocsUris)) !== false) {
   170                 $documentCounts['unknown'] += 1;
   393                unset($providedCHODocsUris[$key]);
   171                 continue;
   394             }
   172             }
   395             //if asked, delete it from repository. check modified date
   173 
   396             //merge the result docs
   174             $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
       
   175             $mapper = new $mapperClass($doc, $docUri);
       
   176 
       
   177             try {
   397             try {
   178                 $mapper->mapGraph();
   398                 $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$firstGraphUri> { ?s ?p ?o }}");
   179             } catch (\Exception $e) {
   399             } catch (\Exception $e) {
   180                 Log::error("Error processing $identifier ($docRdfUrl) : error mapping graph : $e");
   400                 $this->error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage() . "\n");
   181                 $documentCounts['error'] += 1;
   401                 Log::error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage());
   182             }
   402                 exit;
   183             $documentCounts['all'] += 1;
   403             }
   184             $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;
   404             $doDelete = true;
   185 
   405             if($resDocs->isEmpty()) {
   186             $mappedGraphes = $mapper->getOutputGraphes();
   406                 $doDelete = false;
   187 
   407             } else {
   188             foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) {
   408                 // get modified from repo
   189 
   409                 $dateRepo = $this->getModified($resDocs);
   190                 $mappedGraphUri = $mappedGraph->getUri();
   410                 // get modified from import
   191                 try {
   411                 $dateImport = $this->getModified($firstGraph);
   192                     $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
   412 
   193                 } catch (\Exception $e) {
   413                 if($dateRepo > $dateImport) {
   194                     $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n");
   414                     $this->documentCount['modified'] += 1;
   195                     Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody());
   415                     $doDelete = $forceImport;
   196                     exit;
   416                 } else {
   197                 }
   417                     $doDelete = !$keepRepoDoc;
   198 
   418                 }
   199                 $mergedGraph = null;
   419 
   200                 $doDelete = true;
   420             }
   201 
   421 
   202                 if($resDocs->isEmpty()) {
   422             if($doDelete) {
   203                     $mergedGraph = $mappedGraph;
   423                 $this->documentCount['replaced'] += 1;
   204                     $doDelete = false;
   424                 $this->gs->clear($firstGraphUri);
   205                 }
   425             }
   206                 else {
   426 
   207                     $doDelete = true;
   427             $this->mergeDocs($docType, $mappedGraphes);
   208                     $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);
   428         }
   209                     $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri);
   429 
   210 
   430         $progressBar->setMessage("finished raw import for sounds.");
   211                     if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
       
   212                         $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
       
   213                         $baseGraph = $resDocs;
       
   214                         $sourceGraph = $mappedGraph;
       
   215                     }
       
   216                     elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
       
   217                         $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
       
   218                         $baseGraph = $resDocs;
       
   219                         $sourceGraph = $mappedGraph;
       
   220                     }
       
   221                     else {
       
   222                         $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
       
   223                         $baseGraph = $mappedGraph;
       
   224                         $sourceGraph = $resDocs;
       
   225                     }
       
   226                     $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
       
   227                     if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
       
   228                         //graph are isomorphic no need to go farther for this graph
       
   229                         Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
       
   230                         continue;
       
   231                     }
       
   232                 }
       
   233 
       
   234                 try {
       
   235                     if($doDelete) {
       
   236                         $gs->clear($mappedGraphUri);
       
   237                     }
       
   238                     $gs->insert($mergedGraph, $mappedGraphUri);
       
   239                 }
       
   240                 catch(\Exception $e) {
       
   241                     // just log not much we can do here...
       
   242                     $this->error("\nError on insert $identifier ($docRdfUrl) : $e");
       
   243                     Log::error("Error on insert $identifier ($docRdfUrl) : $e");
       
   244                     $code = $e->getCode();
       
   245                     $message = $e->getMessage();
       
   246                     if($e instanceof EasyRdf\Exception && stripos($message, 'timed out')>=0 && $insertTimeout<= ImportCocoonRDF::INSERT_TIMEOUT_RETRY) {
       
   247                         $this->info("\nThis is a timeout, we continue.");
       
   248                         Log::info("This is a timeout, we continue.");
       
   249                         $insertTimeouts++;
       
   250                         continue;
       
   251                     }
       
   252                     throw $e;
       
   253                 }
       
   254             }
       
   255         }
       
   256         $progressBar->setMessage("finished");
       
   257         $progressBar->finish();
   431         $progressBar->finish();
   258 
   432 
   259         $this->info("\nDocument count info: ");
   433 
   260         foreach ($documentCounts as $docType => $docCount) {
   434         $textDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
       
   435             GRAPH ?uri {
       
   436                 ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Text>.
       
   437             }
       
   438         }");
       
   439 
       
   440         $textCount = count($textDocsUris);
       
   441         $this->info("\n\nImporting $textCount text from raw repository\n");
       
   442         $progressBar = $this->output->createProgressBar($textCount);
       
   443         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
       
   444 
       
   445 
       
   446         foreach($textDocsUris as $docUriRes) {
       
   447             $docUri = $docUriRes->uri->getUri();
       
   448 
       
   449             $progressBar->setMessage("Importing Text $docUri.");
       
   450             $progressBar->advance();
       
   451 
       
   452             $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");
       
   453 
       
   454             //map the doc
       
   455             list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
       
   456 
       
   457             //merge the result docs
       
   458             $this->mergeDocs($docType, $mappedGraphes);
       
   459 
       
   460         }
       
   461 
       
   462         $progressBar->setMessage("finished raw import for text.");
       
   463         $progressBar->finish();
       
   464 
       
   465 
       
   466         // delete left overs from previous repository
       
   467         $this->info("\n\nThere is ".count($providedCHODocsUris)." documents left-over.\n");
       
   468         if(count($providedCHODocsUris) > 0 && $delete_old) {
       
   469             foreach($providedCHODocsUris as $graphUri) {
       
   470                 $this->gs->clear($graphUri);
       
   471             }
       
   472         }
       
   473 
       
   474         $this->info("\n\nDocument count info: ");
       
   475         foreach ($this->documentCount as $docType => $docCount) {
   261             if($docType == 'error' && $docCount > 0) {
   476             if($docType == 'error' && $docCount > 0) {
   262                 $this->error("$docType => $docCount");
   477                 $this->error("$docType => $docCount");
   263             } else {
   478             } else {
   264                 $this->info("$docType => $docCount");
   479                 $this->info("$docType => $docCount");
   265             }
   480             }