64 } |
72 } |
65 } |
73 } |
66 return $docTypes; |
74 return $docTypes; |
67 } |
75 } |
68 |
76 |
|
/**
 * Map a raw document into its output graphs.
 *
 * The document type is the first entry returned by getDocTypes(); it selects
 * the mapper class through ImportCocoonRDF::MAPPER_CLASS_MAP. The counters in
 * $this->documentCount ('unknown', 'error', 'all' and one entry per document
 * type) are updated along the way.
 *
 * @param mixed  $doc    the raw document, handed to getDocTypes() and to the mapper constructor
 * @param string $docUri URI of the document, used by the mapper and in error messages
 *
 * @return array a pair [$docType, $outputGraphes]; $outputGraphes is an empty
 *               array when no mapper is registered for the document type
 */
public function mapDoc($doc, $docUri) {
    $inputDocTypes = $this->getDocTypes($doc, $docUri);

    $docType = count($inputDocTypes) > 0 ? $inputDocTypes[0] : null;

    if (is_null($docType) || !array_key_exists($docType, ImportCocoonRDF::MAPPER_CLASS_MAP)) {
        $this->error("\nError processing $docUri : $docType unknown mapper");
        Log::error("Error processing $docUri : $docType unknown mapper");
        $this->documentCount['unknown'] += 1;
        // We are not inside a loop here: hand back an empty graph list so that
        // callers destructuring the result still get a well-formed pair.
        return [$docType, []];
    }

    $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
    $mapper = new $mapperClass($doc, $docUri);

    try {
        $mapper->mapGraph();
    } catch (\Exception $e) {
        // A mapping failure is logged and counted; whatever the mapper managed
        // to produce is still returned below.
        Log::error("Error processing $docUri : error mapping graph : $e");
        $this->documentCount['error'] += 1;
    }

    $this->documentCount['all'] += 1;
    $this->documentCount[$docType] = isset($this->documentCount[$docType])
        ? $this->documentCount[$docType] + 1
        : 1;

    return [$docType, $mapper->getOutputGraphes()];
}
|
107 |
|
/**
 * Merge mapped graphs into the target repository ($this->gs).
 *
 * For each graph, the graph currently stored under the same URI is fetched.
 * When nothing is stored the mapped graph is inserted as is; otherwise both
 * graphs are merged with the merger matching the document type, the stored
 * graph is cleared and the merged graph is inserted. Graphs whose merge
 * result is isomorphic to what is already stored are skipped.
 *
 * A failing query aborts the whole process with a non-zero exit status.
 * Insert failures recognised as timeouts are tolerated until the counter
 * exceeds ImportCocoonRDF::INSERT_TIMEOUT_RETRY; any other failure is rethrown.
 *
 * @param string|null $docType       type URI of the source document, as returned by mapDoc()
 * @param iterable    $outputGraphes mapped graphs; each must expose getUri()
 *
 * @return void
 *
 * @throws \Exception when an insert fails for a reason other than a tolerated timeout
 */
public function mergeDocs($docType, $outputGraphes) {
    // Insert timeouts tolerated so far; static so the cap applies across
    // successive calls made during a single command run.
    static $insertTimeouts = 0;

    foreach ($outputGraphes as $mappedGraph) {
        $mappedGraphUri = $mappedGraph->getUri();

        try {
            $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
        } catch (\Exception $e) {
            $this->error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage() . "\n");
            Log::error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage());
            // Abort the import, signalling the failure to the calling shell.
            exit(1);
        }

        // The stored graph only needs clearing when there is one.
        $doDelete = !$resDocs->isEmpty();

        if (!$doDelete) {
            $mergedGraph = $mappedGraph;
        } else {
            $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);

            // Collections and texts keep the stored graph as base; everything
            // else (sound / moving image) takes the freshly mapped graph as base.
            if ($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } else {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
                $baseGraph = $mappedGraph;
                $sourceGraph = $resDocs;
            }

            $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
            if (\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
                // Nothing would change in the repository: skip this graph.
                Log::info("Graph are isomorphic for $mappedGraphUri, skipping");
                continue;
            }
        }

        try {
            if ($doDelete) {
                $this->gs->clear($mappedGraphUri);
            }
            $this->gs->insert($mergedGraph, $mappedGraphUri);
        } catch (\Exception $e) {
            // just log not much we can do here...
            $this->error("\nError on insert $mappedGraphUri : $e");
            Log::error("Error on insert $mappedGraphUri : $e");

            // stripos() returns false when the needle is absent, so it must be
            // compared strictly.
            $isTimeout = $e instanceof \EasyRdf\Exception
                && stripos($e->getMessage(), 'timed out') !== false;

            if ($isTimeout && $insertTimeouts <= ImportCocoonRDF::INSERT_TIMEOUT_RETRY) {
                $this->info("\nThis is a timeout, we continue.");
                Log::info("This is a timeout, we continue.");
                $insertTimeouts++;
                continue;
            }
            throw $e;
        }
    }
}
|
177 |
|
/**
 * Return the dcterms:modified date of the first edm:ProvidedCHO resource of a graph.
 *
 * The Unix epoch (timestamp 0) is returned when the graph holds no ProvidedCHO
 * resource, when that resource carries no dcterms:modified literal, or when
 * the literal value cannot be parsed with the W3C date format. Callers can
 * therefore always compare the result with another date.
 *
 * @param object $graph graph exposing allOfType(); resources must expose getLiteral()
 *
 * @return \DateTimeInterface the modification date, or the epoch as fallback
 */
function getModified($graph) {
    $epoch = new \DateTime();
    $epoch->setTimestamp(0);

    // get first element of array
    $providedCHORes = $graph->allOfType('http://www.europeana.eu/schemas/edm/ProvidedCHO');
    $providedCHO = reset($providedCHORes);
    if ($providedCHO === false) {
        return $epoch;
    }

    $modified = $providedCHO->getLiteral("<http://purl.org/dc/terms/modified>");
    if (is_null($modified)) {
        return $epoch;
    }

    $value = $modified->getValue();
    // NOTE(review): typed date literals presumably already yield a date object
    // from getValue() - confirm against the RDF library; such values are
    // returned untouched.
    if ($value instanceof \DateTimeInterface) {
        return $value;
    }

    // createFromFormat() returns false on unparsable input; never leak that
    // to callers which compare the result with another date.
    $date = \DateTime::createFromFormat(\DateTime::W3C, (string) $value);

    return $date === false ? $epoch : $date;
}
|
195 |
69 |
196 |
70 /** |
197 /** |
71 * Execute the console command. |
198 * Execute the console command. |
72 * |
199 * |
73 * @return mixed |
200 * @return mixed |
75 public function fire() { |
202 public function fire() { |
76 |
203 |
77 libxml_use_internal_errors(true); |
204 libxml_use_internal_errors(true); |
78 |
205 |
79 $skip = (int)$this->option('skip'); |
206 $skip = (int)$this->option('skip'); |
80 $raw = $this->option('raw'); |
207 $raw = !$this->option('no-raw'); |
|
208 $rawClear = !$this->option('no-raw-clear'); |
|
209 $clear = $this->option('clear'); |
|
210 $forceImport = $this->option('force-import'); |
|
211 $keepRepoDoc = $this->option('keep-repo-doc'); |
81 |
212 |
82 $this->comment("Skipping $skip records"); |
213 $this->comment("Skipping $skip records"); |
83 $this->comment("Recording raw queries: ".($raw?'TRUE':'FALSE')); |
214 $this->comment("Querying Cocoon: ".($raw?'TRUE':'FALSE')); |
84 |
215 $this->comment("Clear raw repository: ".($rawClear?'TRUE':'FALSE')); |
85 $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url')); |
216 $this->comment("Clear repository: ".($clear?'TRUE':'FALSE')); |
86 $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw')); |
217 $this->comment("Keep existing document into repository: ".($keepRepoDoc?'TRUE':'FALSE')); |
87 |
218 $this->comment("Overwrite more recent document:".($forceImport?'TRUE':'FALSE')); |
88 |
219 |
89 $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url')); |
220 $this->gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url')); |
90 $endpoint = new Endpoint($client); |
221 $this->gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw')); |
91 |
222 |
92 $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance'); |
223 $this->documentCount = [ |
93 |
224 'all' => 0, |
94 //TODO : treat timeout exceptions |
225 'unknown' => 0, |
95 $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection()); |
226 'error' => 0, |
|
227 'raw_duplicates' => 0, |
|
228 'modified' => 0, |
|
229 'replaced' => 0 |
|
230 ]; |
|
231 |
|
232 if($raw) { |
|
233 $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url')); |
|
234 $endpoint = new Endpoint($client); |
|
235 |
|
236 $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance'); |
|
237 |
|
238 $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection()); |
|
239 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
|
240 |
|
241 $insertTimeouts = 0; |
|
242 |
|
243 //Clear raw repository if asked |
|
244 if($rawClear) { |
|
245 $this->gs_raw->clear("all"); |
|
246 } |
|
247 |
|
248 foreach ($recs as $item) { |
|
249 $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/"); |
|
250 $identifier = (string) $item->xpath('oai:header/oai:identifier')[0]; |
|
251 |
|
252 $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
|
253 $message = "$identifier : $docRdfUrl"; |
|
254 if($recs->getNumRetrieved() <= $skip) { |
|
255 $progressBar->setMessage("$message - Skipping"); |
|
256 $progressBar->advance(); |
|
257 continue; |
|
258 } |
|
259 $progressBar->setMessage($message); |
|
260 $progressBar->advance(); |
|
261 |
|
262 $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
|
263 |
|
264 $docLoaded = false; |
|
265 $loadRetry = 0; |
|
266 $doc = null; |
|
267 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { |
|
268 $loadRetry++; |
|
269 try { |
|
270 $doc = new \EasyRdf\Graph($docRdfUrl); |
|
271 $doc->load(); |
|
272 $docLoaded = true; |
|
273 } |
|
274 //TODO: catch network exception - add error to database |
|
275 catch(\Exception $e) { |
|
276 $code = $e->getCode(); |
|
277 $message = $e->getMessage(); |
|
278 $this->info("\nError processing $identifier. code : $code, message: $message"); |
|
279 Log::debug("Error processing $identifier. code : $code, message: $message"); |
|
280 if($code == 400 || ($code == 0 && stripos($message, 'timed out')>=0) ) { |
|
281 $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
282 Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
283 continue; |
|
284 } |
|
285 else { |
|
286 $this->error("\nError processing $identifier ($docRdfUrl) : $e"); |
|
287 Log::error("Error processing $identifier ($docRdfUrl) : $e"); |
|
288 break; |
|
289 } |
|
290 //$this->error(print_r($e->getTraceAsString(),true)); |
|
291 } |
|
292 } |
|
293 if(!$docLoaded) { |
|
294 $this->documentCount['error'] += 1; |
|
295 continue; |
|
296 } |
|
297 |
|
298 $resDocsRaw = $this->gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); |
|
299 if($resDocsRaw->getBoolean()) { |
|
300 $this->gs_raw->clear($docUri); |
|
301 $this->documentCount['raw_duplicates'] += 1; |
|
302 } |
|
303 $this->gs_raw->insert($doc, $docUri); |
|
304 } |
|
305 $progressBar->setMessage("finished raw import"); |
|
306 $progressBar->finish(); |
|
307 } |
|
308 |
|
309 // $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE { |
|
310 // GRAPH ?uri { |
|
311 // ?s ?p ?o. |
|
312 // ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>. |
|
313 // FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>)) |
|
314 // } |
|
315 // }"); |
|
316 |
|
317 if($clear) { |
|
318 $this->gs->clear("all"); |
|
319 } |
|
320 |
|
321 $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE { |
|
322 GRAPH ?uri { |
|
323 ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>. |
|
324 } |
|
325 }"); |
|
326 |
|
327 $collectionCount = count($collectionDocsUris); |
|
328 $this->info("\nImporting $collectionCount Collections from raw repository"); |
|
329 $progressBar = $this->output->createProgressBar($collectionCount); |
96 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
330 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
97 |
331 |
98 $insertTimeouts = 0; |
332 |
99 |
333 foreach($collectionDocsUris as $docUriRes) { |
100 $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0, 'raw_duplicates' => 0]; |
334 $docUri = $docUriRes->uri->getUri(); |
101 |
335 |
102 foreach ($recs as $item) { |
336 $progressBar->setMessage("Importing collection $docUri."); |
103 $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/"); |
|
104 $identifier = (string) $item->xpath('oai:header/oai:identifier')[0]; |
|
105 |
|
106 $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
|
107 $message = "$identifier : $docRdfUrl"; |
|
108 if($recs->getNumRetrieved() <= $skip) { |
|
109 $progressBar->setMessage("$message - Skipping"); |
|
110 $progressBar->advance(); |
|
111 continue; |
|
112 } |
|
113 $progressBar->setMessage($message); |
|
114 $progressBar->advance(); |
337 $progressBar->advance(); |
115 |
338 |
116 $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
339 $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}"); |
117 |
340 |
118 $docLoaded = false; |
341 //map the doc |
119 $loadRetry = 0; |
342 list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri); |
120 $doc = null; |
343 |
121 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { |
344 //merge the result docs |
122 $loadRetry++; |
345 $this->mergeDocs($docType, $mappedGraphes); |
123 try { |
346 |
124 $doc = new \EasyRdf\Graph($docRdfUrl); |
347 } |
125 $doc->load(); |
348 |
126 $docLoaded = true; |
349 $progressBar->setMessage("finished raw import for collections."); |
127 } |
350 $progressBar->finish(); |
128 //TODO: catch network exception - add error to database |
351 |
129 catch(\Exception $e) { |
352 // list the existing documents |
130 $code = $e->getCode(); |
353 $providedCHODocsUris = []; |
131 $message = $e->getMessage(); |
354 $providedCHODocsUrisRes = $this->gs->query("SELECT distinct ?uri WHERE { |
132 $this->info("\nError processing $identifier. code : $code, message: $message"); |
355 GRAPH ?uri { |
133 Log::debug("Error processing $identifier. code : $code, message: $message"); |
356 ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.europeana.eu/schemas/edm/ProvidedCHO>. |
134 if($code == 400 || ($code == 0 && stripos($message, 'timed out')>=0) ) { |
357 } |
135 $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
358 }"); |
136 Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
359 |
137 continue; |
360 foreach($providedCHODocsUrisRes as $docUriRes) { |
138 } |
361 array_push($providedCHODocsUris, $docUriRes->uri->getUri()); |
139 else { |
362 } |
140 $this->error("\nError processing $identifier ($docRdfUrl) : $e"); |
363 |
141 Log::error("Error processing $identifier ($docRdfUrl) : $e"); |
364 $this->info("\n\nWe have ".count($providedCHODocsUris)." providedCHO in database.\n"); |
142 break; |
365 |
143 } |
366 $soundDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE { |
144 //$this->error(print_r($e->getTraceAsString(),true)); |
367 GRAPH ?uri { |
145 } |
368 ?s <http://purl.org/dc/elements/1.1/type> ?o. |
146 } |
369 FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>)) |
147 if(!$docLoaded) { |
370 } |
148 $documentCounts['error'] += 1; |
371 }"); |
149 continue; |
372 |
150 } |
373 $soundCount = count($soundDocsUris); |
151 |
374 $this->info("\nImporting $soundCount Sound (or Moving Image) from raw repository\n"); |
152 //insert raw |
375 $progressBar = $this->output->createProgressBar($soundCount); |
153 if($raw) { |
376 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
154 $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); |
377 |
155 if($resDocsRaw->getBoolean()) { |
378 |
156 $gs_raw->clear($docUri); |
379 foreach($soundDocsUris as $docUriRes) { |
157 |
380 $docUri = $docUriRes->uri->getUri(); |
158 } |
381 |
159 $gs_raw->insert($doc, $docUri); |
382 $progressBar->setMessage("Importing Sound (or Moving Image) $docUri."); |
160 } |
383 $progressBar->advance(); |
161 |
384 |
162 //map doc |
385 $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}"); |
163 $inputDocTypes = $this->getDocTypes($doc, $docUri); |
386 |
164 |
387 //map the doc |
165 $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null; |
388 list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri); |
166 |
389 $firstGraph = reset($mappedGraphes); // first graph is main graph |
167 if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) { |
390 // remove it from list of existing graphes in repository |
168 $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper"); |
391 $firstGraphUri = $firstGraph->getUri(); |
169 Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper"); |
392 if(($key = array_search($firstGraphUri, $providedCHODocsUris)) !== false) { |
170 $documentCounts['unknown'] += 1; |
393 unset($providedCHODocsUris[$key]); |
171 continue; |
394 } |
172 } |
395 //if asked, delete it from repository. check modified date |
173 |
396 //merge the result docs |
174 $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType]; |
|
175 $mapper = new $mapperClass($doc, $docUri); |
|
176 |
|
177 try { |
397 try { |
178 $mapper->mapGraph(); |
398 $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$firstGraphUri> { ?s ?p ?o }}"); |
179 } catch (\Exception $e) { |
399 } catch (\Exception $e) { |
180 Log::error("Error processing $identifier ($docRdfUrl) : error mapping graph : $e"); |
400 $this->error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage() . "\n"); |
181 $documentCounts['error'] += 1; |
401 Log::error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage()); |
182 } |
402 exit; |
183 $documentCounts['all'] += 1; |
403 } |
184 $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1; |
404 $doDelete = true; |
185 |
405 if($resDocs->isEmpty()) { |
186 $mappedGraphes = $mapper->getOutputGraphes(); |
406 $doDelete = false; |
187 |
407 } else { |
188 foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) { |
408 // get modified from repo |
189 |
409 $dateRepo = $this->getModified($resDocs); |
190 $mappedGraphUri = $mappedGraph->getUri(); |
410 // get modified from import |
191 try { |
411 $dateImport = $this->getModified($firstGraph); |
192 $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}"); |
412 |
193 } catch (\Exception $e) { |
413 if($dateRepo > $dateImport) { |
194 $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n"); |
414 $this->documentCount['modified'] += 1; |
195 Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody()); |
415 $doDelete = $forceImport; |
196 exit; |
416 } else { |
197 } |
417 $doDelete = !$keepRepoDoc; |
198 |
418 } |
199 $mergedGraph = null; |
419 |
200 $doDelete = true; |
420 } |
201 |
421 |
202 if($resDocs->isEmpty()) { |
422 if($doDelete) { |
203 $mergedGraph = $mappedGraph; |
423 $this->documentCount['replaced'] += 1; |
204 $doDelete = false; |
424 $this->gs->clear($firstGraphUri); |
205 } |
425 } |
206 else { |
426 |
207 $doDelete = true; |
427 $this->mergeDocs($docType, $mappedGraphes); |
208 $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri); |
428 } |
209 $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri); |
429 |
210 |
430 $progressBar->setMessage("finished raw import for sounds."); |
211 if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) { |
|
212 $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger(); |
|
213 $baseGraph = $resDocs; |
|
214 $sourceGraph = $mappedGraph; |
|
215 } |
|
216 elseif ($docType == "http://purl.org/dc/dcmitype/Text") { |
|
217 $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger(); |
|
218 $baseGraph = $resDocs; |
|
219 $sourceGraph = $mappedGraph; |
|
220 } |
|
221 else { |
|
222 $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger(); |
|
223 $baseGraph = $mappedGraph; |
|
224 $sourceGraph = $resDocs; |
|
225 } |
|
226 $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri); |
|
227 if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) { |
|
228 //graph are isomorphic no need to go farther for this graph |
|
229 Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping"); |
|
230 continue; |
|
231 } |
|
232 } |
|
233 |
|
234 try { |
|
235 if($doDelete) { |
|
236 $gs->clear($mappedGraphUri); |
|
237 } |
|
238 $gs->insert($mergedGraph, $mappedGraphUri); |
|
239 } |
|
240 catch(\Exception $e) { |
|
241 // just log not much we can do here... |
|
242 $this->error("\nError on insert $identifier ($docRdfUrl) : $e"); |
|
243 Log::error("Error on insert $identifier ($docRdfUrl) : $e"); |
|
244 $code = $e->getCode(); |
|
245 $message = $e->getMessage(); |
|
246 if($e instanceof EasyRdf\Exception && stripos($message, 'timed out')>=0 && $insertTimeout<= ImportCocoonRDF::INSERT_TIMEOUT_RETRY) { |
|
247 $this->info("\nThis is a timeout, we continue."); |
|
248 Log::info("This is a timeout, we continue."); |
|
249 $insertTimeouts++; |
|
250 continue; |
|
251 } |
|
252 throw $e; |
|
253 } |
|
254 } |
|
255 } |
|
256 $progressBar->setMessage("finished"); |
|
257 $progressBar->finish(); |
431 $progressBar->finish(); |
258 |
432 |
259 $this->info("\nDocument count info: "); |
433 |
260 foreach ($documentCounts as $docType => $docCount) { |
434 $textDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE { |
|
435 GRAPH ?uri { |
|
436 ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Text>. |
|
437 } |
|
438 }"); |
|
439 |
|
440 $textCount = count($textDocsUris); |
|
441 $this->info("\n\nImporting $textCount text from raw repository\n"); |
|
442 $progressBar = $this->output->createProgressBar($textCount); |
|
443 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
|
444 |
|
445 |
|
446 foreach($textDocsUris as $docUriRes) { |
|
447 $docUri = $docUriRes->uri->getUri(); |
|
448 |
|
449 $progressBar->setMessage("Importing Text $docUri."); |
|
450 $progressBar->advance(); |
|
451 |
|
452 $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}"); |
|
453 |
|
454 //map the doc |
|
455 list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri); |
|
456 |
|
457 //merge the result docs |
|
458 $this->mergeDocs($docType, $mappedGraphes); |
|
459 |
|
460 } |
|
461 |
|
462 $progressBar->setMessage("finished raw import for text."); |
|
463 $progressBar->finish(); |
|
464 |
|
465 |
|
466 // delete left overs from previous repository |
|
467 $this->info("\n\nThere is ".count($providedCHODocsUris)." documents left-over.\n"); |
|
468 if(count($providedCHODocsUris) > 0 && $delete_old) { |
|
469 foreach($providedCHODocsUris as $graphUri) { |
|
470 $this->gs->clear($graphUri); |
|
471 } |
|
472 } |
|
473 |
|
474 $this->info("\n\nDocument count info: "); |
|
475 foreach ($this->documentCount as $docType => $docCount) { |
261 if($docType == 'error' && $docCount > 0) { |
476 if($docType == 'error' && $docCount > 0) { |
262 $this->error("$docType => $docCount"); |
477 $this->error("$docType => $docCount"); |
263 } else { |
478 } else { |
264 $this->info("$docType => $docCount"); |
479 $this->info("$docType => $docCount"); |
265 } |
480 } |