diff -r f2a40bbc27f6 -r eadaf0b8f02e server/src/app/Console/Commands/ImportCocoonRDF.php --- a/server/src/app/Console/Commands/ImportCocoonRDF.php Tue Nov 17 13:11:55 2015 +0100 +++ b/server/src/app/Console/Commands/ImportCocoonRDF.php Fri Nov 27 17:59:36 2015 +0100 @@ -14,6 +14,13 @@ const INSERT_TIMEOUT_RETRY = 5; + const MAPPER_CLASS_MAP = [ + "http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', + "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', + "http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper', + "http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper' + ]; + /** * The console command description. * @@ -26,7 +33,7 @@ * * @var string */ - protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}'; + protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}'; /** * Create a new command instance. @@ -36,6 +43,31 @@ } /** + * Get the list of dcmi types for the graph + */ + private function getDocTypes($doc, $docUri) { + + $res = $doc->resource($docUri); + $docTypes = []; + //foreach ($res->all("http://purl.org/dc/elements/1.1/type") as $resType) { + foreach ($res->all("dc11:type","resource") as $resType) { + $type = $resType->getUri(); + if(0 === strpos($type, 'http://purl.org/dc/dcmitype/')) { + $docTypes[] = $type; + } + } + + // if the doc type list is empty, check that we have a collection + if(empty($docTypes)) { + if(!empty($doc->allOfType('edm:Collection'))) { + $docTypes[] = "http://purl.org/dc/dcmitype/Collection"; + } + } + return $docTypes; + } + + + /** * Execute the console command. * * @return mixed @@ -45,10 +77,13 @@ libxml_use_internal_errors(true); $skip = (int)$this->option('skip'); + $raw = $this->option('raw'); $this->comment("Skipping $skip records"); + $this->comment("Recording raw queries: $raw"); $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url')); + $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url_raw'), Config::get('corpusparole.sesame_update_url_raw')); $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url')); @@ -62,6 +97,8 @@ $insertTimeouts = 0; + $documentCounts = ['all' => 0, 'unknown' => 0]; + foreach ($recs as $item) { $identifier = (string) $item->xpath('/record/header/identifier')[0]; @@ -77,44 +114,119 @@ $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); - $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); - if(!$resDocs->getBoolean()) { - $docLoaded = false; - $loadRetry = 0; - while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { - $loadRetry++; - try { - $doc = new \EasyRdf\Graph($docRdfUrl); - $doc->load(); - $docLoaded = true; + $docLoaded = false; + $loadRetry = 0; + $doc = null; + while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { + $loadRetry++; + try { + $doc = new \EasyRdf\Graph($docRdfUrl); + $doc->load(); + $docLoaded = true; + } + //TODO: catch network exception - add error to database + catch(\Exception $e) { + $code = $e->getCode(); + $message = $e->getMessage(); + $this->info("\nError processing $identifier. code : $code, message: $message"); + Log::debug("Error processing $identifier. code : $code, message: $message"); + if($code == 0 && stripos($message, 'timed out')>=0 ) { + $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); + Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); + continue; + } + else { + $this->error("\nError processing $identifier ($docRdfUrl) : $e"); + Log::error("Error processing $identifier ($docRdfUrl) : $e"); + break; } - //TODO: catch network exception - add error to database - catch(\Exception $e) { - $code = $e->getCode(); - $message = $e->getMessage(); - $this->debug("\nError processing $identifier. code : $code, message: $message"); - Log::debug("Error processing $identifier. code : $code, message: $message"); - if($code == 1 && stripos($message, 'timed out')>=0 ) { - $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); - Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); - continue; - } - else { - $this->error("\nError processing $identifier ($docRdfUrl) : $e"); - Log::error("Error processing $identifier ($docRdfUrl) : $e"); - break; - } - //$this->error(print_r($e->getTraceAsString(),true)); + //$this->error(print_r($e->getTraceAsString(),true)); + } + } + if(!$docLoaded) { + continue; + } + + //insert raw + if($raw) { + $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); + if($resDocsRaw->getBoolean()) { + $gs_raw->clear($docUri); + } + $gs_raw->insert($doc, $docUri); + } + + //map doc + $inputDocTypes = $this->getDocTypes($doc, $docUri); + + $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null; + + if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) { + $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper"); + Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper"); + $documentCounts['unknown'] += 1; + continue; + } + $documentCounts['all'] += 1; + $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1; + + $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType]; + $mapper = new $mapperClass($doc, $docUri); + + $mapper->mapGraph(); + $mappedGraphes = $mapper->getOutputGraphes(); + + foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) { + + $mappedGraphUri = $mappedGraph->getUri(); + try { + $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}"); + } catch (\Exception $e) { + $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n"); + Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody()); + exit; + } + + $mergedGraph = null; + $doDelete = true; + + if($resDocs->isEmpty()) { + $mergedGraph = $mappedGraph; + $doDelete = false; + } + else { + $doDelete = true; + $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri); + $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri); + + if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) { + $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger(); + $baseGraph = $resDocs; + $sourceGraph = $mappedGraph; + } + elseif ($docType == "http://purl.org/dc/dcmitype/Text") { + $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger(); + $baseGraph = $resDocs; + $sourceGraph = $mappedGraph; + } + else { + $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger(); + $baseGraph = $mappedGraph; + $sourceGraph = $resDocs; + } + $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri); + if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) { + //graph are isomorphic no need to go farther for this graph + Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping"); + continue; } } - if(!$docLoaded) { - continue; - } - //TODO: treat errors - $subjects = $doc->resources(); - $subject = reset($subjects)->getUri(); + try { - $gs->insert($doc, $subject); + if($doDelete) { + $gs->clear($mappedGraphUri); + } + $gs->insert($mergedGraph, $mappedGraphUri); } catch(\Exception $e) { // just log not much we can do here... @@ -134,5 +246,10 @@ } $progressBar->setMessage("finished"); $progressBar->finish(); + + $this->info("\nDocument count info: "); + foreach ($documentCounts as $docType => $docCount) { + $this->info("$docType => $docCount"); + } } }