--- a/server/src/app/Console/Commands/ImportCocoonRDF.php Tue Nov 17 13:11:55 2015 +0100
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php Fri Nov 27 17:59:36 2015 +0100
@@ -14,6 +14,13 @@
const INSERT_TIMEOUT_RETRY = 5;
+ /**
+  * Lookup table from a DCMI type URI to the fully-qualified name of the
+  * mapper class that is instantiated for documents of that type.
+  *
+  * Sound and MovingImage documents share the same mapper. A document whose
+  * type is not a key of this table is reported as "unknown mapper" and
+  * skipped by the import loop.
+  */
+ const MAPPER_CLASS_MAP = [
+ "http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
+ "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
+ "http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper',
+ "http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper'
+ ];
+
/**
* The console command description.
*
@@ -26,7 +33,7 @@
*
* @var string
*/
- protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
+ protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';
/**
* Create a new command instance.
@@ -36,6 +43,31 @@
}
/**
+ * Collect the DCMI type URIs attached to a document resource.
+ *
+ * Every "dc11:type" resource value of $docUri whose URI starts with the
+ * DCMI type namespace is kept, in the order the graph returns them. When
+ * none is found and the graph holds at least one resource typed
+ * edm:Collection, the DCMI Collection type is returned as the sole entry.
+ *
+ * @param object $doc    graph exposing resource() and allOfType(); callers
+ *                       in this command pass EasyRdf graphs
+ * @param string $docUri URI of the resource whose types are read
+ * @return array list of DCMI type URIs, possibly empty
+ */
+ private function getDocTypes($doc, $docUri) {
+     $dcmiPrefix = 'http://purl.org/dc/dcmitype/';
+     $docTypes = [];
+
+     foreach ($doc->resource($docUri)->all("dc11:type","resource") as $typeResource) {
+         $typeUri = $typeResource->getUri();
+         // keep only the types living in the DCMI type namespace
+         if(strpos($typeUri, $dcmiPrefix) !== 0) {
+             continue;
+         }
+         $docTypes[] = $typeUri;
+     }
+
+     if(!empty($docTypes)) {
+         return $docTypes;
+     }
+
+     // no explicit DCMI type: fall back on the presence of an edm:Collection
+     $collections = $doc->allOfType('edm:Collection');
+     return empty($collections) ? [] : ["http://purl.org/dc/dcmitype/Collection"];
+ }
+
+
+ /**
* Execute the console command.
*
* @return mixed
@@ -45,10 +77,13 @@
libxml_use_internal_errors(true);
$skip = (int)$this->option('skip');
+ $raw = $this->option('raw');
$this->comment("Skipping $skip records");
+ $this->comment("Recording raw queries: $raw");
$gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+ $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url_raw'), Config::get('corpusparole.sesame_update_url_raw'));
$client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
@@ -62,6 +97,8 @@
$insertTimeouts = 0;
+ $documentCounts = ['all' => 0, 'unknown' => 0];
+
foreach ($recs as $item) {
$identifier = (string) $item->xpath('/record/header/identifier')[0];
@@ -77,44 +114,119 @@
$docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
- $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
- if(!$resDocs->getBoolean()) {
- $docLoaded = false;
- $loadRetry = 0;
- while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
- $loadRetry++;
- try {
- $doc = new \EasyRdf\Graph($docRdfUrl);
- $doc->load();
- $docLoaded = true;
+ $docLoaded = false;
+ $loadRetry = 0;
+ $doc = null;
+ while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
+ $loadRetry++;
+ try {
+ $doc = new \EasyRdf\Graph($docRdfUrl);
+ $doc->load();
+ $docLoaded = true;
+ }
+ //TODO: catch network exception - add error to database
+ catch(\Exception $e) {
+ $code = $e->getCode();
+ $message = $e->getMessage();
+ $this->info("\nError processing $identifier. code : $code, message: $message");
+ Log::debug("Error processing $identifier. code : $code, message: $message");
+ if($code == 0 && stripos($message, 'timed out')>=0 ) {
+ $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
+ Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
+ continue;
+ }
+ else {
+ $this->error("\nError processing $identifier ($docRdfUrl) : $e");
+ Log::error("Error processing $identifier ($docRdfUrl) : $e");
+ break;
}
- //TODO: catch network exception - add error to database
- catch(\Exception $e) {
- $code = $e->getCode();
- $message = $e->getMessage();
- $this->debug("\nError processing $identifier. code : $code, message: $message");
- Log::debug("Error processing $identifier. code : $code, message: $message");
- if($code == 1 && stripos($message, 'timed out')>=0 ) {
- $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
- Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
- continue;
- }
- else {
- $this->error("\nError processing $identifier ($docRdfUrl) : $e");
- Log::error("Error processing $identifier ($docRdfUrl) : $e");
- break;
- }
- //$this->error(print_r($e->getTraceAsString(),true));
+ //$this->error(print_r($e->getTraceAsString(),true));
+ }
+ }
+ if(!$docLoaded) {
+ continue;
+ }
+
+ //insert raw
+ if($raw) {
+ $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
+ if($resDocsRaw->getBoolean()) {
+ $gs_raw->clear($docUri);
+ }
+ $gs_raw->insert($doc, $docUri);
+ }
+
+ //map doc
+ $inputDocTypes = $this->getDocTypes($doc, $docUri);
+
+ $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;
+
+ if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) {
+ $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
+ Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
+ $documentCounts['unknown'] += 1;
+ continue;
+ }
+ $documentCounts['all'] += 1;
+ $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;
+
+ $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
+ $mapper = new $mapperClass($doc, $docUri);
+
+ $mapper->mapGraph();
+ $mappedGraphes = $mapper->getOutputGraphes();
+
+ foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) {
+
+ $mappedGraphUri = $mappedGraph->getUri();
+ try {
+ $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
+ } catch (\Exception $e) {
+ $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n");
+ Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody());
+ exit;
+ }
+
+ $mergedGraph = null;
+ $doDelete = true;
+
+ if($resDocs->isEmpty()) {
+ $mergedGraph = $mappedGraph;
+ $doDelete = false;
+ }
+ else {
+ $doDelete = true;
+ $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);
+ $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri);
+
+ if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
+ $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
+ $baseGraph = $resDocs;
+ $sourceGraph = $mappedGraph;
+ }
+ elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
+ $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
+ $baseGraph = $resDocs;
+ $sourceGraph = $mappedGraph;
+ }
+ else {
+ $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
+ $baseGraph = $mappedGraph;
+ $sourceGraph = $resDocs;
+ }
+ $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
+ if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
+ //graph are isomorphic no need to go farther for this graph
+ Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
+ continue;
}
}
- if(!$docLoaded) {
- continue;
- }
- //TODO: treat errors
- $subjects = $doc->resources();
- $subject = reset($subjects)->getUri();
+
try {
- $gs->insert($doc, $subject);
+ if($doDelete) {
+ $gs->clear($mappedGraphUri);
+ }
+ $gs->insert($mergedGraph, $mappedGraphUri);
}
catch(\Exception $e) {
// just log not much we can do here...
@@ -134,5 +246,10 @@
}
$progressBar->setMessage("finished");
$progressBar->finish();
+
+ $this->info("\nDocument count info: ");
+ foreach ($documentCounts as $docType => $docCount) {
+ $this->info("$docType => $docCount");
+ }
}
}