server/src/app/Console/Commands/ImportCocoonRDF.php
changeset 19 eadaf0b8f02e
parent 18 f2a40bbc27f6
child 114 8af5ed0521a2
--- a/server/src/app/Console/Commands/ImportCocoonRDF.php	Tue Nov 17 13:11:55 2015 +0100
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php	Fri Nov 27 17:59:36 2015 +0100
@@ -14,6 +14,13 @@
 
     const INSERT_TIMEOUT_RETRY = 5;
 
+    const MAPPER_CLASS_MAP = [ // DCMI type URI => fully-qualified name of the mapper class used for documents of that type
+        "http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
+        "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', // NOTE(review): MovingImage reuses the Sound mapper - confirm this is intentional
+        "http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper',
+        "http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper'
+    ];
+
     /**
      * The console command description.
      *
@@ -26,7 +33,7 @@
     *
     * @var string
     */
-    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
+    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';
 
     /**
      * Create a new command instance.
@@ -36,6 +43,31 @@
     }
 
     /**
+     * Return the DCMI type URIs of $docUri in $doc: its dc11:type resources under http://purl.org/dc/dcmitype/, or only the DCMI Collection type when none is found but $doc contains an edm:Collection.
+     */
+    private function getDocTypes($doc, $docUri) {
+        // $doc must expose resource() and allOfType() (presumably an \EasyRdf\Graph - confirm); the result is a plain list of URI strings, possibly empty.
+        $res = $doc->resource($docUri);
+        $docTypes = [];
+        // "dc11:type" is the prefixed form of http://purl.org/dc/elements/1.1/type; the "resource" argument limits the values to resources, so getUri() is available on each.
+        foreach ($res->all("dc11:type","resource") as $resType) {
+            $type = $resType->getUri();
+            if(0 === strpos($type, 'http://purl.org/dc/dcmitype/')) {
+                $docTypes[] = $type;
+            }
+        }
+
+        // Fallback: no DCMI type was found on the resource; if the graph holds anything typed edm:Collection, report the DCMI Collection type instead.
+        if(empty($docTypes)) {
+            if(!empty($doc->allOfType('edm:Collection'))) {
+                $docTypes[] = "http://purl.org/dc/dcmitype/Collection";
+            }
+        }
+        return $docTypes;
+    }
+
+
+    /**
      * Execute the console command.
      *
      * @return mixed
@@ -45,10 +77,13 @@
         libxml_use_internal_errors(true);
 
         $skip = (int)$this->option('skip');
+        $raw = $this->option('raw');
 
         $this->comment("Skipping $skip records");
+        $this->comment("Recording raw queries: $raw");
 
         $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+        $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url_raw'), Config::get('corpusparole.sesame_update_url_raw'));
 
 
         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
@@ -62,6 +97,8 @@
 
         $insertTimeouts = 0;
 
+        $documentCounts = ['all' => 0, 'unknown' => 0];
+
         foreach ($recs as $item) {
 
             $identifier = (string) $item->xpath('/record/header/identifier')[0];
@@ -77,44 +114,119 @@
 
             $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
 
-            $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
-            if(!$resDocs->getBoolean()) {
-                $docLoaded = false;
-                $loadRetry = 0;
-                while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
-                    $loadRetry++;
-                    try {
-                        $doc = new \EasyRdf\Graph($docRdfUrl);
-                        $doc->load();
-                        $docLoaded = true;
+            $docLoaded = false;
+            $loadRetry = 0;
+            $doc = null;
+            while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
+                $loadRetry++;
+                try {
+                    $doc = new \EasyRdf\Graph($docRdfUrl);
+                    $doc->load();
+                    $docLoaded = true;
+                }
+                //TODO: catch network exception - add error to database
+                catch(\Exception $e) {
+                    $code = $e->getCode();
+                    $message = $e->getMessage();
+                    $this->info("\nError processing $identifier. code : $code, message: $message");
+                    Log::debug("Error processing $identifier. code : $code, message: $message");
+                    if($code == 0 && stripos($message, 'timed out')>=0 ) {
+                        $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                        Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                        continue;
+                    }
+                    else {
+                        $this->error("\nError processing $identifier ($docRdfUrl) : $e");
+                        Log::error("Error processing $identifier ($docRdfUrl) : $e");
+                        break;
                     }
-                    //TODO: catch network exception - add error to database
-                    catch(\Exception $e) {
-                        $code = $e->getCode();
-                        $message = $e->getMessage();
-                        $this->debug("\nError processing $identifier. code : $code, message: $message");
-                        Log::debug("Error processing $identifier. code : $code, message: $message");
-                        if($code == 1 && stripos($message, 'timed out')>=0 ) {
-                            $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
-                            Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
-                            continue;
-                        }
-                        else {
-                            $this->error("\nError processing $identifier ($docRdfUrl) : $e");
-                            Log::error("Error processing $identifier ($docRdfUrl) : $e");
-                            break;
-                        }
-                        //$this->error(print_r($e->getTraceAsString(),true));
+                    //$this->error(print_r($e->getTraceAsString(),true));
+                }
+            }
+            if(!$docLoaded) {
+                continue;
+            }
+
+            //insert raw
+            if($raw) {
+                $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
+                if($resDocsRaw->getBoolean()) {
+                    $gs_raw->clear($docUri);
+                }
+                $gs_raw->insert($doc, $docUri);
+            }
+
+            //map doc
+            $inputDocTypes = $this->getDocTypes($doc, $docUri);
+
+            $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;
+
+            if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) {
+                $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
+                Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
+                $documentCounts['unknown'] += 1;
+                continue;
+            }
+            $documentCounts['all'] += 1;
+            $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;
+
+            $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
+            $mapper = new $mapperClass($doc, $docUri);
+
+            $mapper->mapGraph();
+            $mappedGraphes = $mapper->getOutputGraphes();
+
+            foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) {
+
+                $mappedGraphUri = $mappedGraph->getUri();
+                try {
+                    $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
+                } catch (\Exception $e) {
+                    $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n");
+                    Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody());
+                    exit;
+                }
+
+                $mergedGraph = null;
+                $doDelete = true;
+
+                if($resDocs->isEmpty()) {
+                    $mergedGraph = $mappedGraph;
+                    $doDelete = false;
+                }
+                else {
+                    $doDelete = true;
+                    $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);
+                    $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri);
+
+                    if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
+                        $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
+                        $baseGraph = $resDocs;
+                        $sourceGraph = $mappedGraph;
+                    }
+                    elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
+                        $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
+                        $baseGraph = $resDocs;
+                        $sourceGraph = $mappedGraph;
+                    }
+                    else {
+                        $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
+                        $baseGraph = $mappedGraph;
+                        $sourceGraph = $resDocs;
+                    }
+                    $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
+                    if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
+                        //graph are isomorphic no need to go farther for this graph
+                        Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
+                        continue;
                     }
                 }
-                if(!$docLoaded) {
-                    continue;
-                }
-                //TODO: treat errors
-                $subjects = $doc->resources();
-                $subject = reset($subjects)->getUri();
+
                 try {
-                    $gs->insert($doc, $subject);
+                    if($doDelete) {
+                        $gs->clear($mappedGraphUri);
+                    }
+                    $gs->insert($mergedGraph, $mappedGraphUri);
                 }
                 catch(\Exception $e) {
                     // just log not much we can do here...
@@ -134,5 +246,10 @@
         }
         $progressBar->setMessage("finished");
         $progressBar->finish();
+
+        $this->info("\nDocument count info: ");
+        foreach ($documentCounts as $docType => $docCount) {
+            $this->info("$docType => $docCount");
+        }
     }
 }