try to solve #0025932 + try to improve indexing process reliability by retrying bnf label resolve queries
author ymh <ymh.work@gmail.com>
Fri, 10 Feb 2017 12:03:12 +0100
changeset 506 8a5bb4b48b85
parent 505 9175ea22f1b1
child 507 a56a807f5d8e
try to solve #0025932 + try to improve indexing process reliability by retrying bnf label resolve queries
server/src/app/Console/Commands/ImportCocoonRDF.php
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php
server/src/app/Repositories/RdfDocumentRepository.php
server/src/app/Services/BnfResolver.php
server/src/app/Services/BnfResolverTimeoutException.php
server/src/config/corpusparole.php
--- a/server/src/app/Console/Commands/ImportCocoonRDF.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php	Fri Feb 10 12:03:12 2017 +0100
@@ -97,7 +97,7 @@
 
         $insertTimeouts = 0;
 
-        $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0];
+        $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0, 'raw_duplicates' => 0];
 
         foreach ($recs as $item) {
             $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
@@ -154,6 +154,7 @@
                 $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
                 if($resDocsRaw->getBoolean()) {
                     $gs_raw->clear($docUri);
+
                 }
                 $gs_raw->insert($doc, $docUri);
             }
--- a/server/src/app/Console/Commands/IndexDocuments.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Fri Feb 10 12:03:12 2017 +0100
@@ -250,17 +250,33 @@
             return $res;
         }, []);
 
-        $labelsBnf = $this->bnfResolver->getLabels(
-            array_unique(array_reduce(
-                $sres,
-                function($r, $so) {
-                    if($so['type'] === Utils::SUBJECT_TYPE_BNF) {
-                        array_push($r, $so['uri']);
-                    }
-                    return $r;
-                },[]
-            ))
-        );
+        $labelsResolved = false;
+        $timeoutRetries = 0;
+        while(!$labelsResolved && $timeoutRetries < config('corpusparole.bnf_max_retries', 3)) {
+            try {
+                $labelsBnf = $this->bnfResolver->getLabels(
+                    array_unique(array_reduce(
+                        $sres,
+                        function($r, $so) {
+                            if($so['type'] === Utils::SUBJECT_TYPE_BNF) {
+                                array_push($r, $so['uri']);
+                            }
+                            return $r;
+                        },[]
+                    ))
+                );
+                $labelsResolved = true;
+            } catch(BnfResolverTimeoutException $e) {
+                Log::warning('IndexDocument: Resolve label timeout, will retry');
+                $timeoutRetries++;
+                continue;
+            }
+        }
+        if(!$labelsResolved) {
+            Log::error("IndexDocument: Some bnf labels not resolved (retry timeout: $timeoutRetries).");
+            $this->error("\nError resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
+            throw new \Exception("Error resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
+        }
         $labelsLexvo = $this->lexvoResolver->getNames(
             array_unique(array_reduce(
                 $sres,
@@ -586,9 +602,11 @@
      *
      * @return int (1 if sucess, 0 if error)
      */
-     private function indexBulk($docBodies)
-     {
-          $query_data = ['body' => []];
+     private function indexBulk($docBodies) {
+        if(empty($docBodies)) {
+            return;
+        }
+        $query_data = ['body' => []];
           foreach($docBodies as $docId => $docBody){
               $query_data['body'][] = [
                   'index' => [
@@ -665,6 +683,9 @@
                 $progressBar->setMessage($docId);
                 $progressBar->advance();
                 $doc = $this->documentRepository->get($docId);
+                if(is_null($doc)) {
+                    continue;
+                }
                 $docBody = $this->getDocBody($doc);
                 if($noBulk) {
                     $this->indexOne($docId, $docBody);
--- a/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php	Fri Feb 10 12:03:12 2017 +0100
@@ -109,7 +109,7 @@
         $isBaseSound = false;
         foreach ($baseRes->all("dc11:type","resource") as $resType) {
             $type = $resType->getUri();
-            if($type === 'http://purl.org/dc/dcmitype/Sound') {
+            if($type === 'http://purl.org/dc/dcmitype/Sound' || $type === 'http://purl.org/dc/dcmitype/MovingImage') {
                 $isBaseSound = true;
                 break;
             }
--- a/server/src/app/Repositories/RdfDocumentRepository.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Repositories/RdfDocumentRepository.php	Fri Feb 10 12:03:12 2017 +0100
@@ -96,7 +96,7 @@
 
     private function getResGraph($doc) {
 
-        if(empty((array)$doc)) {
+        if(empty((array)$doc) || !array_key_exists('uri', (array)$doc)) {
             return null;
         }
 
--- a/server/src/app/Services/BnfResolver.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Services/BnfResolver.php	Fri Feb 10 12:03:12 2017 +0100
@@ -3,6 +3,8 @@
 
 use Cache;
 use CorpusParole\Services\BnfResolverInterface;
+use CorpusParole\Services\BnfResolverTimeoutException;
+use EasyRdf;
 
 class BnfResolver implements BnfResolverInterface {
 
@@ -84,7 +86,20 @@
         }
         $query .= "}";
 
-        $docs = $this->sparqlClient->query($query);
+        try {
+            $docs = $this->sparqlClient->query($query);
+        } catch (EasyRdf\Exception $e) {
+            $code = 0;
+            if(method_exists($e, 'getCode')) {
+                $code = $e->getCode();
+            }
+            $message = $e->getMessage();
+            if($code == 400 || ($code == 0 && stripos($message, 'timed out')>=0) ) {
+                throw new BnfResolverTimeoutException("Query to bnf server timed out.");
+            }
+            // reraise the original exception
+            throw $e;
+        }
 
         $resultsRaw = [];
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Services/BnfResolverTimeoutException.php	Fri Feb 10 12:03:12 2017 +0100
@@ -0,0 +1,6 @@
+<?php
+namespace CorpusParole\Services;
+
+class BnfResolverTimeoutException extends \Exception {
+    // Marker exception with no extra state: BnfResolver throws it when a bnf SPARQL query is treated as timed out, so callers can catch it and retry.
+}
--- a/server/src/config/corpusparole.php	Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/config/corpusparole.php	Fri Feb 10 12:03:12 2017 +0100
@@ -137,6 +137,7 @@
     'bnf_max_ids' => 5,
     'bnf_query_url' => 'http://data.bnf.fr/sparql',
     'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
+    'bnf_max_retries' => 3,
 
     'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/',