try to solve #0025932 + try to improve indexing process reliability by retrying bnf label resolve queries
--- a/server/src/app/Console/Commands/ImportCocoonRDF.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php Fri Feb 10 12:03:12 2017 +0100
@@ -97,7 +97,7 @@
$insertTimeouts = 0;
- $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0];
+ $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0, 'raw_duplicates' => 0];
foreach ($recs as $item) {
$item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
@@ -154,6 +154,7 @@
$resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
if($resDocsRaw->getBoolean()) {
$gs_raw->clear($docUri);
+ $documentCounts['raw_duplicates']++; // an existing raw graph was cleared: count it as a duplicate
}
$gs_raw->insert($doc, $docUri);
}
--- a/server/src/app/Console/Commands/IndexDocuments.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Console/Commands/IndexDocuments.php Fri Feb 10 12:03:12 2017 +0100
@@ -250,17 +250,33 @@
return $res;
}, []);
- $labelsBnf = $this->bnfResolver->getLabels(
- array_unique(array_reduce(
- $sres,
- function($r, $so) {
- if($so['type'] === Utils::SUBJECT_TYPE_BNF) {
- array_push($r, $so['uri']);
- }
- return $r;
- },[]
- ))
- );
+ $labelsResolved = false;
+ $timeoutRetries = 0;
+ while(!$labelsResolved && $timeoutRetries < config('corpusparole.bnf_max_retries', 3)) {
+     try {
+         $labelsBnf = $this->bnfResolver->getLabels(
+             array_unique(array_reduce(
+                 $sres,
+                 function($r, $so) {
+                     if($so['type'] === Utils::SUBJECT_TYPE_BNF) {
+                         array_push($r, $so['uri']);
+                     }
+                     return $r;
+                 },[]
+             ))
+         );
+         $labelsResolved = true;
+     } catch(\CorpusParole\Services\BnfResolverTimeoutException $e) {
+         // Fully-qualified name: an unqualified one only matches if this file imports the class.
+         Log::warning('IndexDocument: Resolve label timeout, will retry');
+         $timeoutRetries++;
+     }
+ }
+ if(!$labelsResolved) {
+     Log::error("IndexDocument: Some bnf labels not resolved (retry timeout: $timeoutRetries).");
+     $this->error("\nError resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
+     throw new \Exception("Error resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
+ }
$labelsLexvo = $this->lexvoResolver->getNames(
array_unique(array_reduce(
$sres,
@@ -586,9 +602,11 @@
*
* @return int (1 if sucess, 0 if error)
*/
- private function indexBulk($docBodies)
- {
- $query_data = ['body' => []];
+ private function indexBulk($docBodies) {
+ if(empty($docBodies)) {
+     return 1; // nothing to index: report success, keeping the documented int return (1/0)
+ }
+ $query_data = ['body' => []];
foreach($docBodies as $docId => $docBody){
$query_data['body'][] = [
'index' => [
@@ -665,6 +683,9 @@
$progressBar->setMessage($docId);
$progressBar->advance();
$doc = $this->documentRepository->get($docId);
+ if(is_null($doc)) {
+ continue;
+ }
$docBody = $this->getDocBody($doc);
if($noBulk) {
$this->indexOne($docId, $docBody);
--- a/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php Fri Feb 10 12:03:12 2017 +0100
@@ -109,7 +109,7 @@
$isBaseSound = false;
foreach ($baseRes->all("dc11:type","resource") as $resType) {
$type = $resType->getUri();
- if($type === 'http://purl.org/dc/dcmitype/Sound') {
+ if($type === 'http://purl.org/dc/dcmitype/Sound' || $type === 'http://purl.org/dc/dcmitype/MovingImage') {
$isBaseSound = true;
break;
}
--- a/server/src/app/Repositories/RdfDocumentRepository.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Repositories/RdfDocumentRepository.php Fri Feb 10 12:03:12 2017 +0100
@@ -96,7 +96,7 @@
private function getResGraph($doc) {
- if(empty((array)$doc)) {
+ if(empty((array)$doc) || !array_key_exists('uri', (array)$doc)) {
return null;
}
--- a/server/src/app/Services/BnfResolver.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/app/Services/BnfResolver.php Fri Feb 10 12:03:12 2017 +0100
@@ -3,6 +3,8 @@
use Cache;
use CorpusParole\Services\BnfResolverInterface;
+use CorpusParole\Services\BnfResolverTimeoutException;
+use EasyRdf;
class BnfResolver implements BnfResolverInterface {
@@ -84,7 +86,20 @@
}
$query .= "}";
- $docs = $this->sparqlClient->query($query);
+ try {
+     $docs = $this->sparqlClient->query($query);
+ } catch (EasyRdf\Exception $e) {
+     // Report timeouts through a dedicated exception type; anything else is rethrown as-is.
+     $code = $e->getCode();
+     $message = $e->getMessage();
+     // stripos() returns false when the needle is absent, and `false >= 0`
+     // is true in PHP, so the match has to be tested with `!== false`.
+     if($code == 400 || ($code == 0 && stripos($message, 'timed out') !== false)) {
+         throw new BnfResolverTimeoutException("Query to bnf server timed out.", 0, $e);
+     }
+     // reraise the original exception
+     throw $e;
+ }
$resultsRaw = [];
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Services/BnfResolverTimeoutException.php Fri Feb 10 12:03:12 2017 +0100
@@ -0,0 +1,6 @@
+<?php
+namespace CorpusParole\Services;
+
+class BnfResolverTimeoutException extends \Exception {
+ // Marker exception with no behavior of its own: raised when a query to the bnf server times out.
+}
--- a/server/src/config/corpusparole.php Thu Feb 09 17:22:58 2017 +0100
+++ b/server/src/config/corpusparole.php Fri Feb 10 12:03:12 2017 +0100
@@ -137,6 +137,7 @@
'bnf_max_ids' => 5,
'bnf_query_url' => 'http://data.bnf.fr/sparql',
'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
+ 'bnf_max_retries' => 3,
'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/',