# HG changeset patch # User ymh # Date 1486724592 -3600 # Node ID 8a5bb4b48b8592857a4a96a4b30221f8196b8b97 # Parent 9175ea22f1b163e22301a35a3dc330dd15bf1b31 try to solve #0025932 + try to improve indexing process reliability by retrying bnf label resolve queries diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Console/Commands/ImportCocoonRDF.php --- a/server/src/app/Console/Commands/ImportCocoonRDF.php Thu Feb 09 17:22:58 2017 +0100 +++ b/server/src/app/Console/Commands/ImportCocoonRDF.php Fri Feb 10 12:03:12 2017 +0100 @@ -97,7 +97,7 @@ $insertTimeouts = 0; - $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0]; + $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0, 'raw_duplicates' => 0]; foreach ($recs as $item) { $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/"); @@ -154,6 +154,7 @@ $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); if($resDocsRaw->getBoolean()) { $gs_raw->clear($docUri); + } $gs_raw->insert($doc, $docUri); } diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Thu Feb 09 17:22:58 2017 +0100 +++ b/server/src/app/Console/Commands/IndexDocuments.php Fri Feb 10 12:03:12 2017 +0100 @@ -250,17 +250,33 @@ return $res; }, []); - $labelsBnf = $this->bnfResolver->getLabels( - array_unique(array_reduce( - $sres, - function($r, $so) { - if($so['type'] === Utils::SUBJECT_TYPE_BNF) { - array_push($r, $so['uri']); - } - return $r; - },[] - )) - ); + $labelsResolved = false; + $timeoutRetries = 0; + while(!$labelsResolved && $timeoutRetries < config('corpusparole.bnf_max_retries', 3)) { + try { + $labelsBnf = $this->bnfResolver->getLabels( + array_unique(array_reduce( + $sres, + function($r, $so) { + if($so['type'] === Utils::SUBJECT_TYPE_BNF) { + array_push($r, $so['uri']); + } + return $r; + },[] + )) + ); + $labelsResolved = true; + } catch(BnfResolverTimeoutException $e) { + 
Log::warning('IndexDocument: Resolve label timeout, will retry'); + $timeoutRetries++; + continue; + } + } + if(!$labelsResolved) { + Log::error("IndexDocument: Some bnf labels not resolved (retry timeout: $timeoutRetries)."); + $this->error("\nError resolving bnf labels (retry timeout: $timeoutRetries). Stopping"); + throw new \Exception("Error resolving bnf labels (retry timeout: $timeoutRetries). Stopping"); + } $labelsLexvo = $this->lexvoResolver->getNames( array_unique(array_reduce( $sres, @@ -586,9 +602,11 @@ * * @return int (1 if sucess, 0 if error) */ - private function indexBulk($docBodies) - { - $query_data = ['body' => []]; + private function indexBulk($docBodies) { + if(empty($docBodies)) { + return; + } + $query_data = ['body' => []]; foreach($docBodies as $docId => $docBody){ $query_data['body'][] = [ 'index' => [ @@ -665,6 +683,9 @@ $progressBar->setMessage($docId); $progressBar->advance(); $doc = $this->documentRepository->get($docId); + if(is_null($doc)) { + continue; + } $docBody = $this->getDocBody($doc); if($noBulk) { $this->indexOne($docId, $docBody); diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php --- a/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php Thu Feb 09 17:22:58 2017 +0100 +++ b/server/src/app/Libraries/Mergers/CocoonTextRdfMerger.php Fri Feb 10 12:03:12 2017 +0100 @@ -109,7 +109,7 @@ $isBaseSound = false; foreach ($baseRes->all("dc11:type","resource") as $resType) { $type = $resType->getUri(); - if($type === 'http://purl.org/dc/dcmitype/Sound') { + if($type === 'http://purl.org/dc/dcmitype/Sound' || $type === 'http://purl.org/dc/dcmitype/MovingImage') { $isBaseSound = true; break; } diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Repositories/RdfDocumentRepository.php --- a/server/src/app/Repositories/RdfDocumentRepository.php Thu Feb 09 17:22:58 2017 +0100 +++ b/server/src/app/Repositories/RdfDocumentRepository.php Fri Feb 10 12:03:12 2017 +0100 @@ -96,7 +96,7 @@ private 
function getResGraph($doc) { - if(empty((array)$doc)) { + if(empty((array)$doc) || !array_key_exists('uri', (array)$doc)) { return null; } diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Services/BnfResolver.php --- a/server/src/app/Services/BnfResolver.php Thu Feb 09 17:22:58 2017 +0100 +++ b/server/src/app/Services/BnfResolver.php Fri Feb 10 12:03:12 2017 +0100 @@ -3,6 +3,8 @@ use Cache; use CorpusParole\Services\BnfResolverInterface; +use CorpusParole\Services\BnfResolverTimeoutException; +use EasyRdf; class BnfResolver implements BnfResolverInterface { @@ -84,7 +86,21 @@ } $query .= "}"; - $docs = $this->sparqlClient->query($query); + try { + $docs = $this->sparqlClient->query($query); + } catch (EasyRdf\Exception $e) { + $code = 0; + if(method_exists($e, 'getCode')) { + $code = $e->getCode(); + } + $message = $e->getMessage(); + // stripos() returns false (not -1) when the needle is absent, and false >= 0 is true in PHP, so test strictly against false + if($code == 400 || ($code == 0 && stripos($message, 'timed out') !== false) ) { + throw new BnfResolverTimeoutException("Query to bnf server timed out."); + } + // reraise the original exception + throw $e; + } $resultsRaw = []; diff -r 9175ea22f1b1 -r 8a5bb4b48b85 server/src/app/Services/BnfResolverTimeoutException.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/server/src/app/Services/BnfResolverTimeoutException.php Fri Feb 10 12:03:12 2017 +0100 @@ -0,0 +1,6 @@ + 5, 'bnf_query_url' => 'http://data.bnf.fr/sparql', 'bnf_completion_url' => 'http://data.bnf.fr/search-letter/', + 'bnf_max_retries' => 3, 'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/',