author | ymh <ymh.work@gmail.com> |
Fri, 09 Jun 2017 15:22:02 +0200 | |
changeset 531 | 48f5380c26d0 |
parent 526 | cdaf9dfb5dfd |
child 544 | ad58d7627f70 |
permissions | -rw-r--r-- |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
1 |
<?php |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
2 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
3 |
namespace CorpusParole\Console\Commands; |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
4 |
|
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
5 |
use Config; |
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
6 |
use Log; |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
7 |
use Illuminate\Console\Command; |
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
8 |
use Symfony\Component\Console\Input\InputOption; |
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
9 |
use Symfony\Component\Console\Input\InputArgument; |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
10 |
use Phpoaipmh\Client; |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
11 |
use Phpoaipmh\Endpoint; |
531
48f5380c26d0
Replace EasyRdf http loading with guzzle to solve proxy problems
ymh <ymh.work@gmail.com>
parents:
526
diff
changeset
|
12 |
use CorpusParole\Libraries\Sparql\GuzzleSparqlClient; |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
13 |
|
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
14 |
class ImportCocoonRDF extends Command { |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
15 |
|
18
f2a40bbc27f6
add rdf mapper + merger + basic database model
ymh <ymh.work@gmail.com>
parents:
4
diff
changeset
|
16 |
const INSERT_TIMEOUT_RETRY = 5; |
f2a40bbc27f6
add rdf mapper + merger + basic database model
ymh <ymh.work@gmail.com>
parents:
4
diff
changeset
|
17 |
|
19
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
18 |
const MAPPER_CLASS_MAP = [ |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
19 |
"http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
20 |
"http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
21 |
"http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
22 |
"http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper' |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
23 |
]; |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
24 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
25 |
/** |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
26 |
* The console command description. |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
27 |
* |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
28 |
* @var string |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
29 |
*/ |
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
30 |
protected $description = 'Import Rdf from Cocoon.'; |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
31 |
|
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
32 |
/** |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
33 |
* The name and signature of the console command. |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
34 |
* |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
35 |
* @var string |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
36 |
*/ |
508 | 37 |
protected $signature = "corpus-parole:importRDF |
38 |
{--skip=0 : Number of record to skip} |
|
39 |
{--no-raw : Do not record raw queries} |
|
40 |
{--no-raw-clear : Do not clear raw repository} |
|
41 |
{--clear : Clear repository} |
|
42 |
{--force-import : Overwrite document from import event if the repo version is more recent} |
|
43 |
{--keep-repo-doc : Keep the existing doc in repo (default is replace document)} |
|
44 |
"; |
|
45 |
//protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}'; |
|
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
46 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
47 |
/**
 * Build the command instance.
 *
 * Nothing is configured here: construction is delegated entirely to the
 * parent console command.
 */
public function __construct() {
    parent::__construct();
}
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
53 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
54 |
/**
 * Collect the DCMI type URIs declared for a document.
 *
 * Every "dc11:type" value of $docUri (restricted to resource values) is read
 * from $doc, and only the URIs starting with the DCMI type prefix are kept.
 * When none is found, the graph is checked for resources of type
 * "edm:Collection": if there is at least one, the DCMI Collection type is
 * returned as the only entry.
 *
 * @param object $doc    graph to inspect (must provide resource() and allOfType())
 * @param string $docUri URI of the document resource inside $doc
 * @return array list of DCMI type URIs, possibly empty
 */
private function getDocTypes($doc, $docUri) {
    $dcmiTypes = [];

    $typeResources = $doc->resource($docUri)->all("dc11:type", "resource");
    foreach ($typeResources as $typeResource) {
        $typeUri = $typeResource->getUri();
        if (strpos($typeUri, 'http://purl.org/dc/dcmitype/') !== 0) {
            // not a DCMI type, ignore it
            continue;
        }
        $dcmiTypes[] = $typeUri;
    }

    if (!empty($dcmiTypes)) {
        return $dcmiTypes;
    }

    // no DCMI type declared: fall back on the presence of an edm:Collection
    $collections = $doc->allOfType('edm:Collection');
    if (!empty($collections)) {
        $dcmiTypes[] = "http://purl.org/dc/dcmitype/Collection";
    }

    return $dcmiTypes;
}
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
77 |
|
508 | 78 |
/**
 * Map a raw document graph into its output graphs.
 *
 * The mapper class is looked up in MAPPER_CLASS_MAP with the first type
 * returned by getDocTypes(). The counters held in $this->documentCount are
 * updated as a side effect ('unknown', 'error', 'all' and one counter per
 * document type).
 *
 * @param object $doc    graph holding the raw document; handed as-is to the mapper
 * @param string $docUri URI of the document inside $doc
 * @return array two-element list:
 *               - [$docType, $outputGraphs] when the mapping succeeded,
 *               - ['unknown', null] when no mapper is registered for the document type,
 *               - ['error', null] when the mapper threw while mapping the graph.
 */
public function mapDoc($doc, $docUri) {
    $inputDocTypes = $this->getDocTypes($doc, $docUri);

    // only the first declared type drives the choice of the mapper
    $docType = count($inputDocTypes) > 0 ? $inputDocTypes[0] : null;

    if (is_null($docType) || !array_key_exists($docType, self::MAPPER_CLASS_MAP)) {
        // $docUri is the only document identifier available in this scope
        $this->error("\nError processing $docUri : $docType unknown mapper");
        Log::error("Error processing $docUri : $docType unknown mapper");
        $this->documentCount['unknown'] += 1;
        return ['unknown', null];
    }

    $mapperClass = self::MAPPER_CLASS_MAP[$docType];
    $mapper = new $mapperClass($doc, $docUri);

    try {
        $mapper->mapGraph();
    } catch (\Exception $e) {
        // reported on the console as well, like the unknown mapper case
        $this->error("\nError processing $docUri : error mapping graph : $e");
        Log::error("Error processing $docUri : error mapping graph : $e");
        $this->documentCount['error'] += 1;
        return ['error', null];
    }

    $this->documentCount['all'] += 1;
    $this->documentCount[$docType] = isset($this->documentCount[$docType]) ? $this->documentCount[$docType] + 1 : 1;

    return [$docType, $mapper->getOutputGraphes()];
}
|
109 |
||
110 |
/**
 * Merge mapped graphs into the main repository ($this->gs).
 *
 * For each mapped graph, the named graph with the same URI is fetched from
 * the repository:
 *  - when it is empty, the mapped graph is inserted as-is;
 *  - otherwise both graphs are merged with the merger matching the document
 *    type, and the repository graph is replaced by the merge result unless
 *    both are isomorphic (nothing to do in that case).
 *
 * A failing query on the repository terminates the process with exit
 * status 1. A failing insert is rethrown, except for timeouts reported as
 * \EasyRdf\Exception, which are tolerated while the process-wide timeout
 * counter has not exceeded INSERT_TIMEOUT_RETRY.
 *
 * @param string $docType       DCMI type URI of the source document
 * @param array  $outputGraphes mapped graphs to merge
 * @throws \Exception when an insert fails for any non tolerated reason
 */
public function mergeDocs($docType, $outputGraphes) {
    // Number of insert timeouts tolerated so far. Static so that the budget
    // is shared by every call made during the run.
    static $insertTimeouts = 0;

    foreach ($outputGraphes as $mappedGraph) {

        $mappedGraphUri = $mappedGraph->getUri();
        try {
            $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
        } catch (\Exception $e) {
            $this->error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage() . "\n");
            Log::error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage());
            // the repository is unusable: stop and report the failure to the shell
            exit(1);
        }

        // an existing graph must be cleared before the merge result is inserted
        $doDelete = !$resDocs->isEmpty();

        if (!$doDelete) {
            $mergedGraph = $mappedGraph;
        } else {
            $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);

            // The base graph is the one the merger starts from: the repository
            // version for collections and texts, the imported one otherwise.
            if ($docType === "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes, true)) {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } elseif ($docType === "http://purl.org/dc/dcmitype/Text") {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } else {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
                $baseGraph = $mappedGraph;
                $sourceGraph = $resDocs;
            }

            $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
            if (\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
                // graphs are isomorphic, no need to go further for this graph
                Log::info("Graph are isomorphic for $mappedGraphUri, skipping");
                continue;
            }
        }

        try {
            if ($doDelete) {
                $this->gs->clear($mappedGraphUri);
            }
            $this->gs->insert($mergedGraph, $mappedGraphUri);
        } catch (\Exception $e) {
            // just log, not much we can do here...
            $this->error("\nError on insert $mappedGraphUri : $e");
            Log::error("Error on insert $mappedGraphUri : $e");

            // The class name must be fully qualified: a relative name would be
            // resolved inside the current namespace and never match.
            // stripos() returns false when the needle is absent, hence the
            // strict comparison.
            $isTimeout = $e instanceof \EasyRdf\Exception
                && stripos($e->getMessage(), 'timed out') !== false;

            if ($isTimeout && $insertTimeouts <= self::INSERT_TIMEOUT_RETRY) {
                $this->info("\nThis is a timeout, we continue.");
                Log::info("This is a timeout, we continue.");
                $insertTimeouts++;
                continue;
            }
            throw $e;
        }
    }
}
|
179 |
||
180 |
/**
 * Read the modification date of the first edm:ProvidedCHO found in a graph.
 *
 * The date is taken from the "http://purl.org/dc/terms/modified" literal and
 * parsed with the W3C date format.
 *
 * @param object $graph graph to inspect (must provide allOfType())
 * @return \DateTime the modification date, or the Unix epoch (timestamp 0)
 *                   when the graph has no ProvidedCHO, when the ProvidedCHO
 *                   has no modified literal, or when the literal can not be
 *                   parsed
 */
public function getModified($graph) {
    // fallback value shared by every "no usable date" case
    $epoch = new \DateTime();
    $epoch->setTimestamp(0);

    // get first element of array
    $providedCHORes = $graph->allOfType('http://www.europeana.eu/schemas/edm/ProvidedCHO');
    $providedCHO = reset($providedCHORes);
    if ($providedCHO === false) {
        return $epoch;
    }

    $modified = $providedCHO->getLiteral("<http://purl.org/dc/terms/modified>");
    if (is_null($modified)) {
        return $epoch;
    }

    // createFromFormat() returns false on an unparsable value; never let that
    // reach the callers, which compare the result with other dates
    $date = \DateTime::createFromFormat(\DateTime::W3C, $modified->getValue());
    return $date === false ? $epoch : $date;
}
|
197 |
||
19
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
198 |
|
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
199 |
/**
 * Execute the console command.
 *
 * Steps, in order:
 *  1. unless --no-raw is given, list the records of the Cocoon OAI-PMH
 *     endpoint, download the RDF of each record and store it in the raw
 *     repository (one named graph per document);
 *  2. clear the main repository when --clear is given;
 *  3. map and merge into the main repository, from the raw repository, the
 *     collections, then the sounds / moving images, then the texts;
 *  4. report the ProvidedCHO graphs that were in the main repository but not
 *     in the sound import, and print the document counters.
 *
 * @return mixed nothing on normal completion; the process is terminated with
 *               exit status 1 when the main repository can not be queried
 */
public function handle() {

    libxml_use_internal_errors(true);

    $skip = (int)$this->option('skip');
    $raw = !$this->option('no-raw');
    $rawClear = !$this->option('no-raw-clear');
    $clear = $this->option('clear');
    $forceImport = $this->option('force-import');
    $keepRepoDoc = $this->option('keep-repo-doc');
    // Deleting left-over graphs is destructive, so it is only enabled through
    // an explicit "delete-old" option. It is read only if the signature
    // declares it, because asking for an undeclared option throws.
    $deleteOld = $this->input->hasOption('delete-old') && $this->option('delete-old');

    $this->comment("Skipping $skip records");
    $this->comment("Querying Cocoon: ".($raw?'TRUE':'FALSE'));
    $this->comment("Clear raw repository: ".($rawClear?'TRUE':'FALSE'));
    $this->comment("Clear repository: ".($clear?'TRUE':'FALSE'));
    $this->comment("Keep existing document into repository: ".($keepRepoDoc?'TRUE':'FALSE'));
    $this->comment("Overwrite more recent document:".($forceImport?'TRUE':'FALSE'));

    $this->httpClient = app()->make('Guzzle');
    $this->gs = new GuzzleSparqlClient($this->httpClient, Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
    $this->gs_raw = new GuzzleSparqlClient($this->httpClient, Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));

    $this->documentCount = [
        'all' => 0,
        'unknown' => 0,
        'error' => 0,
        'raw_duplicates' => 0,
        'modified' => 0,
        'replaced' => 0
    ];

    // ---- Step 1: harvest Cocoon into the raw repository ----
    if ($raw) {
        $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
        $endpoint = new Endpoint($client);

        $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');

        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        //Clear raw repository if asked
        if ($rawClear) {
            $this->gs_raw->clear("all");
        }

        foreach ($recs as $item) {
            $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
            $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];

            // part of the OAI identifier that follows the configured base
            $docIdSuffix = substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));

            $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').$docIdSuffix;
            $message = "$identifier : $docRdfUrl";
            if ($recs->getNumRetrieved() <= $skip) {
                $progressBar->setMessage("$message - Skipping");
                $progressBar->advance();
                continue;
            }
            $progressBar->setMessage($message);
            $progressBar->advance();

            $docUri = config('corpusparole.cocoon_doc_id_base_uri').$docIdSuffix;

            // download the RDF document, retrying on connection problems
            $docLoaded = false;
            $loadRetry = 0;
            $doc = null;
            while (!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
                $loadRetry++;
                try {
                    $resp = $this->httpClient->get($docRdfUrl);
                    $content_type = $resp->getHeader('Content-Type');
                    $format = null;
                    if (is_array($content_type) && count($content_type) > 0) {
                        list($format, ) = \EasyRdf\Utils::parseMimeType($content_type[0]);
                    }
                    $doc = new \EasyRdf\Graph($docRdfUrl, $resp->getBody(), $format);
                    $docLoaded = true;
                }
                //TODO: catch network exception - add error to database
                catch (\GuzzleHttp\Exception\ConnectException $e) {
                    $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                    Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                    continue;
                }
                catch (\GuzzleHttp\Exception\ClientException $e) {
                    if ($e->getResponse()->getStatusCode() == 400) {
                        // a 400 answer is retried like a timeout
                        $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        continue;
                    }
                    $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                    Log::error("Error processing $identifier ($docRdfUrl) : $e");
                    break;
                }
                catch (\Exception $e) {
                    $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                    Log::error("Error processing $identifier ($docRdfUrl) : $e");
                    break;
                }
            }
            if (!$docLoaded) {
                $this->documentCount['error'] += 1;
                continue;
            }

            // replace any graph already stored under the same URI
            $resDocsRaw = $this->gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
            if ($resDocsRaw->getBoolean()) {
                $this->gs_raw->clear($docUri);
                $this->documentCount['raw_duplicates'] += 1;
            }
            $this->gs_raw->insert($doc, $docUri);
        }
        $progressBar->setMessage("finished raw import");
        $progressBar->finish();
    }

    // ---- Step 2: clear the main repository if asked ----
    if ($clear) {
        $this->gs->clear("all");
    }

    // ---- Step 3a: collections ----
    $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>.
        }
    }");

    $collectionCount = count($collectionDocsUris);
    $this->info("\nImporting $collectionCount Collections from raw repository");
    $progressBar = $this->output->createProgressBar($collectionCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($collectionDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing collection $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        //map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);

        if ($docType === 'unknown' || $docType === 'error') {
            // The error has been traced in mapDoc
            continue;
        }

        //merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for collections.");
    $progressBar->finish();

    // ---- Step 3b: sounds and moving images ----
    // list the existing documents
    $providedCHODocsUris = [];
    $providedCHODocsUrisRes = $this->gs->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.europeana.eu/schemas/edm/ProvidedCHO>.
        }
    }");

    foreach ($providedCHODocsUrisRes as $docUriRes) {
        array_push($providedCHODocsUris, $docUriRes->uri->getUri());
    }

    $this->info("\n\nWe have ".count($providedCHODocsUris)." providedCHO in database.\n");

    $soundDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> ?o.
            FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>))
        }
    }");

    $soundCount = count($soundDocsUris);
    $this->info("\nImporting $soundCount Sound (or Moving Image) from raw repository\n");
    $progressBar = $this->output->createProgressBar($soundCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($soundDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing Sound (or Moving Image) $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        //map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);

        if ($docType === 'unknown' || $docType === 'error') {
            // The error has been traced in mapDoc, there is no graph to merge.
            // NOTE(review): the matching repository graph, if any, stays in the
            // left-over list below - confirm this is the wanted behaviour.
            continue;
        }

        $firstGraph = reset($mappedGraphes); // first graph is main graph
        if ($firstGraph === false) {
            // the mapper produced no graph at all: nothing to merge
            continue;
        }

        // remove it from list of existing graphes in repository
        $firstGraphUri = $firstGraph->getUri();
        if (($key = array_search($firstGraphUri, $providedCHODocsUris, true)) !== false) {
            unset($providedCHODocsUris[$key]);
        }

        //if asked, delete it from repository. check modified date
        try {
            $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$firstGraphUri> { ?s ?p ?o }}");
        } catch (\Exception $e) {
            $this->error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage() . "\n");
            Log::error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage());
            // the repository is unusable: stop and report the failure to the shell
            exit(1);
        }

        if ($resDocs->isEmpty()) {
            $doDelete = false;
        } else {
            // get modified from repo
            $dateRepo = $this->getModified($resDocs);
            // get modified from import
            $dateImport = $this->getModified($firstGraph);

            if ($dateRepo > $dateImport) {
                // the repository version is more recent than the imported one
                $this->documentCount['modified'] += 1;
                $doDelete = $forceImport;
            } else {
                $doDelete = !$keepRepoDoc;
            }
        }

        if ($doDelete) {
            $this->documentCount['replaced'] += 1;
            $this->gs->clear($firstGraphUri);
        }

        //merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for sounds.");
    $progressBar->finish();

    // ---- Step 3c: texts ----
    $textDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Text>.
        }
    }");

    $textCount = count($textDocsUris);
    $this->info("\n\nImporting $textCount text from raw repository\n");
    $progressBar = $this->output->createProgressBar($textCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($textDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing Text $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        //map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);

        if ($docType === 'unknown' || $docType === 'error') {
            // The error has been traced in mapDoc, there is no graph to merge
            continue;
        }

        //merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for text.");
    $progressBar->finish();

    // ---- Step 4: left-overs and counters ----
    // delete left overs from previous repository
    $this->info("\n\nThere is ".count($providedCHODocsUris)." documents left-over.\n");
    if (count($providedCHODocsUris) > 0 && $deleteOld) {
        foreach ($providedCHODocsUris as $graphUri) {
            $this->gs->clear($graphUri);
        }
    }

    $this->info("\n\nDocument count info: ");
    foreach ($this->documentCount as $docType => $docCount) {
        if ($docType === 'error' && $docCount > 0) {
            $this->error("$docType => $docCount");
        } else {
            $this->info("$docType => $docCount");
        }
    }
}
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
493 |
} |