author | ymh <ymh.work@gmail.com> |
Thu, 16 Feb 2017 16:10:07 +0100 | |
changeset 513 | dad9471f0d63 |
parent 508 | 2cb514f10a72 |
child 518 | 4864076bf0e3 |
permissions | -rw-r--r-- |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
1 |
<?php |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
2 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
3 |
namespace CorpusParole\Console\Commands; |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
4 |
|
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
5 |
use Config; |
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
6 |
use Log; |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
7 |
use Illuminate\Console\Command; |
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
8 |
use Symfony\Component\Console\Input\InputOption; |
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
9 |
use Symfony\Component\Console\Input\InputArgument; |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
10 |
use Phpoaipmh\Client; |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
11 |
use Phpoaipmh\Endpoint; |
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
12 |
|
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
13 |
class ImportCocoonRDF extends Command { |
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
14 |
|
18
f2a40bbc27f6
add rdf mapper + merger + basic database model
ymh <ymh.work@gmail.com>
parents:
4
diff
changeset
|
15 |
const INSERT_TIMEOUT_RETRY = 5; |
f2a40bbc27f6
add rdf mapper + merger + basic database model
ymh <ymh.work@gmail.com>
parents:
4
diff
changeset
|
16 |
|
19
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
17 |
const MAPPER_CLASS_MAP = [ |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
18 |
"http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
19 |
"http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
20 |
"http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper', |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
21 |
"http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper' |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
22 |
]; |
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
23 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
24 |
/** |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
25 |
* The console command description. |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
26 |
* |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
27 |
* @var string |
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
28 |
*/ |
4
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
29 |
protected $description = 'Import Rdf from Cocoon.'; |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
30 |
|
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
31 |
/** |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
32 |
* The name and signature of the console command. |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
33 |
* |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
34 |
* @var string |
f55970e41793
first skeleton of bo client in ember
ymh <ymh.work@gmail.com>
parents:
3
diff
changeset
|
35 |
*/ |
508 | 36 |
protected $signature = "corpus-parole:importRDF |
37 |
{--skip=0 : Number of record to skip} |
|
38 |
{--no-raw : Do not record raw queries} |
|
39 |
{--no-raw-clear : Do not clear raw repository} |
|
40 |
{--clear : Clear repository} |
|
41 |
{--force-import : Overwrite document from import event if the repo version is more recent} |
|
42 |
{--keep-repo-doc : Keep the existing doc in repo (default is replace document)} |
|
43 |
"; |
|
44 |
//protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}'; |
|
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
45 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
46 |
/**
 * Build the command.
 *
 * Nothing is configured here: the constructor only hands over to the
 * parent command constructor.
 */
public function __construct()
{
    parent::__construct();
}
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
52 |
|
2
00e2916104fe
Migrate to php 5.6 + Laravel 5.1 + add phpunit test
ymh <ymh.work@gmail.com>
parents:
1
diff
changeset
|
53 |
/**
 * Collect the DCMI type URIs attached to a document.
 *
 * Reads every "dc11:type" resource of $docUri in $doc and keeps the URIs
 * that start with the dcmitype namespace. When none is found and the graph
 * contains at least one edm:Collection, the DCMI Collection type is
 * reported instead.
 *
 * @param mixed  $doc    graph object exposing resource() and allOfType()
 * @param string $docUri URI of the document resource inside $doc
 * @return array list of DCMI type URIs (possibly empty)
 */
private function getDocTypes($doc, $docUri)
{
    $dcmiPrefix = 'http://purl.org/dc/dcmitype/';

    $docResource = $doc->resource($docUri);
    $found = [];

    foreach ($docResource->all("dc11:type", "resource") as $typeResource) {
        $typeUri = $typeResource->getUri();
        if (strpos($typeUri, $dcmiPrefix) !== 0) {
            // not a dcmitype URI, ignore it
            continue;
        }
        $found[] = $typeUri;
    }

    if (!empty($found)) {
        return $found;
    }

    // no explicit dcmi type: fall back on the presence of an edm:Collection
    if (!empty($doc->allOfType('edm:Collection'))) {
        $found[] = "http://purl.org/dc/dcmitype/Collection";
    }

    return $found;
}
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
76 |
|
508 | 77 |
/**
 * Map a raw document graph into its output graphs.
 *
 * The first DCMI type reported by getDocTypes() selects the mapper class
 * from MAPPER_CLASS_MAP. The counters in $this->documentCount are updated
 * ('unknown', 'error', 'all' and one entry per document type).
 *
 * @param mixed  $doc    raw document graph
 * @param string $docUri URI of the document inside $doc
 * @return array two-element array: the selected document type (null when
 *               none was found) and the list of mapped graphs. The list is
 *               empty when no mapper is registered for the type.
 */
public function mapDoc($doc, $docUri)
{
    $inputDocTypes = $this->getDocTypes($doc, $docUri);

    $docType = count($inputDocTypes) > 0 ? $inputDocTypes[0] : null;

    if (is_null($docType) || !array_key_exists($docType, self::MAPPER_CLASS_MAP)) {
        // Only $docUri is known in this scope; the caller's identifier and
        // RDF url are not available here.
        $this->error("\nError processing $docUri : $docType unknown mapper");
        Log::error("Error processing $docUri : $docType unknown mapper");
        $this->documentCount['unknown'] += 1;
        // We are not inside a loop: give the caller nothing to merge.
        return [$docType, []];
    }

    $mapperClass = self::MAPPER_CLASS_MAP[$docType];
    $mapper = new $mapperClass($doc, $docUri);

    try {
        $mapper->mapGraph();
    } catch (\Exception $e) {
        // The error is logged and counted; whatever the mapper produced
        // before failing is still returned below.
        Log::error("Error processing $docUri : error mapping graph : $e");
        $this->documentCount['error'] += 1;
    }

    $this->documentCount['all'] += 1;
    $this->documentCount[$docType] = isset($this->documentCount[$docType])
        ? $this->documentCount[$docType] + 1
        : 1;

    return [$docType, $mapper->getOutputGraphes()];
}
|
107 |
||
108 |
/**
 * Merge mapped graphs into the main repository ($this->gs).
 *
 * For each mapped graph the graph currently stored under the same URI is
 * fetched. When nothing is stored the mapped graph is inserted as is.
 * Otherwise both graphs are merged with the merger matching the document
 * type, and the stored graph is replaced unless the merge result is
 * isomorphic to it.
 *
 * Insert failures caused by a timeout are tolerated while the number of
 * timeouts seen so far does not exceed INSERT_TIMEOUT_RETRY; the affected
 * graph is then skipped. Any other insert failure is rethrown.
 *
 * @param string|null $docType       DCMI type URI of the source document
 * @param array       $outputGraphes mapped graphs to merge
 * @throws \Exception when an insert fails for a reason other than a
 *                    tolerated timeout
 */
public function mergeDocs($docType, $outputGraphes)
{
    $collectionType = "http://purl.org/dc/dcmitype/Collection";

    foreach ($outputGraphes as $mappedGraph) {

        $mappedGraphUri = $mappedGraph->getUri();
        try {
            $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
        } catch (\Exception $e) {
            $this->error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage() . "\n");
            Log::error("\nError on graph query $mappedGraphUri : $e \n" . $e->getMessage());
            // stop here, but report the failure to the calling shell
            exit(1);
        }

        // an existing graph must be cleared before the merged one is inserted
        $doDelete = !$resDocs->isEmpty();

        if (!$doDelete) {
            $mergedGraph = $mappedGraph;
        } else {
            $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);

            if ($docType == $collectionType || in_array($collectionType, $mappedTypes, true)) {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
                $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
                $baseGraph = $resDocs;
                $sourceGraph = $mappedGraph;
            } else {
                // sound / moving image: the freshly mapped graph is the base
                $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
                $baseGraph = $mappedGraph;
                $sourceGraph = $resDocs;
            }

            $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
            if (\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
                // graphs are isomorphic, no need to go further for this graph
                Log::info("Graph are isomorphic for $mappedGraphUri, skipping");
                continue;
            }
        }

        try {
            if ($doDelete) {
                $this->gs->clear($mappedGraphUri);
            }
            $this->gs->insert($mergedGraph, $mappedGraphUri);
        } catch (\Exception $e) {
            // just log, not much we can do here...
            $this->error("\nError on insert $mappedGraphUri : $e");
            Log::error("Error on insert $mappedGraphUri : $e");

            // The timeout counter lives on the instance so that it is shared
            // by every call made during one command run.
            $timeouts = isset($this->insertTimeouts) ? $this->insertTimeouts : 0;
            // stripos() returns false when the needle is absent
            $isTimeout = $e instanceof \EasyRdf\Exception
                && stripos($e->getMessage(), 'timed out') !== false;

            if ($isTimeout && $timeouts <= self::INSERT_TIMEOUT_RETRY) {
                $this->info("\nThis is a timeout, we continue.");
                Log::info("This is a timeout, we continue.");
                $this->insertTimeouts = $timeouts + 1;
                continue;
            }
            throw $e;
        }
    }
}
|
177 |
||
178 |
/**
 * Get the modification date of the first edm:ProvidedCHO of a graph.
 *
 * The date is read from the dcterms:modified literal. A date at timestamp 0
 * is returned when the graph has no ProvidedCHO, when the literal is
 * missing, or when its value cannot be parsed as a W3C date.
 *
 * @param mixed $graph graph object exposing allOfType()
 * @return \DateTime|\DateTimeInterface modification date
 */
public function getModified($graph)
{
    // fallback date used whenever no usable modification date is found
    $epoch = new \DateTime();
    $epoch->setTimestamp(0);

    // get first element of array
    $providedCHORes = $graph->allOfType('http://www.europeana.eu/schemas/edm/ProvidedCHO');
    $providedCHO = reset($providedCHORes);
    if ($providedCHO === false) {
        return $epoch;
    }

    $modified = $providedCHO->getLiteral("<http://purl.org/dc/terms/modified>");
    if (is_null($modified)) {
        return $epoch;
    }

    $value = $modified->getValue();
    // NOTE(review): typed xsd:dateTime literals presumably already carry a
    // date object here - confirm against the EasyRdf version in use.
    if ($value instanceof \DateTimeInterface) {
        return $value;
    }

    // createFromFormat() returns false when the value is not a W3C date
    $parsed = \DateTime::createFromFormat(\DateTime::W3C, (string) $value);

    return ($parsed === false) ? $epoch : $parsed;
}
|
195 |
||
19
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
196 |
|
eadaf0b8f02e
Bo conception step. back to ember page
ymh <ymh.work@gmail.com>
parents:
18
diff
changeset
|
197 |
/**
 * Execute the console command.
 *
 * Steps, in order:
 *  1. unless --no-raw is given, harvest the OAI-PMH records from Cocoon,
 *     load each RDF document and store it in the raw repository;
 *  2. clear the main repository when --clear is given;
 *  3. map and merge the collections found in the raw repository;
 *  4. map and merge the sound / moving image documents, replacing the
 *     stored graph according to --force-import and --keep-repo-doc;
 *  5. map and merge the text documents;
 *  6. report the ProvidedCHO graphs of the main repository that were not
 *     matched by an imported sound document. They are deleted only when a
 *     "delete-old" option is declared on the command and set;
 *  7. print the document counters.
 *
 * @return mixed
 */
public function fire()
{
    libxml_use_internal_errors(true);

    $skip = (int)$this->option('skip');
    $raw = !$this->option('no-raw');
    $rawClear = !$this->option('no-raw-clear');
    $clear = $this->option('clear');
    $forceImport = $this->option('force-import');
    $keepRepoDoc = $this->option('keep-repo-doc');
    // option() throws on an undeclared option, hence the hasOption() guard
    $deleteOld = $this->input->hasOption('delete-old') && $this->option('delete-old');

    $this->comment("Skipping $skip records");
    $this->comment("Querying Cocoon: ".($raw?'TRUE':'FALSE'));
    $this->comment("Clear raw repository: ".($rawClear?'TRUE':'FALSE'));
    $this->comment("Clear repository: ".($clear?'TRUE':'FALSE'));
    $this->comment("Keep existing document into repository: ".($keepRepoDoc?'TRUE':'FALSE'));
    $this->comment("Overwrite more recent document:".($forceImport?'TRUE':'FALSE'));

    $this->gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
    $this->gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));

    $this->documentCount = [
        'all' => 0,
        'unknown' => 0,
        'error' => 0,
        'raw_duplicates' => 0,
        'modified' => 0,
        'replaced' => 0
    ];

    // number of insert timeouts seen during this run
    $this->insertTimeouts = 0;

    if ($raw) {
        $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
        $endpoint = new Endpoint($client);

        $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');

        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        // Clear raw repository if asked
        if ($rawClear) {
            $this->gs_raw->clear("all");
        }

        foreach ($recs as $item) {
            $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
            $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];

            $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
            $message = "$identifier : $docRdfUrl";
            if ($recs->getNumRetrieved() <= $skip) {
                $progressBar->setMessage("$message - Skipping");
                $progressBar->advance();
                continue;
            }
            $progressBar->setMessage($message);
            $progressBar->advance();

            $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));

            $docLoaded = false;
            $loadRetry = 0;
            $doc = null;
            while (!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
                $loadRetry++;
                try {
                    $doc = new \EasyRdf\Graph($docRdfUrl);
                    $doc->load();
                    $docLoaded = true;
                }
                //TODO: catch network exception - add error to database
                catch (\Exception $e) {
                    $code = $e->getCode();
                    $errorMessage = $e->getMessage();
                    $this->info("\nError processing $identifier. code : $code, message: $errorMessage");
                    Log::debug("Error processing $identifier. code : $code, message: $errorMessage");
                    // stripos() returns false when the needle is absent
                    if ($code == 400 || ($code == 0 && stripos($errorMessage, 'timed out') !== false)) {
                        $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        continue;
                    }
                    $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                    Log::error("Error processing $identifier ($docRdfUrl) : $e");
                    break;
                }
            }
            if (!$docLoaded) {
                $this->documentCount['error'] += 1;
                continue;
            }

            // replace any graph already stored under the same URI
            $resDocsRaw = $this->gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
            if ($resDocsRaw->getBoolean()) {
                $this->gs_raw->clear($docUri);
                $this->documentCount['raw_duplicates'] += 1;
            }
            $this->gs_raw->insert($doc, $docUri);
        }
        $progressBar->setMessage("finished raw import");
        $progressBar->finish();
    }

    if ($clear) {
        $this->gs->clear("all");
    }

    // ---- collections ----
    $collectionDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Collection>.
        }
    }");

    $collectionCount = count($collectionDocsUris);
    $this->info("\nImporting $collectionCount Collections from raw repository");
    $progressBar = $this->output->createProgressBar($collectionCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($collectionDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing collection $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        // map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
        if (empty($mappedGraphes)) {
            // nothing was mapped (already logged and counted by mapDoc)
            continue;
        }

        // merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for collections.");
    $progressBar->finish();

    // ---- list the existing documents ----
    $providedCHODocsUris = [];
    $providedCHODocsUrisRes = $this->gs->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.europeana.eu/schemas/edm/ProvidedCHO>.
        }
    }");

    foreach ($providedCHODocsUrisRes as $docUriRes) {
        array_push($providedCHODocsUris, $docUriRes->uri->getUri());
    }

    $this->info("\n\nWe have ".count($providedCHODocsUris)." providedCHO in database.\n");

    // ---- sounds and moving images ----
    $soundDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> ?o.
            FILTER(?o IN (<http://purl.org/dc/dcmitype/Sound>, <http://purl.org/dc/dcmitype/MovingImage>))
        }
    }");

    $soundCount = count($soundDocsUris);
    $this->info("\nImporting $soundCount Sound (or Moving Image) from raw repository\n");
    $progressBar = $this->output->createProgressBar($soundCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($soundDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing Sound (or Moving Image) $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        // map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
        if (empty($mappedGraphes)) {
            // reset() on an empty list yields false: there is no main graph
            continue;
        }
        $firstGraph = reset($mappedGraphes); // first graph is main graph
        // remove it from list of existing graphes in repository
        $firstGraphUri = $firstGraph->getUri();
        if (($key = array_search($firstGraphUri, $providedCHODocsUris, true)) !== false) {
            unset($providedCHODocsUris[$key]);
        }

        // if asked, delete it from repository. check modified date
        try {
            $resDocs = $this->gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$firstGraphUri> { ?s ?p ?o }}");
        } catch (\Exception $e) {
            $this->error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage() . "\n");
            Log::error("\nError on graph query $firstGraphUri : $e \n" . $e->getMessage());
            // stop here, but report the failure to the calling shell
            exit(1);
        }

        if ($resDocs->isEmpty()) {
            $doDelete = false;
        } else {
            // get modified from repo
            $dateRepo = $this->getModified($resDocs);
            // get modified from import
            $dateImport = $this->getModified($firstGraph);

            if ($dateRepo > $dateImport) {
                // repository version is more recent than the imported one
                $this->documentCount['modified'] += 1;
                $doDelete = $forceImport;
            } else {
                $doDelete = !$keepRepoDoc;
            }
        }

        if ($doDelete) {
            $this->documentCount['replaced'] += 1;
            $this->gs->clear($firstGraphUri);
        }

        // merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for sounds.");
    $progressBar->finish();

    // ---- texts ----
    $textDocsUris = $this->gs_raw->query("SELECT distinct ?uri WHERE {
        GRAPH ?uri {
            ?s <http://purl.org/dc/elements/1.1/type> <http://purl.org/dc/dcmitype/Text>.
        }
    }");

    $textCount = count($textDocsUris);
    $this->info("\n\nImporting $textCount text from raw repository\n");
    $progressBar = $this->output->createProgressBar($textCount);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    foreach ($textDocsUris as $docUriRes) {
        $docUri = $docUriRes->uri->getUri();

        $progressBar->setMessage("Importing Text $docUri.");
        $progressBar->advance();

        $doc = $this->gs_raw->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$docUri> { ?s ?p ?o. }}");

        // map the doc
        list($docType, $mappedGraphes) = $this->mapDoc($doc, $docUri);
        if (empty($mappedGraphes)) {
            // nothing was mapped (already logged and counted by mapDoc)
            continue;
        }

        // merge the result docs
        $this->mergeDocs($docType, $mappedGraphes);
    }

    $progressBar->setMessage("finished raw import for text.");
    $progressBar->finish();

    // ---- left-overs from the previous repository content ----
    $this->info("\n\nThere is ".count($providedCHODocsUris)." documents left-over.\n");
    if (count($providedCHODocsUris) > 0 && $deleteOld) {
        foreach ($providedCHODocsUris as $graphUri) {
            $this->gs->clear($graphUri);
        }
    }

    // ---- summary ----
    $this->info("\n\nDocument count info: ");
    foreach ($this->documentCount as $countKey => $docCount) {
        if ($countKey == 'error' && $docCount > 0) {
            $this->error("$countKey => $docCount");
        } else {
            $this->info("$countKey => $docCount");
        }
    }
}
1
01a844d292ac
dev environment + first skeleton for bo
ymh <ymh.work@gmail.com>
parents:
diff
changeset
|
475 |
} |