151 return $res; |
184 return $res; |
152 |
185 |
153 } |
186 } |
154 |
187 |
155 /** |
/**
 * Build the subject entries indexed for a document.
 *
 * Each value returned by $doc->getSubjects() is classified as follows:
 *  - 'bnf'  : Resource whose URI matches config('corpusparole.bnf_ark_url_regexp')
 *  - 'lxv'  : Resource whose URI matches config('corpusparole.lexvo_url_regexp')
 *  - 'olac' : Literal whose datatype URI starts with config('corpusparole.olac_base_url')
 *  - 'txt'  : any other Literal
 * Values matching none of these cases are ignored.
 *
 * Labels of 'bnf' and 'lxv' subjects are fetched from the bnf and lexvo
 * resolvers; for 'olac' and 'txt' subjects the literal value is the label.
 * When a resolver gives no label for a URI, the label is null.
 *
 * @param mixed $doc document exposing getSubjects()
 * @return array list of [ 'label' => ..., 'code' => ..., 'label_code' => "label|type|code" ]
 */
private function getSubjects($doc) {

    $subjects = [];
    $bnfUris = [];
    $lexvoUris = [];

    foreach ($doc->getSubjects() as $s) {
        $m = [];
        if ($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) {
            $subjects[] = [ 'uri' => $m[0], 'code' => $m[1], 'type' => 'bnf' ];
            $bnfUris[] = $m[0];
        } elseif ($s instanceof Resource && preg_match(config('corpusparole.lexvo_url_regexp'), $s->getUri(), $m) === 1) {
            $subjects[] = [ 'uri' => $m[0], 'code' => $m[1], 'type' => 'lxv' ];
            $lexvoUris[] = $m[0];
        } elseif ($s instanceof Literal && strpos((string)$s->getDatatypeUri(), config('corpusparole.olac_base_url')) === 0) {
            // the cast keeps strpos() away from a null datatype (plain literal)
            $subjects[] = [ 'uri' => $s->getValue(), 'code' => $s->getValue(), 'type' => 'olac' ];
        } elseif ($s instanceof Literal) {
            $subjects[] = [ 'uri' => $s->getValue(), 'code' => $s->getValue(), 'type' => 'txt' ];
        }
    }

    // One resolver call per vocabulary, each URI being requested only once.
    $labelsBnf = $this->bnfResolver->getLabels(array_unique($bnfUris));
    $labelsLexvo = $this->lexvoResolver->getNames(array_unique($lexvoUris));

    return array_map(function($so) use ($labelsBnf, $labelsLexvo) {
        $label = $so['uri'];
        if ($so['type'] === 'bnf') {
            // isset() avoids an undefined index error when the resolver has no entry for this URI
            $label = isset($labelsBnf[$label]) ? $labelsBnf[$label] : null;
        } elseif ($so['type'] === 'lxv') {
            $label = isset($labelsLexvo[$label]) ? $labelsLexvo[$label] : null;
        }
        return [
            'label' => $label,
            'code' => $so['code'],
            'label_code' => $label."|".$so['type']."|".$so['code']
        ];
    }, $subjects);
}
|
260 |
|
/**
 * Read the WGS84 latitude and longitude literals of $loc from $graph.
 *
 * @param mixed $loc   resource whose coordinates are looked up
 * @param mixed $graph graph exposing getLiteral()
 * @return array|null [ latitude, longitude ], or null as soon as one of the
 *                    two literals is missing or has an empty() value
 */
private function graphResolvCoordinate($loc, $graph) {
    $predicates = [
        "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>",
        "<http://www.w3.org/2003/01/geo/wgs84_pos#long>",
    ];

    $pair = [];
    foreach ($predicates as $predicate) {
        $literal = $graph->getLiteral($loc, $predicate);
        if (is_null($literal)) {
            return null;
        }
        $value = $literal->getValue();
        if (empty($value)) {
            return null;
        }
        $pair[] = $value;
    }

    return $pair;
}
|
276 |
|
/**
 * Fetch $url with the HTTP client and parse the response body as a graph.
 *
 * Transfer and parsing failures are reported on the console and in the
 * log, and are never propagated to the caller.
 *
 * @param string $url  address of the document to load
 * @param string $type format name handed to the Graph constructor
 * @return Graph|null the parsed graph, or null when loading or parsing failed
 */
private function loadGraph($url, $type) {
    try {
        $response = $this->httpClient->get($url);
    } catch (TransferException $httpError) {
        $this->error("loadGraph : Error Loading $url");
        Log::error("loadGraph : Error Loading $url");
        Log::error("loadGraph : Error request " . Psr7\str($httpError->getRequest()));
        if (!$httpError->hasResponse()) {
            return null;
        }
        $this->error("loadGraph : Error response " . Psr7\str($httpError->getResponse()));
        Log::error("loadGraph : Error response " . Psr7\str($httpError->getResponse()));
        return null;
    }

    try {
        $payload = (string)$response->getBody();
        return new Graph($url, $payload, $type);
    } catch (EasyRdf\Exception $parseError) {
        $this->error("loadGraph : Error parsing $url");
        Log::error("loadGraph : Error parsing $url");
        // only parser exceptions carry a position in the document
        if ($parseError instanceof EasyRdf\Parser\Exception) {
            Log::error("loadGraph : Error exception line ".$parseError->getLine().", column: ".$parseError->getColumn());
        }
        $this->error("loadGraph : Error exception message ".$parseError->getMessage());
        Log::error("loadGraph : Error exception message ".$parseError->getMessage());
        Log::error("loadGraph : Error content $payload");
        return null;
    }
}
|
307 |
|
/**
 * Resolve the coordinates of a geonames location, with a 20 minute cache.
 *
 * A failed resolution is cached as false so that it is not retried until
 * the cache entry expires.
 *
 * @param string $loc geonames URL; "about.rdf" is appended to it to load the graph
 * @return array|null [ latitude, longitude ] or null when not resolved
 */
private function geonamesResolveCoordinates($loc) {
    $cacheKey = "corpus.geonames.coord.$loc";

    $resolved = cache($cacheKey);
    if (is_null($resolved)) {
        $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
        $resolved = null;
        if (!is_null($graph)) {
            $resolved = $this->graphResolvCoordinate($loc, $graph);
        }
        $toStore = is_null($resolved) ? false : $resolved;
        cache([$cacheKey => $toStore], Carbon::now()->addMinutes(20));
    }

    if ($resolved === false) {
        return null;
    }
    return $resolved;
}
|
317 |
|
/**
 * Resolve the coordinates of a dbpedia location, with a 20 minute cache.
 *
 * A failed resolution is cached as false so that it is not retried until
 * the cache entry expires.
 *
 * @param string $loc dbpedia URL; ".rdf" is appended to it to load the graph
 * @return array|null [ latitude, longitude ] or null when not resolved
 */
private function dbpediaResolveCoordinates($loc) {
    $cacheKey = "corpus.dbpedia.coord.$loc";

    $resolved = cache($cacheKey);
    if (is_null($resolved)) {
        $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
        $resolved = null;
        if (!is_null($graph)) {
            $resolved = $this->graphResolvCoordinate($loc, $graph);
        }
        $toStore = is_null($resolved) ? false : $resolved;
        cache([$cacheKey => $toStore], Carbon::now()->addMinutes(20));
    }

    if ($resolved === false) {
        return null;
    }
    return $resolved;
}
|
327 |
|
/**
 * Compute the [ latitude, longitude ] pair indexed for a document.
 *
 * The reference locations of the document geo info are matched against
 * the geonames and dbpedia URL patterns from the configuration. The
 * matching URLs are resolved one by one and the first resolved pair is
 * kept. When none resolves, the latitude / longitude values carried by
 * the geo info itself are used.
 *
 * @param mixed $doc document exposing getGeoInfo()
 * @return array|null [ (float) latitude, (float) longitude ], or null when the
 *                    document has no geo info or a coordinate is empty()
 */
private function getLocation($doc) {

    $geoRes = $doc->getGeoInfo();

    if (is_null($geoRes)) {
        return null;
    }

    // Candidate URLs grouped by resolver type, in order of first appearance.
    $locUrls = [];
    foreach ($geoRes->getRefLocs() as $loc) {
        $m = [];
        if (preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
            $locUrls['geonames'][] = "http://sws.geonames.org/{$m[1]}/";
        } elseif (preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $m) === 1) {
            $locUrls['dbpedia'][] = "http://{$m[1]}/data/{$m[4]}";
        }
    }

    $coordinates = null;
    foreach ($locUrls as $locType => $locList) {
        // dispatches to geonamesResolveCoordinates() / dbpediaResolveCoordinates()
        $resolver = $locType . 'ResolveCoordinates';
        foreach ($locList as $locationUrl) {
            $coordinates = $this->$resolver($locationUrl);
            if (!is_null($coordinates)) {
                // Leave BOTH loops: going on with the next location type would
                // overwrite (possibly with null) the pair that was just resolved.
                break 2;
            }
        }
    }

    if (is_null($coordinates)) {
        $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
    }

    // NOTE(review): empty() also rejects a coordinate equal to 0 - confirm this is intended.
    if (empty($coordinates[0]) || empty($coordinates[1])) {
        return null;
    }

    return [floatval($coordinates[0]), floatval($coordinates[1])];
}
|
375 |
|
/**
 * Convert the creation date of a document into its indexed form.
 *
 * The conversion depends on the datatype URI of the "created" literal:
 * dcterms Period values go through processPeriod(), dcterms W3CDTF
 * values through processDate(). Any other datatype yields null.
 *
 * @param mixed $doc document exposing getCreated()
 * @return mixed result of processPeriod() / processDate(), or null
 */
private function getCreationDate($doc) {

    $created = $doc->getCreated();
    if (is_null($created)) {
        return null;
    }

    $dateType = $created->getDatatypeUri();

    if ($dateType === "http://purl.org/dc/terms/Period") {
        return $this->processPeriod($created->getValue());
    }

    if ($dateType === "http://purl.org/dc/terms/W3CDTF") {
        return $this->processDate($created->getValue());
    }

    return null;
}
|
395 |
|
/**
 * Parse a date string with date_create().
 *
 * A string made of exactly four digits is first expanded to "<year>-1-1".
 *
 * @param string $dateStr date to parse
 * @return \DateTime|null the parsed date, or null (with a logged warning)
 *                        when date_create() rejects the string
 */
private function extractDate($dateStr) {
    $isBareYear = preg_match("/^\\d{4}$/", $dateStr) === 1;
    if ($isBareYear) {
        $dateStr = "$dateStr-1-1";
    }

    $parsed = date_create($dateStr);
    if ($parsed !== false) {
        return $parsed;
    }

    Log::warning("DateStatsController:extractYear bad format for date $dateStr");
    return null;
}
|
407 |
|
/**
 * Expand a period string into one W3C formatted date per covered year.
 *
 * The string is split on ";" and the elements starting with "start=" and
 * "end=" are parsed with extractDate(); only their year is kept. Every
 * year from start to end (inclusive) is rendered as the first of January
 * of that year at 00:00:00, which is how extractDate() treats a bare year.
 *
 * @param string $periodStr period description, e.g. "start=1950; end=1955;"
 * @return array|null list of W3C formatted dates, or null when a bound is
 *                    missing, cannot be parsed, or when start is after end
 */
private function processPeriod($periodStr) {
    $bounds = ['start' => null, 'end' => null];

    foreach (explode(";", $periodStr) as $elem) {
        $elem = trim($elem);
        foreach (['start', 'end'] as $name) {
            $prefix = "$name=";
            if (strpos($elem, $prefix) !== 0) {
                continue;
            }
            $date = $this->extractDate(trim(substr($elem, strlen($prefix))));
            if (is_null($date)) {
                return null;
            }
            $bounds[$name] = intval($date->format("Y"));
            break;
        }
    }

    $start = $bounds['start'];
    $end = $bounds['end'];

    if (is_null($start) || is_null($end) || $start > $end) {
        Log::warning("Bad format for $periodStr");
        return null;
    }

    return array_map(function($y) {
        // Pin month, day and time explicitly: parsing the year alone with the
        // "Y" format leaves every other field at the current system time.
        $firstDay = new \DateTime();
        $firstDay->setDate($y, 1, 1)->setTime(0, 0, 0);
        return $firstDay->format(\DateTime::W3C);
    }, range($start, $end));
}
|
443 |
|
/**
 * Parse a date string and render it in W3C format.
 *
 * @param string $dateStr date to parse, handed to extractDate()
 * @return string|null the formatted date, or null when extractDate() fails
 */
private function processDate($dateStr) {
    $parsed = $this->extractDate($dateStr);

    return is_null($parsed) ? null : $parsed->format(\DateTime::W3C);
}
|
452 |
|
/**
 * List the discourse types of a document as strings.
 *
 * A Resource contributes its URI. A Literal contributes its value,
 * prefixed with "<datatype>#" when it has a non-empty datatype URI.
 * Other values, and empty() results, are left out.
 *
 * @param mixed $doc document exposing getDiscourseTypes()
 * @return array list of strings
 */
private function getDiscourseTypes($doc) {
    $types = [];

    foreach ($doc->getDiscourseTypes() as $d) {
        if ($d instanceof Resource) {
            $val = $d->getUri();
        } elseif ($d instanceof Literal) {
            $datatype = $d->getDatatypeURI();
            $prefix = empty($datatype) ? "" : "$datatype#";
            $val = $prefix.$d->getValue();
        } else {
            continue;
        }

        if (!empty($val)) {
            $types[] = $val;
        }
    }

    return $types;
}
|
468 |
|
/**
 * Assemble the body indexed for a document.
 *
 * @param mixed $doc document to describe
 * @return array associative array with the keys title, date, location,
 *               creation_date, language, discourse_types,
 *               geonames_hierarchy and subject
 */
private function getDocBody($doc) {
    // Fields are computed in the order in which they appear in the body.
    $body = [];
    $body['title'] = (string)$doc->getTitle();
    $body['date'] = (string)$doc->getModified();
    $body['location'] = $this->getLocation($doc);
    $body['creation_date'] = $this->getCreationDate($doc);
    $body['language'] = $doc->getLanguagesValue();
    $body['discourse_types'] = $this->getDiscourseTypes($doc);
    $body['geonames_hierarchy'] = $this->getGeonamesHierarchy($doc);
    $body['subject'] = $this->getSubjects($doc);

    return $body;
}
|
481 |
|
/**
 * Index one document into Elasticsearch.
 *
 * The document is sent to the index named by config('elasticsearch.index')
 * with the type 'document'. Nothing is returned.
 *
 * @param string $docId   identifier of the document
 * @param array  $docBody body to index for this document
 */
private function indexOne($docId, $docBody)
{
    Es::index([
        'index' => config('elasticsearch.index'),
        'type' => 'document',
        'id' => $docId,
        'body' => $docBody
    ]);
}
175 |
497 |
/**
 * Index multiple documents into Elasticsearch with one bulk request.
 *
 * Each document contributes two entries to the request body: an 'index'
 * action line carrying the index name, the type and the document id,
 * followed by the document body. Nothing is returned.
 *
 * @param array $docBodies map of document id => document body
 */
private function indexBulk($docBodies)
{
    if (empty($docBodies)) {
        // A bulk request without any action is rejected by Elasticsearch.
        return;
    }

    $query_data = ['body' => []];
    foreach ($docBodies as $docId => $docBody) {
        $query_data['body'][] = [
            'index' => [
                '_index' => config('elasticsearch.index'),
                '_type' => 'document',
                '_id' => $docId
            ]
        ];
        $query_data['body'][] = $docBody;
    }
    Es::bulk($query_data);
}
199 /** |
518 /** |
200 * Execute the console command. |
519 * Execute the console command. |
229 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |
554 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |
230 } |
555 } |
231 |
556 |
232 $this->info('Indexing documents...'); |
557 $this->info('Indexing documents...'); |
233 |
558 |
234 if ($limit<=0) { |
559 $limit = (int)$limit; |
235 $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage(); |
560 $total = $this->documentRepository->getCount(); |
236 $total = $this->documentRepository->getCount(); |
561 |
237 $lastPageEntryCount = $stepSize+1; |
562 if($limit>0) { |
238 } |
563 $total = min($limit, $total); |
239 else { |
564 } |
240 $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage()); |
565 |
241 $total = $limit; |
566 $progressBar = $this->output->createProgressBar($total); |
242 $lastPageEntryCount = $limit % $stepSize; |
|
243 } |
|
244 |
|
245 if ($noBulk) |
|
246 { |
|
247 $progressBar = $this->output->createProgressBar($total); |
|
248 } |
|
249 else |
|
250 { |
|
251 $progressBar = $this->output->createProgressBar($lastPage); |
|
252 } |
|
253 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
567 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
254 |
568 |
255 for ($page=1;$page<=$lastPage;$page++) |
569 $page = 0; |
256 { |
570 $lastPage = PHP_INT_MAX; |
257 $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page); |
571 $docIds = []; |
258 if ($noBulk) |
572 |
259 { |
573 while($page++<$lastPage) { |
260 foreach ($docs as $i=>$doc){ |
574 $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph"); |
261 if ($page==$lastPage && $i>=$lastPageEntryCount){ |
575 $lastPage = $docsPaginator->lastPage(); |
262 break; |
576 $docsBodies = []; |
263 } |
577 foreach($docsPaginator as $docResult) { |
264 $this->indexOne($doc); |
578 $docId = (string)$docResult->getId(); |
265 $progressBar->advance(); |
579 $progressBar->setMessage($docId); |
266 $progressBar->setMessage($doc->getId()); |
|
267 } |
|
268 } |
|
269 else |
|
270 { |
|
271 $this->indexBulk($docs); |
|
272 $progressBar->advance(); |
580 $progressBar->advance(); |
273 $progressBar->setMessage('Page '.$page); |
581 $doc = $this->documentRepository->get($docId); |
|
582 $docBody = $this->getDocBody($doc); |
|
583 if($noBulk) { |
|
584 $this->indexOne($docId, $docBody); |
|
585 } else { |
|
586 $docsBodies[$docId] = $docBody; |
|
587 } |
|
588 $docIds[] = $docId; |
|
589 } |
|
590 if(!$noBulk) { |
|
591 $this->indexBulk($docsBodies); |
274 } |
592 } |
275 } |
593 } |
276 $progressBar->finish(); |
594 $progressBar->finish(); |
277 $this->info('Indexing completed'); |
595 $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).")."); |
|
596 |
278 } |
597 } |
279 } |
598 } |