server/src/app/Console/Commands/IndexDocuments.php
changeset 325 31a4987f6017
parent 323 47f0611cc57d
child 326 226d5b17a119
equal deleted inserted replaced
324:92fc9d077f95 325:31a4987f6017
     1 <?php
     1 <?php
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
       
     4 
       
     5 
     4 
     6 
     5 use Illuminate\Console\Command;
     7 use Illuminate\Console\Command;
     6 use EasyRdf\Resource;
     8 use EasyRdf\Resource;
     7 use EasyRdf\Literal;
     9 use EasyRdf\Literal;
       
    10 use EasyRdf\Graph;
       
    11 
       
    12 use Carbon\Carbon;
     8 
    13 
     9 use GuzzleHttp\Client;
    14 use GuzzleHttp\Client;
       
    15 use GuzzleHttp\Exception\TransferException;
       
    16 use GuzzleHttp\Psr7;
       
    17 
    10 use CorpusParole\Libraries\Utils;
    18 use CorpusParole\Libraries\Utils;
    11 use CorpusParole\Repositories\DocumentRepository;
    19 use CorpusParole\Repositories\DocumentRepository;
    12 use CorpusParole\Libraries\CocoonUtils;
    20 use CorpusParole\Libraries\CocoonUtils;
    13 use CorpusParole\Models\GeonamesHierarchy;
    21 use CorpusParole\Models\GeonamesHierarchy;
    14 use CorpusParole\Services\BnfResolverInterface;
    22 use CorpusParole\Services\BnfResolverInterface;
    15 use CorpusParole\Services\LexvoResolverInterface;
    23 use CorpusParole\Services\LexvoResolverInterface;
    16 use Es;
    24 use Es;
       
    25 use Log;
       
    26 use Cache;
    17 
    27 
    18 class IndexDocuments extends Command
    28 class IndexDocuments extends Command
    19 {
    29 {
    20 
    30 
    21     /**
    31     /**
    94                             ]
   104                             ]
    95                         ],
   105                         ],
    96                         'date' => [ 'type' => 'date' ],
   106                         'date' => [ 'type' => 'date' ],
    97                         'geonames_hyerarchy' => [ 'type' => 'string' ],
   107                         'geonames_hyerarchy' => [ 'type' => 'string' ],
    98                         'location' => [ 'type' => 'geo_point' ],
   108                         'location' => [ 'type' => 'geo_point' ],
       
   109                         'creation_date' => ['type' => 'date'],
       
   110                         'language' => ['type' => 'string'],
       
   111                         'discourse_types' => ['type' => 'string'],
    99                         'subject' => [
   112                         'subject' => [
   100                             'type' => 'nested',
   113                             'type' => 'nested',
   101                             'properties' => [
   114                             'properties' => [
   102                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   115                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   103                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   116                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   116         return 1;
   129         return 1;
   117     }
   130     }
   118 
   131 
   119 
   132 
   120     private function getGeonamesHierarchyArray($geonamesid) {
   133     private function getGeonamesHierarchyArray($geonamesid) {
   121         // TODO: Manage this cache !!!
   134 
   122         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
   135         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
   123         if(is_null($hcache)) {
   136         if(is_null($hcache)) {
   124 
   137 
   125             // TODO: add delay to respect geonames 2k request/hour
   138             // TODO: add delay to respect geonames 2k request/hour
   126             // TODO: manage errors
   139             // TODO: manage errors
   144         foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
   157         foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
   145             if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) {
   158             if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) {
   146                 array_push($res, $hierarchyElem['geonameId']);
   159                 array_push($res, $hierarchyElem['geonameId']);
   147             }
   160             }
   148         }
   161         }
   149 
       
   150         return $res;
   162         return $res;
   151 
   163 
   152     }
   164     }
   153 
   165 
   154     /**
   166     /**
   208                     'uri' => $s->getValue(),
   220                     'uri' => $s->getValue(),
   209                     'code' => $s->getValue(),
   221                     'code' => $s->getValue(),
   210                     'type' => 'txt'
   222                     'type' => 'txt'
   211                 ]);
   223                 ]);
   212             }
   224             }
   213 
       
   214             return $res;
   225             return $res;
   215         }, []);
   226         }, []);
   216 
   227 
   217         $labelsBnf = $this->bnfResolver->getLabels(
   228         $labelsBnf = $this->bnfResolver->getLabels(
   218             array_unique(array_reduce(
   229             array_unique(array_reduce(
   246             }
   257             }
   247             return [ 'label' => $label, 'code' => $so['code'], 'label_code' =>  $label."|".$so['type']."|".$so['code'] ]; }, $sres
   258             return [ 'label' => $label, 'code' => $so['code'], 'label_code' =>  $label."|".$so['type']."|".$so['code'] ]; }, $sres
   248         );
   259         );
   249     }
   260     }
   250 
   261 
       
   262     private function graphResolvCoordinate($loc, $graph) {
       
   263         $latLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>");
       
   264         if(is_null($latLit) || empty($latLit->getValue())) {
       
   265             return null;
       
   266         }
       
   267         $lat = $latLit->getValue();
       
   268 
       
   269         $longLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#long>");
       
   270         if(is_null($longLit) || empty($longLit->getValue())) {
       
   271             return null;
       
   272         }
       
   273         $long = $longLit->getValue();
       
   274 
       
   275         return [ $lat, $long ];
       
   276     }
       
   277 
       
   278     private function loadGraph($url, $type) {
       
   279         try {
       
   280             $r = $this->httpClient->get($url);
       
   281         } catch (TransferException $e) {
       
   282             $this->error("loadGraph : Error Loading $url");
       
   283             Log::error("loadGraph : Error Loading $url");
       
   284             Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
       
   285             if ($e->hasResponse()) {
       
   286                 $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
       
   287                 Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
       
   288             }
       
   289             return null;
       
   290         }
       
   291         try {
       
   292             $message = (string)$r->getBody();
       
   293             $graph = new Graph($url, $message, $type);
       
   294             return $graph;
       
   295         } catch (EasyRdf\Exception $e) {
       
   296             $this->error("loadGraph : Error parsing $url");
       
   297             Log::error("loadGraph : Error parsing $url");
       
   298             if($e instanceof EasyRdf\Parser\Exception) {
       
   299                 Log::error("loadGraph : Error exception line ".$e->getLine().", column: ".$e->getColumn());
       
   300             }
       
   301             $this->error("loadGraph : Error exception message ".$e->getMessage());
       
   302             Log::error("loadGraph : Error exception message ".$e->getMessage());
       
   303             Log::error("loadGraph : Error content $message");
       
   304             return null;
       
   305         }
       
   306 
       
   307     }
       
   308 
       
   309     private function geonamesResolveCoordinates($loc) {
       
   310         $coords = cache("corpus.geonames.coord.$loc");
       
   311         if(is_null($coords)) {
       
   312             $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
       
   313             $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
       
   314             cache(["corpus.geonames.coord.$loc" => is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
       
   315         }
       
   316         return ($coords===false)?null:$coords;
       
   317     }
       
   318 
       
   319     private function dbpediaResolveCoordinates($loc) {
       
   320         $coords = cache("corpus.dbpedia.coord.$loc");
       
   321         if(is_null($coords)) {
       
   322             $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
       
   323             $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
       
   324             cache(["corpus.dbpedia.coord.$loc"=> is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
       
   325         }
       
   326         return ($coords===false)?null:$coords;
       
   327     }
       
   328 
       
   329     private function getLocation($doc) {
       
   330 
       
   331         $geoRes = $doc->getGeoInfo();
       
   332 
       
   333         if(is_null($geoRes)) {
       
   334             return null;
       
   335         }
       
   336 
       
   337         $locUrls = [];
       
   338         foreach($geoRes->getRefLocs() as $loc) {
       
   339             if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
       
   340 
       
   341                 if(!array_key_exists('geonames', $locUrls)) {
       
   342                     $locUrls['geonames'] = [];
       
   343                 }
       
   344                 array_push($locUrls['geonames'], "http://sws.geonames.org/$m[1]/");
       
   345 
       
   346             } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
       
   347                 if(!array_key_exists('dbpedia', $locUrls)) {
       
   348                     $locUrls['dbpedia'] = [];
       
   349                 }
       
   350                 //$this->line("DBPEDIA MATCH $loc ".print_r($md,true));
       
   351                 array_push($locUrls['dbpedia'], "http://$md[1]/data/$md[4]");
       
   352             }
       
   353         }
       
   354 
       
   355         $coordinates = null;
       
   356         foreach($locUrls as $locType => $locList) {
       
   357             foreach($locList as $locationUrl) {
       
   358                 $coordinates = call_user_func([$this, "${locType}ResolveCoordinates"], $locationUrl);
       
   359                 if(!is_null($coordinates)) {
       
   360                     break;
       
   361                 }
       
   362             }
       
   363         }
       
   364 
       
   365         if(is_null($coordinates)) {
       
   366             $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
       
   367         }
       
   368 
       
   369         if(empty($coordinates[0]) || empty($coordinates[1])) {
       
   370             return null;
       
   371         } else {
       
   372             return [floatval($coordinates[0]), floatval($coordinates[1])];
       
   373         }
       
   374 
       
   375     }
       
   376 
       
   377     private function getCreationDate($doc) {
       
   378 
       
   379         $created = $doc->getCreated();
       
   380         if(is_null($created)) {
       
   381             return null;
       
   382         }
       
   383         $dateType = $created->getDatatypeUri();
       
   384         $res = null;
       
   385 
       
   386         if($dateType === "http://purl.org/dc/terms/Period") {
       
   387             $res = $this->processPeriod($created->getValue());
       
   388         }
       
   389         elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
       
   390             $res = $this->processDate($created->getValue());
       
   391         }
       
   392 
       
   393         return $res;
       
   394 
       
   395     }
       
   396 
       
   397     private function extractDate($dateStr) {
       
   398         if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
       
   399             $dateStr = "$dateStr-1-1";
       
   400         }
       
   401         $date = date_create($dateStr);
       
   402         if($date === false ) {
       
   403             Log::warning("DateStatsController:extractYear bad format for date $dateStr");
       
   404             return null;
       
   405         }
       
   406         return $date;
       
   407     }
       
   408 
       
   409     private function processPeriod($periodStr) {
       
   410         $start = null;
       
   411         $end = null;
       
   412         foreach(explode(";", $periodStr) as $elem) {
       
   413             $elem = trim($elem);
       
   414             if(strpos($elem, 'start=') === 0) {
       
   415                 $startDate = $this->extractDate(trim(substr($elem, 6)));
       
   416                 if(is_null($startDate)) {
       
   417                     return null;
       
   418                 }
       
   419                 $start = intval($startDate->format("Y"));
       
   420                 if($start === false) {
       
   421                     return null;
       
   422                 }
       
   423             } elseif(strpos($elem, 'end=') === 0) {
       
   424                 $endDate = $this->extractDate(trim(substr($elem, 4)));
       
   425                 if(is_null($endDate)) {
       
   426                     return null;
       
   427                 }
       
   428                 $end = intval($endDate->format("Y"));
       
   429                 if($end === false) {
       
   430                     return null;
       
   431                 }
       
   432             }
       
   433         }
       
   434 
       
   435         if(is_null($start) || is_null($end) || $start>$end ) {
       
   436             Log::warning("Bad format for $periodStr");
       
   437             return null;
       
   438         }
       
   439 
       
   440         return array_map(function($y) {
       
   441             return \DateTime::createFromFormat("Y", "$y")->format(\DateTime::W3C);
       
   442         }, range($start, $end));
       
   443     }
       
   444 
       
   445     private function processDate($dateStr) {
       
   446         $date = $this->extractDate($dateStr);
       
   447         if(is_null($date))  {
       
   448             return null;
       
   449         } else {
       
   450             return $date->format(\DateTime::W3C);
       
   451         }
       
   452     }
       
   453 
       
   454     private function getDiscourseTypes($doc) {
       
   455         return array_reduce($doc->getDiscourseTypes(), function($res, $d) {
       
   456             $val = null;
       
   457             if($d instanceof Resource) {
       
   458                 $val = $d->getUri();
       
   459             } elseif($d instanceof Literal) {
       
   460                 $datatype = $d->getDatatypeURI();
       
   461                 $val = (!empty($datatype)?"$datatype#":"").$d->getValue();
       
   462             }
       
   463             if(!empty($val)) {
       
   464                 array_push($res,$val);
       
   465             }
       
   466             return $res;
       
   467         }, []);
       
   468     }
       
   469 
       
   470     private function getDocBody($doc) {
       
   471         return [
       
   472             'title' => (string)$doc->getTitle(),
       
   473             'date' => (string)$doc->getModified(),
       
   474             'location' => $this->getLocation($doc),
       
   475             'creation_date' => $this->getCreationDate($doc),
       
   476             'language' => $doc->getLanguageValue(),
       
   477             'discourse_types' => $this->getDiscourseTypes($doc),
       
   478             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   479             'subject' => $this->getSubjects($doc),
       
   480         ];
       
   481     }
       
   482 
   251     /**
   483     /**
   252      * Index one document into Elasticsearch
   484      * Index one document into Elasticsearch
   253      *
   485      *
    254      * @return int (1 if success, 0 if error)
    486      * @return int (1 if success, 0 if error)
   255      */
   487      */
   258         $doc = $this->documentRepository->get($resultDoc->getId());
   490         $doc = $this->documentRepository->get($resultDoc->getId());
   259         $query_data = [
   491         $query_data = [
   260             'index' => config('elasticsearch.index'),
   492             'index' => config('elasticsearch.index'),
   261             'type' => 'document',
   493             'type' => 'document',
   262             'id' => (string)$doc->getId(),
   494             'id' => (string)$doc->getId(),
   263             'body' => [
   495             'body' => $this->getDocBody($doc)
   264                 'title' => (string)$doc->getTitle(),
       
   265                 'date' => (string)$doc->getModified(),
       
   266                 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   267                 'subject' => $this->getSubjects($doc)
       
   268             ]
       
   269         ];
   496         ];
   270         Es::index($query_data);
   497         Es::index($query_data);
   271     }
   498     }
   272 
   499 
   273     /**
   500     /**
   285                       '_index' => config('elasticsearch.index'),
   512                       '_index' => config('elasticsearch.index'),
   286                       '_type' => 'document',
   513                       '_type' => 'document',
   287                       '_id' => (string)$doc->getId()
   514                       '_id' => (string)$doc->getId()
   288                   ]
   515                   ]
   289               ];
   516               ];
   290               $query_data['body'][] = [
   517               $query_data['body'][] = $this->getDocBody($doc);
   291                   'title' => (string)$doc->getTitle(),
       
   292                   'date' => (string)$doc->getModified(),
       
   293                   'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   294                   'subject' => $this->getSubjects($doc)
       
   295               ];
       
   296           }
   518           }
   297           Es::bulk($query_data);
   519           Es::bulk($query_data);
   298      }
   520      }
   299     /**
   521     /**
   300      * Execute the console command.
   522      * Execute the console command.
   365             {
   587             {
   366                 foreach ($docs as $i=>$doc){
   588                 foreach ($docs as $i=>$doc){
   367                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   589                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   368                         break;
   590                         break;
   369                     }
   591                     }
   370                     $this->indexOne($doc);
       
   371                     $progressBar->setMessage($doc->getId());
   592                     $progressBar->setMessage($doc->getId());
   372                     $progressBar->advance();
   593                     $progressBar->advance();
       
   594                     $this->indexOne($doc);
   373                 }
   595                 }
   374             }
   596             }
   375             else
   597             else
   376             {
   598             {
   377                 $this->indexBulk($docs);
       
   378                 $progressBar->setMessage('Page '.$page);
   599                 $progressBar->setMessage('Page '.$page);
   379                 $progressBar->advance();
   600                 $progressBar->advance();
       
   601                 $this->indexBulk($docs);
   380             }
   602             }
   381         }
   603         }
   382         $progressBar->finish();
   604         $progressBar->finish();
   383         $this->info("\nIndexing completed");
   605         $this->info("\nIndexing completed");
   384     }
   606     }