server/src/app/Console/Commands/IndexDocuments.php
changeset 322 084aae09edf4
parent 321 aefaad270b9b
child 323 47f0611cc57d
equal deleted inserted replaced
321:aefaad270b9b 322:084aae09edf4
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
     4 
     4 
     5 use Illuminate\Console\Command;
     5 use Illuminate\Console\Command;
     6 use EasyRdf\Resource;
     6 use EasyRdf\Resource;
       
     7 use EasyRdf\Literal;
     7 
     8 
     8 use GuzzleHttp\Client;
     9 use GuzzleHttp\Client;
     9 use CorpusParole\Libraries\Utils;
    10 use CorpusParole\Libraries\Utils;
    10 use CorpusParole\Repositories\DocumentRepository;
    11 use CorpusParole\Repositories\DocumentRepository;
    11 use CorpusParole\Libraries\CocoonUtils;
    12 use CorpusParole\Libraries\CocoonUtils;
    12 use CorpusParole\Models\GeonamesHierarchy;
    13 use CorpusParole\Models\GeonamesHierarchy;
    13 use CorpusParole\Services\BnfResolverInterface;
    14 use CorpusParole\Services\BnfResolverInterface;
       
    15 use CorpusParole\Services\LexvoResolverInterface;
    14 use Es;
    16 use Es;
    15 
    17 
    16 class IndexDocuments extends Command
    18 class IndexDocuments extends Command
    17 {
    19 {
    18 
    20 
    22      * @var string
    24      * @var string
    23      */
    25      */
    // Each option token is "{--name : description}". The name and the description must be
    // separated by " : " (whitespace on both sides of the colon), otherwise the whole text
    // is taken as the option name and $this->option('name') cannot find it.
    protected $signature = 'corpus-parole:indexDocuments
                          {--limit=0 : index only the first n documents, 0 (default) means index everything }
                          {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
                          {--step-size=100 : number of documents to retrieve from repository at a time before indexing}
                          {--reset-geo-cache : reset geo cache before indexing}';
    28 
    31 
    29     /**
    32     /**
    30      * The console command description.
    33      * The console command description.
    31      *
    34      *
    32      * @var string
    35      * @var string
    /**
     * Build the command and keep a reference to each injected collaborator.
     *
     * @param DocumentRepository $documentRepository stored as $this->documentRepository
     * @param Client $httpClient stored as $this->httpClient
     * @param BnfResolverInterface $bnfResolver stored as $this->bnfResolver
     * @param LexvoResolverInterface $lexvoResolver stored as $this->lexvoResolver
     * @return void
     */
    public function __construct(
        DocumentRepository $documentRepository,
        Client $httpClient,
        BnfResolverInterface $bnfResolver,
        LexvoResolverInterface $lexvoResolver
    ) {
        // Assignments follow the parameter order; they are independent of each other.
        $this->documentRepository = $documentRepository;
        $this->httpClient = $httpClient;
        $this->bnfResolver = $bnfResolver;
        $this->lexvoResolver = $lexvoResolver;

        // The base Command constructor is called last, as before.
        parent::__construct();
    }
    48 
    56 
    49 
    57 
    51     /**
    59     /**
    52      * Reset Elasticsearch index
    60      * Reset Elasticsearch index
    53      *
    61      *
     54      * @return int (1 if success, 0 if error)
     62      * @return int (1 if success, 0 if error)
    55      */
    63      */
    56     private function resetIndex()
    64     private function resetIndex($resetGeoCache)
    57     {
    65     {
       
    66         if($resetGeoCache) {
       
    67             // delete all rows in GeonamesHierarchy
       
    68             GeonamesHierarchy::getQuery()->delete();
       
    69         }
    58         $indexParams = [
    70         $indexParams = [
    59             'index' => env('ELASTICSEARCH_INDEX')
    71             'index' => env('ELASTICSEARCH_INDEX')
    60         ];
    72         ];
    61         if(Es::indices()->exists($indexParams)){
    73         if(Es::indices()->exists($indexParams)){
    62             $response = Es::indices()->delete($indexParams);
    74             $response = Es::indices()->delete($indexParams);
   124                       'username' => config('corpusparole.geonames_username') ],
   136                       'username' => config('corpusparole.geonames_username') ],
   125                   'accept' => 'application/json' // TODO: check this
   137                   'accept' => 'application/json' // TODO: check this
   126                 ]
   138                 ]
   127             )->getBody();
   139             )->getBody();
   128             $hjson = json_decode($apiBody);
   140             $hjson = json_decode($apiBody);
   129             $hcache = new GeonamesHierarchy;
   141             $hcache = new GeonamesHierarchy();
   130             $hcache->geonamesid = $geonamesid;
   142             $hcache->geonamesid = $geonamesid;
   131             $hcache->hierarchy = $hjson;
   143             $hcache->hierarchy = $hjson;
   132             $hcache->save();
   144             $hcache->save();
   133         }
   145         }
   134 
   146 
   171      * Takes only into account the bnf subjects
    183      * Takes into account the bnf and lexvo subjects
   172      */
   184      */
   173     private function getSubjects($doc) {
   185     private function getSubjects($doc) {
   174 
   186 
   175         $sres = array_reduce($doc->getSubjects(), function($res, $s) {
   187         $sres = array_reduce($doc->getSubjects(), function($res, $s) {
   176             $m = [];
   188             $mBnf = [];
   177             if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) {
   189             $mLexvo = [];
       
   190 
       
   191             if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $mBnf) === 1) {
   178                 array_push($res, [
   192                 array_push($res, [
   179                     'uri' => $m[0],
   193                     'uri' => $mBnf[0],
   180                     'code' => $m[1]
   194                     'code' => $mBnf[1],
       
   195                     'type' => 'bnf'
   181                 ]);
   196                 ]);
   182             }
   197             } elseif($s instanceof Resource && preg_match(config('corpusparole.lexvo_url_regexp'), $s->getUri(). $mLexvo) === 1) {
       
   198                 array_push($res, [
       
   199                     'uri' => $mLexvo[0],
       
   200                     'code' => $mLexvo[1],
       
   201                     'type' => 'lxv'
       
   202                 ]);
       
   203             }
       
   204 
   183             return $res;
   205             return $res;
   184         }, []);
   206         }, []);
   185 
   207 
   186         $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri'];}, $sres)));
   208         $labelsBnf = $this->bnfResolver->getLabels(
   187 
   209             array_unique(array_reduce(
   188         return array_map(function($so) use ($labels) { return [ 'label' => $labels[$so['uri']], 'code' => $so['code'], 'label_code' =>  $labels[$so['uri']]."|".$so['code'] ]; }, $sres);
   210                 $sres,
   189 
   211                 function($r, $so) {
       
   212                     if($so['type'] === 'bnf') {
       
   213                         array_push($r, $so['uri']);
       
   214                     }
       
   215                     return $r;
       
   216                 },[]
       
   217             ))
       
   218         );
       
   219         $labelsLexvo = $this->lexvoResolver->getLabels(
       
   220             array_unique(array_reduce(
       
   221                 $sres,
       
   222                 function($r, $so) {
       
   223                     if($so['type'] === 'lxv') {
       
   224                         array_push($r, $so['uri']);
       
   225                     }
       
   226                     return $r;
       
   227                 },[]
       
   228             ))
       
   229         );
       
   230 
       
   231         return array_map(function($so) use ($labelsBnf, $labelsLexvo) {
       
   232             $label = $so['uri'];
       
   233             if($so['type'] === 'bnf') {
       
   234                 $label = $labelsBnf[$label];
       
   235             } elseif ($so['type'] === 'lxv') {
       
   236                 $label = $labelsLexvo[$label];
       
   237             }
       
   238             return [ 'label' => $label, 'code' => $so['code'], 'label_code' =>  $label."|".$so['type']."|".$so['code'] ]; }, $sres
       
   239         );
   190     }
   240     }
   191 
   241 
   192     /**
   242     /**
   193      * Index one document into Elasticsearch
   243      * Index one document into Elasticsearch
   194      *
   244      *
   259             $this->comment(' - Indexing only the first '.$limit.' documents');
   309             $this->comment(' - Indexing only the first '.$limit.' documents');
   260         }
   310         }
   261         $stepSize = $this->option('step-size');
   311         $stepSize = $this->option('step-size');
   262         $this->comment(' - Indexing with step size of '.$stepSize);
   312         $this->comment(' - Indexing with step size of '.$stepSize);
   263 
   313 
       
   314         $resetGeoCache = $this->option('reset-geo-cache', false);
   264         $this->info('Resetting index...');
   315         $this->info('Resetting index...');
   265         $success = $this->resetIndex();
   316         $success = $this->resetIndex($resetGeoCache);
   266         if($success==1){
   317         if($success==1){
   267             $this->comment('Index reset!');
   318             $this->comment('Index reset!');
   268         }
   319         }
   269         else{
   320         else{
   270             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
   321             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));