server/src/app/Console/Commands/IndexDocuments.php
changeset 321 aefaad270b9b
parent 320 0fce13da58af
child 322 084aae09edf4
equal deleted inserted replaced
320:0fce13da58af 321:aefaad270b9b
     1 <?php
     1 <?php
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
     4 
     4 
     5 use Illuminate\Console\Command;
     5 use Illuminate\Console\Command;
       
     6 use EasyRdf\Resource;
       
     7 
     6 use GuzzleHttp\Client;
     8 use GuzzleHttp\Client;
       
     9 use CorpusParole\Libraries\Utils;
     7 use CorpusParole\Repositories\DocumentRepository;
    10 use CorpusParole\Repositories\DocumentRepository;
     8 use CorpusParole\Libraries\CocoonUtils;
    11 use CorpusParole\Libraries\CocoonUtils;
     9 use CorpusParole\Models\GeonamesHierarchy;
    12 use CorpusParole\Models\GeonamesHierarchy;
       
    13 use CorpusParole\Services\BnfResolverInterface;
    10 use Es;
    14 use Es;
    11 
    15 
    12 class IndexDocuments extends Command
    16 class IndexDocuments extends Command
    13 {
    17 {
    14 
    18 
    /**
     * Build the command and keep references to its collaborators.
     *
     * @param DocumentRepository   $documentRepository kept as $this->documentRepository
     * @param Client               $httpClient         kept as $this->httpClient
     * @param BnfResolverInterface $bnfResolver        kept as $this->bnfResolver
     */
    public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver)
    {
        $this->httpClient = $httpClient;
        $this->bnfResolver = $bnfResolver;
        $this->documentRepository = $documentRepository;

        // The parent constructor is called last, once all collaborators are stored.
        parent::__construct();
    }
    43 
    48 
    44 
    49 
    80                                 ]
    85                                 ]
    81                             ]
    86                             ]
    82                         ],
    87                         ],
    83                         'date' => [ 'type' => 'date' ],
    88                         'date' => [ 'type' => 'date' ],
    84                         'geonames_hyerarchy' => [ 'type' => 'string' ],
    89                         'geonames_hyerarchy' => [ 'type' => 'string' ],
    85                         'location' => [ 'type' => 'geo_point' ]
    90                         'location' => [ 'type' => 'geo_point' ],
       
    91                         'subject' => [
       
    92                             'type' => 'nested',
       
    93                             'properties' => [
       
    94                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
       
    95                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
       
    96                                 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
       
    97                             ]
       
    98                         ]
    86                         // TODO: add location information
    99                         // TODO: add location information
    87                     ]
   100                     ]
    88                 ]
   101                 ]
    89             ]
   102             ]
    90         ];
   103         ];
   152         return $res;
   165         return $res;
   153 
   166 
   154     }
   167     }
   155 
   168 
   156     /**
   169     /**
       
   170      * get subjects as { 'label': label, 'code': code } objects
       
   171      * Takes only into account the bnf subjects
       
   172      */
       
   173     private function getSubjects($doc) {
       
   174 
       
   175         $sres = array_reduce($doc->getSubjects(), function($res, $s) {
       
   176             $m = [];
       
   177             if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) {
       
   178                 array_push($res, [
       
   179                     'uri' => $m[0],
       
   180                     'code' => $m[1]
       
   181                 ]);
       
   182             }
       
   183             return $res;
       
   184         }, []);
       
   185 
       
   186         $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri'];}, $sres)));
       
   187 
       
   188         return array_map(function($so) use ($labels) { return [ 'label' => $labels[$so['uri']], 'code' => $so['code'], 'label_code' =>  $labels[$so['uri']]."|".$so['code'] ]; }, $sres);
       
   189 
       
   190     }
       
   191 
       
   192     /**
   157      * Index one document into Elasticsearch
   193      * Index one document into Elasticsearch
   158      *
   194      *
    159      * @return int (1 if success, 0 if error)
    195      * @return int (1 if success, 0 if error)
   160      */
   196      */
   161     private function indexOne($resultDoc)
   197     private function indexOne($resultDoc)
   166             'type' => 'document',
   202             'type' => 'document',
   167             'id' => (string)$doc->getId(),
   203             'id' => (string)$doc->getId(),
   168             'body' => [
   204             'body' => [
   169                 'title' => (string)$doc->getTitle(),
   205                 'title' => (string)$doc->getTitle(),
   170                 'date' => (string)$doc->getModified(),
   206                 'date' => (string)$doc->getModified(),
   171                 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
   207                 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   208                 'subject' => $this->getSubjects($doc)
   172             ]
   209             ]
   173         ];
   210         ];
   174         Es::index($query_data);
   211         Es::index($query_data);
   175     }
   212     }
   176 
   213 
    180      * @return int (1 if success, 0 if error)
    217      * @return int (1 if success, 0 if error)
   181      */
   218      */
   182      private function indexBulk($docs)
   219      private function indexBulk($docs)
   183      {
   220      {
   184           $query_data = ['body' => []];
   221           $query_data = ['body' => []];
   185           foreach($docs as $doc){
   222           foreach($docs as $resultDoc){
       
   223               $doc = $this->documentRepository->get($resultDoc->getId());
   186               $query_data['body'][] = [
   224               $query_data['body'][] = [
   187                   'index' => [
   225                   'index' => [
   188                       '_index' => config('elasticsearch.index'),
   226                       '_index' => config('elasticsearch.index'),
   189                       '_type' => 'document',
   227                       '_type' => 'document',
   190                       '_id' => (string)$doc->getId()
   228                       '_id' => (string)$doc->getId()
   191                   ]
   229                   ]
   192               ];
   230               ];
   193               $query_data['body'][] = [
   231               $query_data['body'][] = [
   194                   'title' => (string)$doc->getTitle(),
   232                   'title' => (string)$doc->getTitle(),
   195                   'date' => (string)$doc->getModified()
   233                   'date' => (string)$doc->getModified(),
       
   234                   'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   235                   'subject' => $this->getSubjects($doc)
   196               ];
   236               ];
   197           }
   237           }
   198           Es::bulk($query_data);
   238           Es::bulk($query_data);
   199      }
   239      }
   200     /**
   240     /**
   261                 foreach ($docs as $i=>$doc){
   301                 foreach ($docs as $i=>$doc){
   262                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   302                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   263                         break;
   303                         break;
   264                     }
   304                     }
   265                     $this->indexOne($doc);
   305                     $this->indexOne($doc);
       
   306                     $progressBar->setMessage($doc->getId());
   266                     $progressBar->advance();
   307                     $progressBar->advance();
   267                     $progressBar->setMessage($doc->getId());
       
   268                 }
   308                 }
   269             }
   309             }
   270             else
   310             else
   271             {
   311             {
   272                 $this->indexBulk($docs);
   312                 $this->indexBulk($docs);
       
   313                 $progressBar->setMessage('Page '.$page);
   273                 $progressBar->advance();
   314                 $progressBar->advance();
   274                 $progressBar->setMessage('Page '.$page);
       
   275             }
   315             }
   276         }
   316         }
   277         $progressBar->finish();
   317         $progressBar->finish();
   278         $this->info('Indexing completed');
   318         $this->info("\nIndexing completed");
   279     }
   319     }
   280 }
   320 }