server/src/app/Console/Commands/IndexDocuments.php
changeset 326 226d5b17a119
parent 325 31a4987f6017
child 369 796725d33b67
equal deleted inserted replaced
325:31a4987f6017 326:226d5b17a119
   101                                     'type' => 'string',
   101                                     'type' => 'string',
   102                                     'index' => 'not_analyzed'
   102                                     'index' => 'not_analyzed'
   103                                 ]
   103                                 ]
   104                             ]
   104                             ]
   105                         ],
   105                         ],
   106                         'date' => [ 'type' => 'date' ],
   106                         'date' => [ 'type' => 'date', 'index' => 'not_analyzed'],
   107                         'geonames_hyerarchy' => [ 'type' => 'string' ],
   107                         'geonames_hyerarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   108                         'location' => [ 'type' => 'geo_point' ],
   108                         'location' => [ 'type' => 'geo_point'],
   109                         'creation_date' => ['type' => 'date'],
   109                         'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'],
   110                         'language' => ['type' => 'string'],
   110                         'language' => ['type' => 'string', 'index' => 'not_analyzed'],
   111                         'discourse_types' => ['type' => 'string'],
   111                         'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'],
   112                         'subject' => [
   112                         'subject' => [
   113                             'type' => 'nested',
   113                             'type' => 'nested',
   114                             'properties' => [
   114                             'properties' => [
   115                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   115                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   116                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   116                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
   117                                 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
   117                                 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
   118                             ]
   118                             ]
   119                         ]
   119                         ]
   120                         // TODO: add location information
       
   121                     ]
   120                     ]
   122                 ]
   121                 ]
   123             ]
   122             ]
   124         ];
   123         ];
   125         $response = Es::indices()->create($indexParams);
   124         $response = Es::indices()->create($indexParams);
   471         return [
   470         return [
   472             'title' => (string)$doc->getTitle(),
   471             'title' => (string)$doc->getTitle(),
   473             'date' => (string)$doc->getModified(),
   472             'date' => (string)$doc->getModified(),
   474             'location' => $this->getLocation($doc),
   473             'location' => $this->getLocation($doc),
   475             'creation_date' => $this->getCreationDate($doc),
   474             'creation_date' => $this->getCreationDate($doc),
   476             'language' => $doc->getLanguageValue(),
   475             'language' => $doc->getLanguagesValue(),
   477             'discourse_types' => $this->getDiscourseTypes($doc),
   476             'discourse_types' => $this->getDiscourseTypes($doc),
   478             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
   477             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
   479             'subject' => $this->getSubjects($doc),
   478             'subject' => $this->getSubjects($doc),
   480         ];
   479         ];
   481     }
   480     }
   483     /**
   482     /**
   484      * Index one document into Elasticsearch
   483      * Index one document into Elasticsearch
   485      *
   484      *
   487      * @return int (1 if success, 0 if error)
   486      * @return int (1 if success, 0 if error)
   487      */
   486      */
   488     private function indexOne($resultDoc)
   487     private function indexOne($docId, $docBody)
   489     {
   488     {
   490         $doc = $this->documentRepository->get($resultDoc->getId());
       
   491         $query_data = [
   489         $query_data = [
   492             'index' => config('elasticsearch.index'),
   490             'index' => config('elasticsearch.index'),
   493             'type' => 'document',
   491             'type' => 'document',
   494             'id' => (string)$doc->getId(),
   492             'id' => $docId,
   495             'body' => $this->getDocBody($doc)
   493             'body' => $docBody
   496         ];
   494         ];
   497         Es::index($query_data);
   495         Es::index($query_data);
   498     }
   496     }
   499 
   497 
   500     /**
   498     /**
   501      * Index multiple documents into Elasticsearch
   499      * Index multiple documents into Elasticsearch
   502      *
   500      *
   503      * @return int (1 if success, 0 if error)
   501      * @return int (1 if success, 0 if error)
   504      */
   502      */
   505      private function indexBulk($docs)
   503      private function indexBulk($docBodies)
   506      {
   504      {
   507           $query_data = ['body' => []];
   505           $query_data = ['body' => []];
   508           foreach($docs as $resultDoc){
   506           foreach($docBodies as $docId => $docBody){
   509               $doc = $this->documentRepository->get($resultDoc->getId());
       
   510               $query_data['body'][] = [
   507               $query_data['body'][] = [
   511                   'index' => [
   508                   'index' => [
   512                       '_index' => config('elasticsearch.index'),
   509                       '_index' => config('elasticsearch.index'),
   513                       '_type' => 'document',
   510                       '_type' => 'document',
   514                       '_id' => (string)$doc->getId()
   511                       '_id' => $docId
   515                   ]
   512                   ]
   516               ];
   513               ];
   517               $query_data['body'][] = $this->getDocBody($doc);
   514               $query_data['body'][] = $docBody;
   518           }
   515           }
   519           Es::bulk($query_data);
   516           Es::bulk($query_data);
   520      }
   517      }
   521     /**
   518     /**
   522      * Execute the console command.
   519      * Execute the console command.
   557             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
   554             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
   558         }
   555         }
   559 
   556 
   560         $this->info('Indexing documents...');
   557         $this->info('Indexing documents...');
   561 
   558 
   562         if ($limit<=0) {
   559         $limit = (int)$limit;
   563             $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
   560         $total = $this->documentRepository->getCount();
   564             $total = $this->documentRepository->getCount();
   561 
   565             $lastPageEntryCount = $stepSize+1;
   562         if($limit>0) {
   566         }
   563             $total = min($limit, $total);
   567         else {
   564         }
   568             $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage());
   565 
   569             $total = $limit;
   566         $progressBar = $this->output->createProgressBar($total);
   570             $lastPageEntryCount = $limit % $stepSize;
       
   571         }
       
   572 
       
   573         if ($noBulk)
       
   574         {
       
   575             $progressBar = $this->output->createProgressBar($total);
       
   576         }
       
   577         else
       
   578         {
       
   579             $progressBar = $this->output->createProgressBar($lastPage);
       
   580         }
       
   581         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   567         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   582 
   568 
   583         for ($page=1;$page<=$lastPage;$page++)
   569         $page = 0;
   584         {
   570         $lastPage = PHP_INT_MAX;
   585             $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);
   571         $docIds = [];
   586             if ($noBulk)
   572 
   587             {
   573         while($page++<$lastPage) {
   588                 foreach ($docs as $i=>$doc){
   574             $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph");
   589                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   575             $lastPage = $docsPaginator->lastPage();
   590                         break;
   576             $docsBodies = [];
   591                     }
   577             foreach($docsPaginator as $docResult) {
   592                     $progressBar->setMessage($doc->getId());
   578                 $docId = (string)$docResult->getId();
   593                     $progressBar->advance();
   579                 $progressBar->setMessage($docId);
   594                     $this->indexOne($doc);
       
   595                 }
       
   596             }
       
   597             else
       
   598             {
       
   599                 $progressBar->setMessage('Page '.$page);
       
   600                 $progressBar->advance();
   580                 $progressBar->advance();
   601                 $this->indexBulk($docs);
   581                 $doc = $this->documentRepository->get($docId);
       
   582                 $docBody = $this->getDocBody($doc);
       
   583                 if($noBulk) {
       
   584                     $this->indexOne($docId, $docBody);
       
   585                 } else {
       
   586                     $docsBodies[$docId] = $docBody;
       
   587                 }
       
   588                 $docIds[] = $docId;
       
   589             }
       
   590             if(!$noBulk) {
       
   591                 $this->indexBulk($docsBodies);
   602             }
   592             }
   603         }
   593         }
   604         $progressBar->finish();
   594         $progressBar->finish();
   605         $this->info("\nIndexing completed");
   595         $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).").");
       
   596 
   606     }
   597     }
   607 }
   598 }