server/src/app/Console/Commands/IndexDocuments.php
changeset 24 de47e8f66e8b
child 25 4ce76c9e7729
equal deleted inserted replaced
23:037687868bc4 24:de47e8f66e8b
       
     1 <?php
       
     2 
       
     3 namespace CorpusParole\Console\Commands;
       
     4 
       
     5 use Illuminate\Console\Command;
       
     6 use CorpusParole\Repositories\DocumentRepository;
       
     7 use Es;
       
     8 
       
     9 class IndexDocuments extends Command
       
    10 {
       
    11 
       
    12     /**
       
    13      * The name and signature of the console command.
       
    14      *
       
    15      * @var string
       
    16      */
       
    17     protected $signature = 'corpus-parole:indexDocuments
       
    18                           {--limit=0 : index only the first n documents, 0 (default) means index everything }
       
    19                           {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
       
    20                           {--step-size=100 : number of documents to retrieve from repository at a time before indexing}';
       
    21 
       
    22     /**
       
    23      * The console command description.
       
    24      *
       
    25      * @var string
       
    26      */
       
    27     protected $description = 'Index documents into ElasticSearch.';
       
    28 
       
    29     /**
       
    30      * Create a new command instance.
       
    31      *
       
    32      * @return void
       
    33      */
       
    34     public function __construct(DocumentRepository $documentRepository)
       
    35     {
       
    36         $this->documentRepository = $documentRepository;
       
    37         parent::__construct();
       
    38     }
       
    39 
       
    40 
       
    41 
       
    42     /**
       
    43      * Reset Elasticsearch index
       
    44      *
       
    45      * @return int (1 if sucess, 0 if error)
       
    46      */
       
    47     private function resetIndex()
       
    48     {
       
    49         $indexParams = [
       
    50             'index' => env('ELASTICSEARCH_INDEX')
       
    51         ];
       
    52         if(Es::indices()->exists($indexParams)){
       
    53             $response = Es::indices()->delete($indexParams);
       
    54             if($response['acknowledged']!=1){
       
    55                 return 0;
       
    56             }
       
    57         }
       
    58         $indexParams['body'] = [
       
    59             'settings' => [
       
    60                 'number_of_shards' => env('ELASTICSEARCH_SHARDS'),
       
    61                 'number_of_replicas' => env('ELASTICSEARCH_REPLICAS'),
       
    62                 'index.mapping.ignore_malformed' => True
       
    63             ],
       
    64             'mappings' => [
       
    65                 'document' => [
       
    66                     'properties' => [
       
    67                         'title' => [
       
    68                             'type' => 'string',
       
    69                             'store' => True,
       
    70                             'fields' => [
       
    71                                 'raw' => [
       
    72                                     'type' => 'string',
       
    73                                     'index' => 'not_analyzed'
       
    74                                 ]
       
    75                             ]
       
    76                         ],
       
    77                         'date' => [
       
    78                             'type' => 'date',
       
    79                             'store' => True
       
    80                         ]
       
    81                     ]
       
    82                 ]
       
    83             ]
       
    84         ];
       
    85         $response = Es::indices()->create($indexParams);
       
    86         if($response['acknowledged']!=1){
       
    87             return 0;
       
    88         }
       
    89         return 1;
       
    90     }
       
    91 
       
    92     /**
       
    93      * Index one document into Elasticsearch
       
    94      *
       
    95      * @return int (1 if sucess, 0 if error)
       
    96      */
       
    97     private function indexOne($doc)
       
    98     {
       
    99         $query_data = [
       
   100             'index' => env('ELASTICSEARCH_INDEX'),
       
   101             'type' => 'document',
       
   102             'id' => (string)$doc->getId(),
       
   103             'body' => [
       
   104                 'title' => (string)$doc->getTitle(),
       
   105                 'date' => (string)$doc->getModified()
       
   106             ]
       
   107         ];
       
   108         Es::index($query_data);
       
   109     }
       
   110 
       
   111     /**
       
   112      * Index multiple document into Elasticsearch
       
   113      *
       
   114      * @return int (1 if sucess, 0 if error)
       
   115      */
       
   116      private function indexBulk($docs)
       
   117      {
       
   118           $query_data = ['body' => []];
       
   119           foreach($docs as $doc){
       
   120               $query_data['body'][] = [
       
   121                   'index' => [
       
   122                       '_index' => env('ELASTICSEARCH_INDEX'),
       
   123                       '_type' => 'document',
       
   124                       '_id' => (string)$doc->getId()
       
   125                   ]
       
   126               ];
       
   127               $query_data['body'][] = [
       
   128                   'title' => (string)$doc->getTitle(),
       
   129                   'date' => (string)$doc->getModified()
       
   130               ];
       
   131           }
       
   132           Es::bulk($query_data);
       
   133      }
       
   134     /**
       
   135      * Execute the console command.
       
   136      *
       
   137      * @return mixed
       
   138      */
       
   139     public function handle()
       
   140     {
       
   141         $this->info('Options:');
       
   142         $noBulk = $this->option('no-bulk');
       
   143         if ($noBulk)
       
   144         {
       
   145             $this->comment(' - Indexing without bulk insert');
       
   146         }
       
   147         else
       
   148         {
       
   149             $this->comment(' - Indexing using bulk insert');
       
   150         }
       
   151         $limit = $this->option('limit');
       
   152         if ($limit>0) {
       
   153             $this->comment(' - Indexing only the first '.$limit.' documents');
       
   154         }
       
   155         $stepSize = $this->option('step-size');
       
   156         $this->comment(' - Indexing with step size of '.$stepSize);
       
   157 
       
   158         $this->info('Resetting index...');
       
   159         $success = $this->resetIndex();
       
   160         if($success==1){
       
   161             $this->comment('Index reset!');
       
   162         }
       
   163         else{
       
   164             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
       
   165         }
       
   166 
       
   167         $this->info('Indexing documents...');
       
   168 
       
   169         if ($limit<=0) {
       
   170             $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
       
   171             $total = $this->documentRepository->getCount();
       
   172             $lastPageEntryCount = $stepSize+1;
       
   173         }
       
   174         else {
       
   175             $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage());
       
   176             $total = $limit;
       
   177             $lastPageEntryCount = $limit % $stepSize;
       
   178         }
       
   179 
       
   180         if ($noBulk)
       
   181         {
       
   182             $progressBar = $this->output->createProgressBar($total);
       
   183         }
       
   184         else
       
   185         {
       
   186             $progressBar = $this->output->createProgressBar($lastPage);
       
   187         }
       
   188         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
       
   189 
       
   190         for ($page=1;$page<=$lastPage;$page++)
       
   191         {
       
   192             $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);
       
   193             if ($noBulk)
       
   194             {
       
   195                 foreach ($docs as $i=>$doc){
       
   196                     if ($page==$lastPage && $i>=$lastPageEntryCount){
       
   197                         break;
       
   198                     }
       
   199                     $this->indexOne($doc);
       
   200                     $progressBar->advance();
       
   201                     $progressBar->setMessage($doc->getId());
       
   202                 }
       
   203             }
       
   204             else
       
   205             {
       
   206                 $this->indexBulk($docs);
       
   207                 $progressBar->advance();
       
   208                 $progressBar->setMessage('Page '.$page);
       
   209             }
       
   210         }
       
   211         $progressBar->finish();
       
   212         $this->info('Indexing completed');
       
   213     }
       
   214 }