|
1 <?php |
|
2 |
|
3 namespace CorpusParole\Console\Commands; |
|
4 |
|
5 use Illuminate\Console\Command; |
|
6 use CorpusParole\Repositories\DocumentRepository; |
|
7 use Es; |
|
8 |
|
class IndexDocuments extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'corpus-parole:indexDocuments
                            {--limit=0 : index only the first n documents, 0 (default) means index everything }
                            {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
                            {--step-size=100 : number of documents to retrieve from repository at a time before indexing}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Index documents into ElasticSearch.';

    /**
     * Repository the documents to index are read from.
     *
     * Declared explicitly so the constructor does not create a dynamic
     * property.
     *
     * @var DocumentRepository
     */
    protected $documentRepository;

    /**
     * Create a new command instance.
     *
     * @param DocumentRepository $documentRepository source of the documents to index
     * @return void
     */
    public function __construct(DocumentRepository $documentRepository)
    {
        $this->documentRepository = $documentRepository;
        parent::__construct();
    }

    /**
     * Tell whether an Elasticsearch indices response was acknowledged.
     *
     * @param mixed $response value returned by an indices delete/create call
     * @return bool true only when an "acknowledged" entry is present and truthy
     */
    private function isAcknowledged($response)
    {
        // A missing key is treated as "not acknowledged" instead of raising
        // an undefined-index notice.
        return !empty($response['acknowledged']);
    }

    /**
     * Reset the Elasticsearch index.
     *
     * Deletes the index named by the ELASTICSEARCH_INDEX environment value
     * when it already exists, then creates it again with the configured
     * shard/replica settings and the mapping of the "document" type.
     *
     * @return int 1 on success, 0 when the deletion or the creation is not acknowledged
     */
    private function resetIndex()
    {
        $indexParams = [
            'index' => env('ELASTICSEARCH_INDEX'),
        ];

        if (Es::indices()->exists($indexParams)) {
            $response = Es::indices()->delete($indexParams);
            if (!$this->isAcknowledged($response)) {
                return 0;
            }
        }

        $indexParams['body'] = [
            'settings' => [
                'number_of_shards' => env('ELASTICSEARCH_SHARDS'),
                'number_of_replicas' => env('ELASTICSEARCH_REPLICAS'),
                'index.mapping.ignore_malformed' => true,
            ],
            'mappings' => [
                'document' => [
                    'properties' => [
                        'title' => [
                            'type' => 'string',
                            'store' => true,
                            'fields' => [
                                // Non-analyzed copy of the title.
                                'raw' => [
                                    'type' => 'string',
                                    'index' => 'not_analyzed',
                                ],
                            ],
                        ],
                        'date' => [
                            'type' => 'date',
                            'store' => true,
                        ],
                    ],
                ],
            ],
        ];

        $response = Es::indices()->create($indexParams);

        return $this->isAcknowledged($response) ? 1 : 0;
    }

    /**
     * Index one document into Elasticsearch.
     *
     * Exceptions raised by the Elasticsearch client are not caught here.
     *
     * @param mixed $doc document exposing getId(), getTitle() and getModified()
     * @return int 1 once the index request has returned
     */
    private function indexOne($doc)
    {
        $query_data = [
            'index' => env('ELASTICSEARCH_INDEX'),
            'type' => 'document',
            'id' => (string)$doc->getId(),
            'body' => [
                'title' => (string)$doc->getTitle(),
                'date' => (string)$doc->getModified(),
            ],
        ];
        Es::index($query_data);

        return 1;
    }

    /**
     * Index multiple documents into Elasticsearch with a single bulk request.
     *
     * @param array|\Traversable $docs documents exposing getId(), getTitle() and getModified()
     * @return int 1 on success (or when there is nothing to index),
     *             0 when the bulk response flags errors
     */
    private function indexBulk($docs)
    {
        $query_data = ['body' => []];
        foreach ($docs as $doc) {
            // Each document takes two entries: the action line, then its source.
            $query_data['body'][] = [
                'index' => [
                    '_index' => env('ELASTICSEARCH_INDEX'),
                    '_type' => 'document',
                    '_id' => (string)$doc->getId(),
                ],
            ];
            $query_data['body'][] = [
                'title' => (string)$doc->getTitle(),
                'date' => (string)$doc->getModified(),
            ];
        }

        // Do not send a bulk request with an empty body.
        if (empty($query_data['body'])) {
            return 1;
        }

        $response = Es::bulk($query_data);

        // The bulk response carries an "errors" flag when at least one item
        // failed; report that instead of silently ignoring it.
        return empty($response['errors']) ? 1 : 0;
    }

    /**
     * Collect at most $max documents from a page of results.
     *
     * @param array|\Traversable $docs page of documents
     * @param int|null $max maximum number of documents to keep, null for no cap
     * @return array the retained documents, in iteration order
     */
    private function takeDocuments($docs, $max)
    {
        $batch = [];
        foreach ($docs as $doc) {
            if ($max !== null && count($batch) >= $max) {
                break;
            }
            $batch[] = $doc;
        }

        return $batch;
    }

    /**
     * Execute the console command.
     *
     * Resets the index, then walks the repository page by page and indexes
     * the documents either one by one (--no-bulk) or one bulk request per
     * page. The --limit option caps the number of indexed documents in both
     * modes.
     *
     * @return int 0 on success, 1 on invalid options, failed index reset or bulk errors
     */
    public function handle()
    {
        $noBulk = $this->option('no-bulk');
        // Options arrive as strings; normalise them once.
        $limit = (int)$this->option('limit');
        $stepSize = (int)$this->option('step-size');

        if ($stepSize < 1) {
            // A step size of 0 would lead to a division by zero below.
            $this->error('The --step-size option must be a positive integer.');
            return 1;
        }

        $this->info('Options:');
        if ($noBulk) {
            $this->comment(' - Indexing without bulk insert');
        } else {
            $this->comment(' - Indexing using bulk insert');
        }
        if ($limit > 0) {
            $this->comment(' - Indexing only the first '.$limit.' documents');
        }
        $this->comment(' - Indexing with step size of '.$stepSize);

        $this->info('Resetting index...');
        if ($this->resetIndex() != 1) {
            // Do not index into an index whose reset was not acknowledged.
            $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
            return 1;
        }
        $this->comment('Index reset!');

        $this->info('Indexing documents...');

        $repositoryLastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
        $available = $this->documentRepository->getCount();

        if ($limit > 0) {
            // ceil() rather than floor()+1, so that a limit which is an exact
            // multiple of the step size does not request one page too many.
            $lastPage = min((int)ceil($limit / $stepSize), $repositoryLastPage);
            $total = min($limit, $available);
            $remaining = $limit;
        } else {
            $lastPage = $repositoryLastPage;
            $total = $available;
            $remaining = null; // null means "no limit"
        }

        // One progress step per document without bulk, one per page with bulk.
        $progressBar = $this->output->createProgressBar($noBulk ? $total : $lastPage);
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        $failedBatches = 0;
        for ($page = 1; $page <= $lastPage; $page++) {
            $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);

            // Apply the limit by counting what is actually indexed, so it
            // holds in both modes and never truncates a short last page.
            $batch = $this->takeDocuments($docs, $remaining);
            if ($remaining !== null) {
                $remaining -= count($batch);
            }

            if ($noBulk) {
                foreach ($batch as $doc) {
                    // Set the message first so the redraw triggered by
                    // advance() shows the current document.
                    $progressBar->setMessage($doc->getId());
                    $this->indexOne($doc);
                    $progressBar->advance();
                }
            } else {
                $progressBar->setMessage('Page '.$page);
                if ($this->indexBulk($batch) != 1) {
                    $failedBatches++;
                }
                $progressBar->advance();
            }
        }
        $progressBar->finish();

        if ($failedBatches > 0) {
            $this->error($failedBatches.' bulk request(s) reported indexing errors');
            return 1;
        }

        $this->info('Indexing completed');

        return 0;
    }
}