1 <?php |
1 <?php |
2 |
2 |
3 namespace CorpusParole\Console\Commands; |
3 namespace CorpusParole\Console\Commands; |
4 |
4 |
5 use Illuminate\Console\Command; |
5 use Illuminate\Console\Command; |
|
6 use EasyRdf\Resource; |
|
7 |
6 use GuzzleHttp\Client; |
8 use GuzzleHttp\Client; |
|
9 use CorpusParole\Libraries\Utils; |
7 use CorpusParole\Repositories\DocumentRepository; |
10 use CorpusParole\Repositories\DocumentRepository; |
8 use CorpusParole\Libraries\CocoonUtils; |
11 use CorpusParole\Libraries\CocoonUtils; |
9 use CorpusParole\Models\GeonamesHierarchy; |
12 use CorpusParole\Models\GeonamesHierarchy; |
|
13 use CorpusParole\Services\BnfResolverInterface; |
10 use Es; |
14 use Es; |
11 |
15 |
12 class IndexDocuments extends Command |
16 class IndexDocuments extends Command |
13 { |
17 { |
14 |
18 |
/**
 * Create a new command instance.
 *
 * Keeps the three collaborators on the command object, then runs the
 * parent Command constructor.
 *
 * @param DocumentRepository $documentRepository repository used to load documents by id (see indexBulk)
 * @param Client $httpClient HTTP client kept on the command
 * @param BnfResolverInterface $bnfResolver resolver whose getLabels() is called by getSubjects()
 */
public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver)
{
    $this->documentRepository = $documentRepository;
    $this->bnfResolver = $bnfResolver;
    $this->httpClient = $httpClient;
    // the parent constructor is deliberately called last, as in the original
    parent::__construct();
}
43 |
48 |
44 |
49 |
80 ] |
85 ] |
81 ] |
86 ] |
82 ], |
87 ], |
83 'date' => [ 'type' => 'date' ], |
88 'date' => [ 'type' => 'date' ], |
84 'geonames_hyerarchy' => [ 'type' => 'string' ], |
89 'geonames_hyerarchy' => [ 'type' => 'string' ], |
85 'location' => [ 'type' => 'geo_point' ] |
90 'location' => [ 'type' => 'geo_point' ], |
|
91 'subject' => [ |
|
92 'type' => 'nested', |
|
93 'properties' => [ |
|
94 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
|
95 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
|
96 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] |
|
97 ] |
|
98 ] |
86 // TODO: add location information |
99 // TODO: add location information |
87 ] |
100 ] |
88 ] |
101 ] |
89 ] |
102 ] |
90 ]; |
103 ]; |
152 return $res; |
165 return $res; |
153 |
166 |
154 } |
167 } |
155 |
168 |
156 /** |
/**
 * Build the subject entries indexed for a document.
 *
 * Only subjects that are EasyRdf resources whose URI matches the
 * 'corpusparole.bnf_ark_url_regexp' configuration pattern are kept;
 * every other subject is ignored.
 *
 * @param mixed $doc document exposing getSubjects()
 * @return array list of [ 'label' => ..., 'code' => ..., 'label_code' => 'label|code' ] arrays,
 *               one per matching subject, in the order the document returns them
 */
private function getSubjects($doc) {

    $pattern = config('corpusparole.bnf_ark_url_regexp');

    $sres = [];
    foreach($doc->getSubjects() as $s) {
        $m = [];
        // The code is taken from the first capture group of the configured
        // pattern; a match without that group is skipped instead of raising
        // an undefined-offset notice.
        if($s instanceof Resource && preg_match($pattern, $s->getUri(), $m) === 1 && isset($m[1])) {
            $sres[] = [
                'uri' => $m[0],
                'code' => $m[1]
            ];
        }
    }

    // Nothing matched: the result is empty whatever the resolver answers,
    // so the resolver call is skipped.
    if(empty($sres)) {
        return [];
    }

    // array_unique() keeps the original keys; array_values() hands the
    // resolver a plain 0..n-1 list.
    $uris = array_values(array_unique(array_map(function($so) { return $so['uri']; }, $sres)));
    $labels = $this->bnfResolver->getLabels($uris);

    return array_map(function($so) use ($labels) {
        // A URI the resolver returned no label for yields a null label
        // (and a "|code" label_code) rather than an undefined-index notice.
        $label = isset($labels[$so['uri']]) ? $labels[$so['uri']] : null;
        return [
            'label' => $label,
            'code' => $so['code'],
            'label_code' => $label."|".$so['code']
        ];
    }, $sres);

}
|
191 |
|
192 /** |
157 * Index one document into Elasticsearch |
193 * Index one document into Elasticsearch |
158 * |
194 * |
159 * @return int (1 if success, 0 if error) |
195 * @return int (1 if success, 0 if error) |
160 */ |
196 */ |
161 private function indexOne($resultDoc) |
197 private function indexOne($resultDoc) |
166 'type' => 'document', |
202 'type' => 'document', |
167 'id' => (string)$doc->getId(), |
203 'id' => (string)$doc->getId(), |
168 'body' => [ |
204 'body' => [ |
169 'title' => (string)$doc->getTitle(), |
205 'title' => (string)$doc->getTitle(), |
170 'date' => (string)$doc->getModified(), |
206 'date' => (string)$doc->getModified(), |
171 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc) |
207 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), |
|
208 'subject' => $this->getSubjects($doc) |
172 ] |
209 ] |
173 ]; |
210 ]; |
174 Es::index($query_data); |
211 Es::index($query_data); |
175 } |
212 } |
176 |
213 |
180 * @return int (1 if success, 0 if error) |
217 * @return int (1 if success, 0 if error) |
181 */ |
218 */ |
private function indexBulk($docs)
{
    // the target index name does not change between documents: read it once
    $indexName = config('elasticsearch.index');

    $query_data = ['body' => []];
    foreach($docs as $resultDoc) {
        // Each entry is re-read through the repository by id; every indexed
        // field below comes from that loaded document.
        $doc = $this->documentRepository->get($resultDoc->getId());

        // A bulk body alternates an action line and the matching source line.
        $query_data['body'][] = [
            'index' => [
                '_index' => $indexName,
                '_type' => 'document',
                '_id' => (string)$doc->getId()
            ]
        ];
        $query_data['body'][] = [
            'title' => (string)$doc->getTitle(),
            'date' => (string)$doc->getModified(),
            'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
            'subject' => $this->getSubjects($doc)
        ];
    }

    // An empty page produces no action at all; do not send a bulk request
    // without actions.
    if(empty($query_data['body'])) {
        return;
    }

    Es::bulk($query_data);
}
200 /** |
240 /** |
261 foreach ($docs as $i=>$doc){ |
301 foreach ($docs as $i=>$doc){ |
262 if ($page==$lastPage && $i>=$lastPageEntryCount){ |
302 if ($page==$lastPage && $i>=$lastPageEntryCount){ |
263 break; |
303 break; |
264 } |
304 } |
265 $this->indexOne($doc); |
305 $this->indexOne($doc); |
|
306 $progressBar->setMessage($doc->getId()); |
266 $progressBar->advance(); |
307 $progressBar->advance(); |
267 $progressBar->setMessage($doc->getId()); |
|
268 } |
308 } |
269 } |
309 } |
270 else |
310 else |
271 { |
311 { |
272 $this->indexBulk($docs); |
312 $this->indexBulk($docs); |
|
313 $progressBar->setMessage('Page '.$page); |
273 $progressBar->advance(); |
314 $progressBar->advance(); |
274 $progressBar->setMessage('Page '.$page); |
|
275 } |
315 } |
276 } |
316 } |
277 $progressBar->finish(); |
317 $progressBar->finish(); |
278 $this->info('Indexing completed'); |
318 $this->info("\nIndexing completed"); |
279 } |
319 } |
280 } |
320 } |