2 |
2 |
3 namespace CorpusParole\Console\Commands; |
3 namespace CorpusParole\Console\Commands; |
4 |
4 |
5 use Illuminate\Console\Command; |
5 use Illuminate\Console\Command; |
6 use EasyRdf\Resource; |
6 use EasyRdf\Resource; |
|
7 use EasyRdf\Literal; |
7 |
8 |
8 use GuzzleHttp\Client; |
9 use GuzzleHttp\Client; |
9 use CorpusParole\Libraries\Utils; |
10 use CorpusParole\Libraries\Utils; |
10 use CorpusParole\Repositories\DocumentRepository; |
11 use CorpusParole\Repositories\DocumentRepository; |
11 use CorpusParole\Libraries\CocoonUtils; |
12 use CorpusParole\Libraries\CocoonUtils; |
12 use CorpusParole\Models\GeonamesHierarchy; |
13 use CorpusParole\Models\GeonamesHierarchy; |
13 use CorpusParole\Services\BnfResolverInterface; |
14 use CorpusParole\Services\BnfResolverInterface; |
|
15 use CorpusParole\Services\LexvoResolverInterface; |
14 use Es; |
16 use Es; |
15 |
17 |
16 class IndexDocuments extends Command |
18 class IndexDocuments extends Command |
17 { |
19 { |
18 |
20 |
22 * @var string |
24 * @var string |
23 */ |
25 */ |
// Name and options of the console command.
// Each option is written "{--name : description}"; the " : " separator (with
// surrounding spaces) is what splits the option name from its description, so
// every option below uses it consistently.
protected $signature = 'corpus-parole:indexDocuments
    {--limit=0 : index only the first n documents, 0 (default) means index everything }
    {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
    {--step-size=100 : number of documents to retrieve from repository at a time before indexing}
    {--reset-geo-cache : reset geo cache before indexing}';
28 |
31 |
29 /** |
32 /** |
30 * The console command description. |
33 * The console command description. |
31 * |
34 * |
32 * @var string |
35 * @var string |
36 /** |
39 /** |
37 * Create a new command instance. |
40 * Create a new command instance. |
38 * |
41 * |
39 * @return void |
42 * @return void |
40 */ |
43 */ |
public function __construct(
    DocumentRepository $documentRepository,
    Client $httpClient,
    BnfResolverInterface $bnfResolver,
    LexvoResolverInterface $lexvoResolver
) {
    // Keep a reference to every collaborator handed to the command.
    $this->httpClient = $httpClient;
    $this->documentRepository = $documentRepository;

    // One resolver per subject vocabulary (BnF and Lexvo).
    $this->bnfResolver = $bnfResolver;
    $this->lexvoResolver = $lexvoResolver;

    // The parent constructor is invoked last, once all collaborators are set.
    parent::__construct();
}
48 |
56 |
49 |
57 |
51 /** |
59 /** |
52 * Reset Elasticsearch index |
60 * Reset Elasticsearch index |
53 * |
61 * |
54 * @return int (1 if success, 0 if error) |
62 * @return int (1 if success, 0 if error) |
55 */ |
63 */ |
56 private function resetIndex() |
64 private function resetIndex($resetGeoCache) |
57 { |
65 { |
|
66 if($resetGeoCache) { |
|
67 // delete all rows in GeonamesHierarchy |
|
68 GeonamesHierarchy::getQuery()->delete(); |
|
69 } |
58 $indexParams = [ |
70 $indexParams = [ |
59 'index' => env('ELASTICSEARCH_INDEX') |
71 'index' => env('ELASTICSEARCH_INDEX') |
60 ]; |
72 ]; |
61 if(Es::indices()->exists($indexParams)){ |
73 if(Es::indices()->exists($indexParams)){ |
62 $response = Es::indices()->delete($indexParams); |
74 $response = Es::indices()->delete($indexParams); |
124 'username' => config('corpusparole.geonames_username') ], |
136 'username' => config('corpusparole.geonames_username') ], |
125 'accept' => 'application/json' // TODO: check this |
137 'accept' => 'application/json' // TODO: check this |
126 ] |
138 ] |
127 )->getBody(); |
139 )->getBody(); |
128 $hjson = json_decode($apiBody); |
140 $hjson = json_decode($apiBody); |
129 $hcache = new GeonamesHierarchy; |
141 $hcache = new GeonamesHierarchy(); |
130 $hcache->geonamesid = $geonamesid; |
142 $hcache->geonamesid = $geonamesid; |
131 $hcache->hierarchy = $hjson; |
143 $hcache->hierarchy = $hjson; |
132 $hcache->save(); |
144 $hcache->save(); |
133 } |
145 } |
134 |
146 |
171 * Takes into account only the bnf and lexvo subjects |
183 * Takes into account only the bnf and lexvo subjects |
172 */ |
184 */ |
private function getSubjects($doc) {
    // Builds the subject entries of $doc.
    //
    // $doc must expose getSubjects(). Only subjects that are EasyRdf Resource
    // objects whose URI matches either the configured BnF ark pattern or the
    // configured Lexvo pattern are kept; every other subject is ignored.
    //
    // Returns one array per kept subject with the keys 'label', 'code' and
    // 'label_code' ("<label>|<type>|<code>", <type> being 'bnf' or 'lxv').

    // The patterns do not depend on the subject: read them once.
    $bnfRegexp = config('corpusparole.bnf_ark_url_regexp');
    $lexvoRegexp = config('corpusparole.lexvo_url_regexp');

    $sres = array_reduce($doc->getSubjects(), function($res, $s) use ($bnfRegexp, $lexvoRegexp) {
        if(!($s instanceof Resource)) {
            return $res;
        }

        $uri = $s->getUri();
        $m = [];

        // The matches array has to be given as the third argument of
        // preg_match (not concatenated to the subject), otherwise it is
        // never filled and the 'uri'/'code' lookups below fail.
        if(preg_match($bnfRegexp, $uri, $m) === 1) {
            $type = 'bnf';
        } elseif(preg_match($lexvoRegexp, $uri, $m) === 1) {
            $type = 'lxv';
        } else {
            return $res;
        }

        // $m[0] is the full match, $m[1] the first capture group of the pattern.
        array_push($res, [
            'uri' => $m[0],
            'code' => $m[1],
            'type' => $type
        ]);

        return $res;
    }, []);

    // Distinct URIs of the collected subjects of one type.
    $urisOfType = function($type) use ($sres) {
        $uris = [];
        foreach($sres as $so) {
            if($so['type'] === $type) {
                array_push($uris, $so['uri']);
            }
        }
        return array_unique($uris);
    };

    // Each resolver is only asked about the URIs of its own vocabulary; the
    // results are looked up by URI below.
    $labelsBnf = $this->bnfResolver->getLabels($urisOfType('bnf'));
    $labelsLexvo = $this->lexvoResolver->getLabels($urisOfType('lxv'));

    return array_map(function($so) use ($labelsBnf, $labelsLexvo) {
        $label = $so['uri'];
        if($so['type'] === 'bnf') {
            $label = $labelsBnf[$label];
        } elseif($so['type'] === 'lxv') {
            $label = $labelsLexvo[$label];
        }

        return [
            'label' => $label,
            'code' => $so['code'],
            'label_code' => $label."|".$so['type']."|".$so['code']
        ];
    }, $sres);
}
191 |
241 |
192 /** |
242 /** |
193 * Index one document into Elasticsearch |
243 * Index one document into Elasticsearch |
194 * |
244 * |
259 $this->comment(' - Indexing only the first '.$limit.' documents'); |
309 $this->comment(' - Indexing only the first '.$limit.' documents'); |
260 } |
310 } |
261 $stepSize = $this->option('step-size'); |
311 $stepSize = $this->option('step-size'); |
262 $this->comment(' - Indexing with step size of '.$stepSize); |
312 $this->comment(' - Indexing with step size of '.$stepSize); |
263 |
313 |
|
314 $resetGeoCache = $this->option('reset-geo-cache', false); |
264 $this->info('Resetting index...'); |
315 $this->info('Resetting index...'); |
265 $success = $this->resetIndex(); |
316 $success = $this->resetIndex($resetGeoCache); |
266 if($success==1){ |
317 if($success==1){ |
267 $this->comment('Index reset!'); |
318 $this->comment('Index reset!'); |
268 } |
319 } |
269 else{ |
320 else{ |
270 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |
321 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |