author | ymh <ymh.work@gmail.com> |
Fri, 10 Feb 2017 12:03:12 +0100 | |
changeset 506 | 8a5bb4b48b85 |
parent 498 | 265992e5b379 |
permissions | -rw-r--r-- |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
1 |
<?php |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
2 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
3 |
namespace CorpusParole\Console\Commands; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
4 |
|
325 | 5 |
|
6 |
||
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
7 |
use Illuminate\Console\Command; |
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
8 |
use EasyRdf\Resource; |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
9 |
use EasyRdf\Literal; |
325 | 10 |
use EasyRdf\Graph; |
11 |
||
12 |
use Carbon\Carbon; |
|
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
13 |
|
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
14 |
use GuzzleHttp\Client; |
325 | 15 |
use GuzzleHttp\Exception\TransferException; |
16 |
use GuzzleHttp\Psr7; |
|
17 |
||
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
18 |
use CorpusParole\Libraries\Utils; |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
19 |
use CorpusParole\Repositories\DocumentRepository; |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
20 |
use CorpusParole\Libraries\CocoonUtils; |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
21 |
use CorpusParole\Models\GeonamesHierarchy; |
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
22 |
use CorpusParole\Services\BnfResolverInterface; |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
23 |
use CorpusParole\Services\LexvoResolverInterface; |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
24 |
use Es; |
325 | 25 |
use Log; |
26 |
use Cache; |
|
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
27 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
28 |
class IndexDocuments extends Command |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
29 |
{ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
30 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
31 |
/** |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
32 |
* The name and signature of the console command. |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
33 |
* |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
34 |
* @var string |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
35 |
*/ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
36 |
protected $signature = 'corpus-parole:indexDocuments |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
37 |
{--limit=0 : index only the first n documents, 0 (default) means index everything } |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
38 |
{--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing } |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
39 |
{--step-size=100 : number of documents to retrieve from repository at a time before indexing} |
323 | 40 |
{--reset-geo-cache : reset geo cache befr indexing}'; |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
41 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
42 |
/** |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
43 |
* The console command description. |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
44 |
* |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
45 |
* @var string |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
46 |
*/ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
47 |
protected $description = 'Index documents into ElasticSearch.'; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
48 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
49 |
/** |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
50 |
* Create a new command instance. |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
51 |
* |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
52 |
* @return void |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
53 |
*/ |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
54 |
public function __construct( |
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
55 |
DocumentRepository $documentRepository, |
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
56 |
Client $httpClient, |
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
57 |
BnfResolverInterface $bnfResolver, |
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
58 |
LexvoResolverInterface $lexvoResolver) |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
59 |
{ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
60 |
$this->documentRepository = $documentRepository; |
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
61 |
$this->bnfResolver = $bnfResolver; |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
62 |
$this->lexvoResolver = $lexvoResolver; |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
63 |
$this->httpClient = $httpClient; |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
64 |
parent::__construct(); |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
65 |
} |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
66 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
67 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
68 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
69 |
/** |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
70 |
* Reset Elasticsearch index |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
71 |
* |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
72 |
* @return int (1 if success, 0 if error) |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
73 |
*/ |
323 | 74 |
private function resetIndex() |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
75 |
{ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
76 |
$indexParams = [ |
406
cf0f23803a53
upgrade elasticsearch to 5.0, upgrade ember
ymh <ymh.work@gmail.com>
parents:
375
diff
changeset
|
77 |
'index' => config('elasticsearch.index') |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
78 |
]; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
79 |
if(Es::indices()->exists($indexParams)){ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
80 |
$response = Es::indices()->delete($indexParams); |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
81 |
if($response['acknowledged']!=1){ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
82 |
return 0; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
83 |
} |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
84 |
} |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
85 |
// Note: removed the "'store' => True" parameters on fields and use _source on record instead |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
86 |
|
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
87 |
$indexParams['body'] = [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
88 |
'settings' => [ |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
89 |
'number_of_shards' => config('elasticsearch.shards'), |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
90 |
'number_of_replicas' => config('elasticsearch.replicas'), |
320 | 91 |
'index.mapping.ignore_malformed' => True, |
92 |
'index.requests.cache.enable' => True |
|
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
93 |
], |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
94 |
'mappings' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
95 |
'document' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
96 |
'properties' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
97 |
'title' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
98 |
'type' => 'string', |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
99 |
'fields' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
100 |
'raw' => [ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
101 |
'type' => 'string', |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
102 |
'index' => 'not_analyzed' |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
103 |
] |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
104 |
] |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
105 |
], |
498
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
106 |
'description' => [ |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
107 |
'type' => 'string', |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
108 |
'fields' => [ |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
109 |
'french' => [ |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
110 |
'type' => 'string', |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
111 |
'analyzer' => 'french' |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
112 |
] |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
113 |
] |
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
114 |
], |
326
226d5b17a119
- First implementation of filter for languages.
ymh <ymh.work@gmail.com>
parents:
325
diff
changeset
|
115 |
'date' => [ 'type' => 'date', 'index' => 'not_analyzed'], |
369
796725d33b67
Add location filter to documents api end-point
ymh <ymh.work@gmail.com>
parents:
326
diff
changeset
|
116 |
'geonames_hierarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
497
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
117 |
'geonames_country' => ['type' => 'string', 'index' => 'not_analyzed'], |
326
226d5b17a119
- First implementation of filter for languages.
ymh <ymh.work@gmail.com>
parents:
325
diff
changeset
|
118 |
'location' => [ 'type' => 'geo_point'], |
226d5b17a119
- First implementation of filter for languages.
ymh <ymh.work@gmail.com>
parents:
325
diff
changeset
|
119 |
'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'], |
226d5b17a119
- First implementation of filter for languages.
ymh <ymh.work@gmail.com>
parents:
325
diff
changeset
|
120 |
'language' => ['type' => 'string', 'index' => 'not_analyzed'], |
226d5b17a119
- First implementation of filter for languages.
ymh <ymh.work@gmail.com>
parents:
325
diff
changeset
|
121 |
'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'], |
375
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
122 |
'creation_years' => [ |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
123 |
'type' => 'nested', |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
124 |
'properties' => [ |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
125 |
'year' => [ 'type' => 'short', 'index' => 'not_analyzed'], |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
126 |
'weight' => [ 'type' => 'float', 'index' => 'not_analyzed'], |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
127 |
] |
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
128 |
] , |
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
129 |
'subject' => [ |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
130 |
'type' => 'nested', |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
131 |
'properties' => [ |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
132 |
'label' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
133 |
'code' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
134 |
'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
135 |
] |
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
136 |
] |
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
137 |
] |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
138 |
] |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
139 |
] |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
140 |
]; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
141 |
$response = Es::indices()->create($indexParams); |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
142 |
if($response['acknowledged']!=1){ |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
143 |
return 0; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
144 |
} |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
145 |
return 1; |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
146 |
} |
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
147 |
|
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
148 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
149 |
private function getGeonamesHierarchyArray($geonamesid) { |
325 | 150 |
|
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
151 |
$hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first(); |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
152 |
if(is_null($hcache)) { |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
153 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
154 |
// TODO: add delay to respect geonames 2k request/hour |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
155 |
// TODO: manage errors |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
156 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
157 |
$apiBody = $this->httpClient->get( |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
158 |
config('corpusparole.geonames_hierarchy_webservice_url'), |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
159 |
[ 'query' => |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
160 |
[ 'geonameId' => $geonamesid, |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
161 |
'username' => config('corpusparole.geonames_username') ], |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
162 |
'accept' => 'application/json' // TODO: check this |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
163 |
] |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
164 |
)->getBody(); |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
165 |
$hjson = json_decode($apiBody); |
322
084aae09edf4
correction on importRDF documents + evolution theme controller
ymh <ymh.work@gmail.com>
parents:
321
diff
changeset
|
166 |
$hcache = new GeonamesHierarchy(); |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
167 |
$hcache->geonamesid = $geonamesid; |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
168 |
$hcache->hierarchy = $hjson; |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
169 |
$hcache->save(); |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
170 |
} |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
171 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
172 |
$res = []; |
497
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
173 |
$resCountry = null; |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
174 |
foreach($hcache->hierarchy['geonames'] as $hierarchyElem) { |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
175 |
if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) { |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
176 |
array_push($res, $hierarchyElem['geonameId']); |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
177 |
} |
497
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
178 |
if(!empty($hierarchyElem['fcode']) && strpos($hierarchyElem['fcode'], 'PCL') === 0) { |
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
179 |
$resCountry = $hierarchyElem['geonameId']; |
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
180 |
} |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
181 |
} |
497
f3474aeec884
add contry code in indexation, Serialize types, prepare #0025746
ymh <ymh.work@gmail.com>
parents:
496
diff
changeset
|
182 |
return [$resCountry, $res]; |
308
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
183 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
184 |
} |
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
185 |
|
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
186 |
/**
 * Get the geonames hierarchy data of a document.
 *
 * Every geonames location attached to the document geo info is resolved
 * through getGeonamesHierarchyArray() and the resulting ids are merged
 * into a single de-duplicated list.
 *
 * @param mixed $doc document exposing getGeoInfo()
 * @return array two-element array [country, ids]: country is the first non-empty
 *               country returned by getGeonamesHierarchyArray() (null when there is none),
 *               ids is the merged list of hierarchy ids ([] when there is no geo info).
 */
private function getGeonamesHierarchy($doc) {
    $geoRes = $doc->getGeoInfo();
    if(is_null($geoRes)) {
        return [null, []];
    }

    // aggregate hierarchy list from geonames results
    $res = [];
    // The country is the first one
    $resCountry = null;
    foreach($geoRes->getGeonamesLocs() as $gurl) {
        $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
        if(is_null($geonamesId)) {
            continue;
        }
        list($country, $hierarchyIds) = $this->getGeonamesHierarchyArray($geonamesId);
        $res = array_merge($res, $hierarchyIds);
        if(is_null($resCountry) && !empty($country)) {
            $resCountry = $country;
        }
    }

    // array_unique() preserves the original keys: re-index so that the result
    // stays a sequential list (a list with key gaps is json-encoded as an object).
    return [$resCountry, array_values(array_unique($res))];
}
e032d686d88e
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
ymh <ymh.work@gmail.com>
parents:
25
diff
changeset
|
213 |
|
24
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
214 |
/**
 * Build the subject entries of a document.
 *
 * Subjects are classified as:
 *  - bnf: Resource whose uri matches 'corpusparole.bnf_ark_url_regexp',
 *  - lexvo: Resource whose uri matches 'corpusparole.lexvo_url_regexp',
 *  - olac: Literal whose datatype uri starts with 'corpusparole.olac_base_url',
 *  - txt: any other Literal.
 * Any other subject is ignored.
 * The label of bnf and lexvo subjects is read from the resolvers results (indexed by uri),
 * for the other types the label is the literal value.
 *
 * @param mixed $doc document exposing getSubjects()
 * @return array list of [ 'label' => label, 'code' => "type|code", 'label_code' => "label|type|code" ]
 * @throws \Exception when the bnf labels are still not resolved after
 *                    'corpusparole.bnf_max_retries' (default 3) timeouts
 */
private function getSubjects($doc) {

    // 1. classify the subjects
    $sres = [];
    foreach($doc->getSubjects() as $s) {
        $matches = [];
        if($s instanceof Resource) {
            if(preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $matches) === 1) {
                $sres[] = [
                    'uri' => $matches[0],
                    'code' => $matches[1],
                    'type' => Utils::SUBJECT_TYPE_BNF
                ];
            } elseif(preg_match(config('corpusparole.lexvo_url_regexp'), $s->getUri(), $matches) === 1) {
                $sres[] = [
                    'uri' => $matches[0],
                    'code' => $matches[1],
                    'type' => Utils::SUBJECT_TYPE_LEXVO
                ];
            }
        } elseif($s instanceof Literal) {
            $isOlac = (strpos($s->getDatatypeUri(), config('corpusparole.olac_base_url')) === 0);
            $sres[] = [
                'uri' => $s->getValue(),
                'code' => $s->getValue(),
                'type' => $isOlac ? Utils::SUBJECT_TYPE_OLAC : Utils::SUBJECT_TYPE_TXT
            ];
        }
    }

    // collect the distinct uris of the subjects of a given type
    $urisOfType = function($type) use ($sres) {
        $uris = [];
        foreach($sres as $so) {
            if($so['type'] === $type) {
                $uris[] = $so['uri'];
            }
        }
        return array_unique($uris);
    };

    // 2. resolve the bnf labels, a timeout triggers a new attempt
    $bnfUris = $urisOfType(Utils::SUBJECT_TYPE_BNF);
    $labelsResolved = false;
    $timeoutRetries = 0;
    while(!$labelsResolved && $timeoutRetries < config('corpusparole.bnf_max_retries', 3)) {
        try {
            $labelsBnf = $this->bnfResolver->getLabels($bnfUris);
            $labelsResolved = true;
        } catch(BnfResolverTimeoutException $e) {
            Log::warning('IndexDocument: Resolve label timeout, will retry');
            $timeoutRetries++;
        }
    }
    if(!$labelsResolved) {
        Log::error("IndexDocument: Some bnf labels not resolved (retry timeout: $timeoutRetries).");
        $this->error("\nError resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
        throw new \Exception("Error resolving bnf labels (retry timeout: $timeoutRetries). Stopping");
    }

    // 3. resolve the lexvo names
    $labelsLexvo = $this->lexvoResolver->getNames($urisOfType(Utils::SUBJECT_TYPE_LEXVO));

    // 4. build the final entries
    $subjects = [];
    foreach($sres as $so) {
        $type = $so['type'];
        if($type === Utils::SUBJECT_TYPE_BNF) {
            $label = $labelsBnf[$so['uri']];
        } elseif($type === Utils::SUBJECT_TYPE_LEXVO) {
            $label = $labelsLexvo[$so['uri']];
        } else {
            $label = $so['uri'];
        }
        $code = $type."|".$so['code'];
        $subjects[] = [ 'label' => $label, 'code' => $code, 'label_code' => $label."|".$code ];
    }
    return $subjects;
}
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
302 |
|
325 | 303 |
/**
 * Read the wgs84 latitude and longitude literals of a location in a graph.
 *
 * @param string $loc uri of the location resource
 * @param mixed $graph graph exposing getLiteral()
 * @return array|null [ latitude, longitude ] literal values, null as soon as one of
 *                    the two literals is missing or has an empty value
 */
private function graphResolvCoordinate($loc, $graph) {
    $properties = [
        "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>",
        "<http://www.w3.org/2003/01/geo/wgs84_pos#long>",
    ];

    $coordinates = [];
    foreach($properties as $property) {
        $literal = $graph->getLiteral($loc, $property);
        // NOTE(review): empty() also rejects a "0" value (equator / prime meridian) - confirm this is wanted
        if(is_null($literal) || empty($literal->getValue())) {
            return null;
        }
        $coordinates[] = $literal->getValue();
    }

    return $coordinates;
}
|
318 |
||
319 |
/**
 * Download a document and parse it as an RDF graph.
 *
 * Guzzle transfer errors and EasyRdf exceptions are reported (console + log)
 * and turned into a null result.
 *
 * @param string $url url of the document to load, also used as the graph uri
 * @param string $type format name given to the Graph constructor (e.g. 'rdfxml')
 * @return Graph|null the parsed graph, null when the download or the parsing failed
 */
private function loadGraph($url, $type) {
    try {
        $r = $this->httpClient->get($url);
    } catch (TransferException $e) {
        $this->error("loadGraph : Error Loading $url");
        Log::error("loadGraph : Error Loading $url");
        // getRequest()/hasResponse() are only defined from RequestException downwards
        if ($e instanceof \GuzzleHttp\Exception\RequestException) {
            Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
            if ($e->hasResponse()) {
                $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
                Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
            }
        } else {
            Log::error("loadGraph : Error exception message " . $e->getMessage());
        }
        return null;
    }

    $message = (string)$r->getBody();
    try {
        return new Graph($url, $message, $type);
    } catch (\EasyRdf\Exception $e) {
        // The exception classes are fully qualified: a relative "EasyRdf\Exception"
        // would be resolved inside the current namespace and never match.
        $this->error("loadGraph : Error parsing $url");
        Log::error("loadGraph : Error parsing $url");
        if($e instanceof \EasyRdf\Parser\Exception) {
            // position in the parsed document (getLine() is the php source line)
            Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
        }
        $this->error("loadGraph : Error exception message ".$e->getMessage());
        Log::error("loadGraph : Error exception message ".$e->getMessage());
        Log::error("loadGraph : Error content $message");
        return null;
    }
}
|
349 |
||
350 |
/**
 * Resolve the coordinates of a geonames location, through the cache.
 *
 * On a cache miss the "{$loc}about.rdf" document is loaded and the result is cached
 * for 20 minutes. A failed resolution is cached as false.
 *
 * @param string $loc geonames location url
 * @return array|null [ latitude, longitude ] or null when not resolved
 */
private function geonamesResolveCoordinates($loc) {
    $cacheKey = "corpus.geonames.coord.$loc";

    $coords = cache($cacheKey);
    if(is_null($coords)) {
        $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
        if(!is_null($graph)) {
            $coords = $this->graphResolvCoordinate($loc, $graph);
        }
        // null can not be told apart from a cache miss: store false instead
        $cached = is_null($coords) ? false : $coords;
        cache([$cacheKey => $cached], Carbon::now()->addMinutes(20));
    }

    if($coords === false) {
        return null;
    }
    return $coords;
}
|
359 |
||
360 |
/**
 * Resolve the coordinates of a dbpedia location, through the cache.
 *
 * On a cache miss the "$loc.rdf" document is loaded and the result is cached
 * for 20 minutes. A failed resolution is cached as false.
 *
 * @param string $loc dbpedia data url
 * @return array|null [ latitude, longitude ] or null when not resolved
 */
private function dbpediaResolveCoordinates($loc) {
    $cacheKey = "corpus.dbpedia.coord.$loc";

    $coords = cache($cacheKey);
    if(is_null($coords)) {
        $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
        if(!is_null($graph)) {
            $coords = $this->graphResolvCoordinate($loc, $graph);
        }
        // null can not be told apart from a cache miss: store false instead
        $cached = is_null($coords) ? false : $coords;
        cache([$cacheKey => $cached], Carbon::now()->addMinutes(20));
    }

    if($coords === false) {
        return null;
    }
    return $coords;
}
|
369 |
||
370 |
/**
 * Get the coordinates of a document.
 *
 * The reference locations of the document are sorted by provider (geonames, dbpedia)
 * and resolved one after the other: the first resolved coordinates win.
 * When none can be resolved the latitude/longitude of the document geo info are used.
 *
 * @param mixed $doc document exposing getGeoInfo()
 * @return array|null [ latitude, longitude ] as floats, null when there is no geo info
 *                    or when one of the two coordinates is empty
 */
private function getLocation($doc) {

    $geoRes = $doc->getGeoInfo();

    if(is_null($geoRes)) {
        return null;
    }

    // group the resolvable urls by provider, in order of first appearance
    $locUrls = [];
    foreach($geoRes->getRefLocs() as $loc) {
        if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
            $locUrls['geonames'][] = "http://sws.geonames.org/$m[1]/";
        } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
            $locUrls['dbpedia'][] = "http://$md[1]/data/$md[4]";
        }
    }

    $coordinates = null;
    foreach($locUrls as $locType => $locList) {
        // geonamesResolveCoordinates or dbpediaResolveCoordinates
        $resolver = "{$locType}ResolveCoordinates";
        foreach($locList as $locationUrl) {
            $coordinates = $this->$resolver($locationUrl);
            if(!is_null($coordinates)) {
                // leave both loops: going on with the next provider would
                // overwrite the coordinates that have just been resolved
                break 2;
            }
        }
    }

    if(is_null($coordinates)) {
        $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
    }

    if(empty($coordinates[0]) || empty($coordinates[1])) {
        return null;
    }

    return [floatval($coordinates[0]), floatval($coordinates[1])];
}
|
417 |
||
418 |
/**
 * Get the creation date(s) of a document.
 *
 * @param mixed $doc document exposing getCreated()
 * @return mixed the processPeriod() result for a dcterms Period value, the processDate()
 *               result for a dcterms W3CDTF value, null when there is no creation date
 *               or when its datatype is neither of them
 */
private function getCreationDate($doc) {

    $created = $doc->getCreated();
    if(is_null($created)) {
        return null;
    }

    $dateType = $created->getDatatypeUri();

    if($dateType === "http://purl.org/dc/terms/Period") {
        return $this->processPeriod($created->getValue());
    }
    if($dateType === "http://purl.org/dc/terms/W3CDTF") {
        return $this->processDate($created->getValue());
    }

    return null;
}
|
437 |
||
438 |
/**
 * Parse a date string.
 *
 * @param string $dateStr date in a format understood by date_create(); a bare
 *                        4 digits year is read as the first of January of that year
 * @return \DateTime|null the parsed date, null (with a warning in the log) when
 *                        the string can not be parsed
 */
private function extractDate($dateStr) {
    if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
        $dateStr = "$dateStr-1-1";
    }

    $date = date_create($dateStr);
    if($date === false) {
        Log::warning("IndexDocument:extractDate bad format for date $dateStr");
        return null;
    }

    return $date;
}
|
449 |
||
375
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
450 |
/**
 * Expand a period ("start=...; end=...") into one date per year of the period.
 *
 * @param string $periodStr ";" separated elements, the "start=" and "end=" ones hold
 *                          dates parsed with extractDate()
 * @param bool $asDate when true the dates are returned as \DateTime objects,
 *                     otherwise as W3C formatted strings
 * @return array|null one entry per year from the start year to the end year (inclusive),
 *                    null when a bound is missing, can not be parsed or when start > end
 */
private function processPeriod($periodStr, $asDate=false) {
    $start = null;
    $end = null;

    foreach(explode(";", $periodStr) as $elem) {
        $elem = trim($elem);
        if(strpos($elem, 'start=') === 0) {
            $startDate = $this->extractDate(trim(substr($elem, 6)));
            if(is_null($startDate)) {
                return null;
            }
            $start = intval($startDate->format("Y"));
        } elseif(strpos($elem, 'end=') === 0) {
            $endDate = $this->extractDate(trim(substr($elem, 4)));
            if(is_null($endDate)) {
                return null;
            }
            $end = intval($endDate->format("Y"));
        }
    }

    if(is_null($start) || is_null($end) || $start > $end) {
        Log::warning("Bad format for $periodStr");
        return null;
    }

    return array_map(function($y) use ($asDate) {
        // The leading "!" resets the fields missing from the format (month, day, time):
        // without it they are filled with the current system time and the result
        // changes from one run to the other.
        $date = \DateTime::createFromFormat("!Y", "$y");
        return $asDate ? $date : $date->format(\DateTime::W3C);
    }, range($start, $end));
}
|
491 |
||
375
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
492 |
/**
 * Parse a single date with extractDate().
 *
 * @param string $dateStr the date to parse
 * @param bool $asDate when true the parsed date object is returned,
 *                     otherwise its W3C formatted string
 * @return mixed null when the date can not be parsed
 */
private function processDate($dateStr, $asDate=false) {
    $date = $this->extractDate($dateStr);

    // nothing to format: either the parsing failed or the object itself is wanted
    if(is_null($date) || $asDate) {
        return $date;
    }

    return $date->format(\DateTime::W3C);
}
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
505 |
|
145561ff51ff
change datestats to use elasticsearch
ymh <ymh.work@gmail.com>
parents:
369
diff
changeset
|
506 |
/**
 * Get the creation years of a document, with their weight.
 *
 * A dcterms Period creation date gives one entry per year of the period,
 * a dcterms W3CDTF one gives a single entry. Each entry weighs 1/(number of entries).
 *
 * @param mixed $doc document exposing getCreated()
 * @return array list of [ 'year' => int, 'weight' => number ], empty when the document
 *               has no creation date or when it can not be processed
 */
private function getCreationYears($doc) {
    $created = $doc->getCreated();
    if(is_null($created)) {
        return [];
    }

    $dates = null;
    $dateType = $created->getDatatypeUri();
    if($dateType === "http://purl.org/dc/terms/Period") {
        $dates = $this->processPeriod($created->getValue(), true);
    } elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
        $singleDate = $this->processDate($created->getValue(), true);
        if(!is_null($singleDate)) {
            $dates = [ $singleDate ];
        }
    }

    if(is_null($dates)) {
        return [];
    }

    // the weight is evenly shared between the years
    $weight = 1/count($dates);
    $years = [];
    foreach($dates as $d) {
        $years[] = [
            'year' => intval($d->format("Y")),
            'weight' => $weight
        ];
    }
    return $years;
}
535 |
||
536 |
/**
 * Get the discourse types of a document as strings.
 *
 * A Resource gives its uri, a Literal gives its value prefixed by "<datatype uri>#"
 * when it has a datatype. Empty values and other kinds of nodes are left out.
 *
 * @param mixed $doc document exposing getDiscourseTypes()
 * @return array list of strings
 */
private function getDiscourseTypes($doc) {
    $types = [];

    foreach($doc->getDiscourseTypes() as $d) {
        if($d instanceof Resource) {
            $val = $d->getUri();
        } elseif($d instanceof Literal) {
            $datatype = $d->getDatatypeURI();
            $prefix = empty($datatype) ? "" : "$datatype#";
            $val = $prefix.$d->getValue();
        } else {
            $val = null;
        }

        if(!empty($val)) {
            $types[] = $val;
        }
    }

    return $types;
}
|
551 |
||
498
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
552 |
/**
 * Collect the descriptions of a document as a flat list of strings.
 *
 * Plain strings are kept as they are, Literal entries contribute their
 * value. Entries of any other kind, and empty values, are left out.
 *
 * @param mixed $doc document object exposing getDescriptions()
 * @return array list of description strings
 */
private function getDescriptions($doc) {
    $descriptions = [];
    foreach($doc->getDescriptions() as $description) {
        if(is_string($description)) {
            $text = $description;
        } elseif($description instanceof Literal) {
            $text = $description->getValue();
        } else {
            // unsupported entry type: nothing to index
            $text = null;
        }
        if(empty($text)) {
            continue;
        }
        $descriptions[] = $text;
    }
    return $descriptions;
}
265992e5b379
Add description to documents interface and indexation. Prepare #0025746
ymh <ymh.work@gmail.com>
parents:
497
diff
changeset
|
566 |
|
325 | 567 |
/**
 * Build the Elasticsearch body for one document.
 *
 * The geonames country and hierarchy both come from a single call to
 * getGeonamesHierarchy(); every other field is produced by a document
 * accessor or by one of this command's helper methods.
 *
 * @param mixed $doc document object to serialize
 * @return array associative array of the fields to index
 */
private function getDocBody($doc) {
    $geonames = $this->getGeonamesHierarchy($doc);
    list($geonamesCountry, $geonamesHierarchy) = $geonames;

    $body = [];
    $body['title'] = (string)$doc->getTitle();
    $body['date'] = (string)$doc->getModified();
    $body['location'] = $this->getLocation($doc);
    $body['creation_date'] = $this->getCreationDate($doc);
    $body['creation_years'] = $this->getCreationYears($doc);
    $body['language'] = $doc->getLanguagesValue();
    $body['discourse_types'] = $this->getDiscourseTypes($doc);
    $body['geonames_country'] = $geonamesCountry;
    $body['geonames_hierarchy'] = $geonamesHierarchy;
    $body['subject'] = $this->getSubjects($doc);
    $body['description'] = $this->getDescriptions($doc);

    return $body;
}
|
583 |
||
321
aefaad270b9b
reimplement ThemeController using ES requests to be able to sort by label
ymh <ymh.work@gmail.com>
parents:
320
diff
changeset
|
584 |
/**
 * Index one document into Elasticsearch.
 *
 * Issues a single Es::index() call targeting the index named by the
 * 'elasticsearch.index' configuration entry, with type 'document'.
 * The result of Es::index() is discarded and nothing is returned.
 *
 * @param mixed $docId   identifier used as the Elasticsearch document id
 * @param mixed $docBody body of the document to index
 * @return void
 */
private function indexOne($docId, $docBody)
{
    Es::index([
        'index' => config('elasticsearch.index'),
        'type' => 'document',
        'id' => $docId,
        'body' => $docBody
    ]);
}
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
599 |
|
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
600 |
/**
 * Index multiple documents into Elasticsearch with one bulk request.
 *
 * For every entry, an "index" action line (index name, type 'document',
 * document id) is queued, immediately followed by the document body.
 * Nothing is sent when the given map is empty. The result of Es::bulk()
 * is discarded and nothing is returned.
 *
 * @param array $docBodies document bodies keyed by document id
 * @return void
 */
private function indexBulk($docBodies) {
    if(empty($docBodies)) {
        return;
    }

    $indexName = config('elasticsearch.index');
    $operations = [];
    foreach($docBodies as $docId => $docBody) {
        $action = [
            'index' => [
                '_index' => $indexName,
                '_type' => 'document',
                '_id' => $docId
            ]
        ];
        // action line first, then the matching document source
        array_push($operations, $action, $docBody);
    }

    Es::bulk(['body' => $operations]);
}
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
622 |
/**
 * Execute the console command.
 *
 * Prints the effective options, optionally empties the geonames hierarchy
 * cache, resets the Elasticsearch index, then walks the document
 * repository page by page and indexes every document: one request per
 * document when 'no-bulk' is set, otherwise one bulk request per page.
 *
 * When 'limit' is greater than zero, at most that many documents are
 * taken from the repository; the page being filled when the limit is
 * reached is still flushed to Elasticsearch.
 *
 * @return void
 */
public function handle()
{
    $this->info('Options:');
    $noBulk = $this->option('no-bulk');
    if ($noBulk) {
        $this->comment(' - Indexing without bulk insert');
    } else {
        $this->comment(' - Indexing using bulk insert');
    }

    // Normalise once so every later comparison works on an integer.
    $limit = (int)$this->option('limit');
    if ($limit > 0) {
        $this->comment(' - Indexing only the first '.$limit.' documents');
    }
    $stepSize = $this->option('step-size');
    $this->comment(' - Indexing with step size of '.$stepSize);

    if($this->option('reset-geo-cache')) {
        // delete all rows in GeonamesHierarchy
        GeonamesHierarchy::getQuery()->delete();
        $this->comment('Geonames cache reset!');
    }

    $this->info('Resetting index...');
    $success = $this->resetIndex();
    if($success==1) {
        $this->comment('Index reset!');
    } else {
        // NOTE(review): indexing proceeds even when the reset failed, as
        // before; confirm whether the command should abort here instead.
        $this->error('Error resetting index ' . config('elasticsearch.index'));
    }

    $this->info('Indexing documents...');

    $total = $this->documentRepository->getCount();
    if($limit > 0) {
        $total = min($limit, $total);
    }

    $progressBar = $this->output->createProgressBar($total);
    $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

    $page = 0;
    // placeholder until the first paginator reports the real last page
    $lastPage = 2147483647;
    $docIds = [];
    // number of documents taken from the repository so far (including
    // those that could not be loaded), used to enforce the limit
    $processed = 0;

    while($page++ < $lastPage) {
        $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph");
        $lastPage = $docsPaginator->lastPage();
        $docsBodies = [];
        foreach($docsPaginator as $docResult) {
            if($limit > 0 && $processed >= $limit) {
                break;
            }
            $processed++;

            $docId = (string)$docResult->getId();
            $progressBar->setMessage($docId);
            $progressBar->advance();
            $doc = $this->documentRepository->get($docId);
            if(is_null($doc)) {
                continue;
            }
            $docBody = $this->getDocBody($doc);
            if($noBulk) {
                $this->indexOne($docId, $docBody);
            } else {
                $docsBodies[$docId] = $docBody;
            }
            $docIds[] = $docId;
        }
        if(!$noBulk) {
            // also flushes the partial batch collected before the limit hit
            $this->indexBulk($docsBodies);
        }
        if($limit > 0 && $processed >= $limit) {
            // limit reached: do not fetch any further page
            break;
        }
    }
    $progressBar->finish();
    $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).").");
}
de47e8f66e8b
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
durandn
parents:
diff
changeset
|
705 |
} |