diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 02 11:49:00 2016 +0200 @@ -3,7 +3,10 @@ namespace CorpusParole\Console\Commands; use Illuminate\Console\Command; +use GuzzleHttp\Client; use CorpusParole\Repositories\DocumentRepository; +use CorpusParole\Libraries\CocoonUtils; +use CorpusParole\Models\GeonamesHierarchy; use Es; class IndexDocuments extends Command @@ -31,9 +34,10 @@ * * @return void */ - public function __construct(DocumentRepository $documentRepository) + public function __construct(DocumentRepository $documentRepository, Client $httpClient) { $this->documentRepository = $documentRepository; + $this->httpClient = $httpClient; parent::__construct(); } @@ -55,10 +59,12 @@ return 0; } } + // Note: removed the "'store' => True" parameters on fields and use _source on record instead + $indexParams['body'] = [ 'settings' => [ - 'number_of_shards' => conf('elasticsearch.shards'), - 'number_of_replicas' => conf('elasticsearch.replicas'), + 'number_of_shards' => config('elasticsearch.shards'), + 'number_of_replicas' => config('elasticsearch.replicas'), 'index.mapping.ignore_malformed' => True ], 'mappings' => [ @@ -66,7 +72,6 @@ 'properties' => [ 'title' => [ 'type' => 'string', - 'store' => True, 'fields' => [ 'raw' => [ 'type' => 'string', @@ -74,10 +79,10 @@ ] ] ], - 'date' => [ - 'type' => 'date', - 'store' => True - ] + 'date' => [ 'type' => 'date' ], + 'geonames_hyerarchy' => [ 'type' => 'string' ], + 'location' => [ 'type' => 'geo_point' ] + // TODO: add location information ] ] ] @@ -89,20 +94,80 @@ return 1; } + + private function getGeonamesHierarchyArray($geonamesid) { + // TODO: Manage this cache !!! + $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first(); + if(is_null($hcache)) { + + // TODO: add delay to respect geonames 2k request/hour + // TODO: manage errors + + $apiBody = $this->httpClient->get( + config('corpusparole.geonames_hierarchy_webservice_url'), + [ 'query' => + [ 'geonameId' => $geonamesid, + 'username' => config('corpusparole.geonames_username') ], + 'accept' => 'application/json' // TODO: check this + ] + )->getBody(); + $hjson = json_decode($apiBody); + $hcache = new GeonamesHierarchy; + $hcache->geonamesid = $geonamesid; + $hcache->hierarchy = $hjson; + $hcache->save(); + } + + $res = []; + foreach($hcache->hierarchy['geonames'] as $hierarchyElem) { + if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) { + array_push($res, $hierarchyElem['geonameId']); + } + } + + return $res; + + } + + /** + * get geonames hierarchy data. + * @return array list of geonames ids + */ + private function getGeonamesHierarchy($doc) { + $geoRes = $doc->getGeoInfo(); + if(is_null($geoRes)) { + return []; + } + // aggregate hierachy list from geonames results + $res = []; + foreach($geoRes->getGeonamesLocs() as $gurl) { + $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl); + if(is_null($geonamesId)) { + continue; + } + $hierarchyIds = $this->getGeonamesHierarchyArray($geonamesId); + $res = array_unique(array_merge($res, $hierarchyIds)); + } + return $res; + + } + /** * Index one document into Elasticsearch * * @return int (1 if sucess, 0 if error) */ - private function indexOne($doc) + private function indexOne($resultDoc) { + $doc = $this->documentRepository->get($resultDoc->getId()); $query_data = [ - 'index' => conf('elasticsearch.index'), + 'index' => config('elasticsearch.index'), 'type' => 'document', 'id' => (string)$doc->getId(), 'body' => [ 'title' => (string)$doc->getTitle(), - 'date' => (string)$doc->getModified() + 'date' => (string)$doc->getModified(), + 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc) ] ]; Es::index($query_data); @@ -119,7 +184,7 @@ foreach($docs as $doc){ $query_data['body'][] = [ 'index' => [ - '_index' => conf('elasticsearch.index'), + '_index' => config('elasticsearch.index'), '_type' => 'document', '_id' => (string)$doc->getId() ]