--- a/server/src/app/Console/Commands/IndexDocuments.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 02 11:49:00 2016 +0200
@@ -3,7 +3,10 @@
namespace CorpusParole\Console\Commands;
use Illuminate\Console\Command;
+use GuzzleHttp\Client;
use CorpusParole\Repositories\DocumentRepository;
+use CorpusParole\Libraries\CocoonUtils;
+use CorpusParole\Models\GeonamesHierarchy;
use Es;
class IndexDocuments extends Command
@@ -31,9 +34,10 @@
*
* @return void
*/
- public function __construct(DocumentRepository $documentRepository)
+ public function __construct(DocumentRepository $documentRepository, Client $httpClient)
{
$this->documentRepository = $documentRepository;
+ $this->httpClient = $httpClient;
parent::__construct();
}
@@ -55,10 +59,12 @@
return 0;
}
}
+ // Note: removed the "'store' => True" parameters on fields and use _source on record instead
+
$indexParams['body'] = [
'settings' => [
- 'number_of_shards' => conf('elasticsearch.shards'),
- 'number_of_replicas' => conf('elasticsearch.replicas'),
+ 'number_of_shards' => config('elasticsearch.shards'),
+ 'number_of_replicas' => config('elasticsearch.replicas'),
'index.mapping.ignore_malformed' => True
],
'mappings' => [
@@ -66,7 +72,6 @@
'properties' => [
'title' => [
'type' => 'string',
- 'store' => True,
'fields' => [
'raw' => [
'type' => 'string',
@@ -74,10 +79,10 @@
]
]
],
- 'date' => [
- 'type' => 'date',
- 'store' => True
- ]
+ 'date' => [ 'type' => 'date' ],
+ 'geonames_hierarchy' => [ 'type' => 'string' ], // must match the field name written by indexOne()
+ 'location' => [ 'type' => 'geo_point' ]
+ // TODO: add location information
]
]
]
@@ -89,20 +94,80 @@
return 1;
}
+
+    /**
+     * Get the geonames ids of the hierarchy elements of a place whose fcode is CONT, ADM1 or one of the PCL* codes below.
+     * The hierarchy comes from the GeonamesHierarchy cache, or from the geonames webservice on a cache miss.
+     * @param mixed $geonamesid geonames id of the place
+     * @return array list of geonames ids, [] when the webservice answer holds no 'geonames' list
+     */
+    private function getGeonamesHierarchyArray($geonamesid) {
+        // TODO: Manage this cache !!!
+        $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
+        if(is_null($hcache)) {
+            // TODO: add delay to respect geonames 2k request/hour
+            $apiBody = $this->httpClient->get(
+                config('corpusparole.geonames_hierarchy_webservice_url'),
+                [ 'query' => [ 'geonameId' => $geonamesid, 'username' => config('corpusparole.geonames_username') ],
+                  'headers' => [ 'Accept' => 'application/json' ] ] // 'accept' is not a Guzzle request option
+            )->getBody();
+            $hjson = json_decode((string)$apiBody, true); // associative array, as read below
+            if(!is_array($hjson) || !isset($hjson['geonames']) || !is_array($hjson['geonames'])) {
+                return []; // error or malformed answer: do not store it in the cache
+            }
+            $hcache = new GeonamesHierarchy;
+            $hcache->geonamesid = $geonamesid;
+            $hcache->hierarchy = $hjson;
+            $hcache->save();
+        }
+        $res = [];
+        foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
+            if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'], true)) {
+                array_push($res, $hierarchyElem['geonameId']);
+            }
+        }
+        return $res;
+    }
+
+    /**
+     * Collect the geonames hierarchy ids of every geonames location attached to a document.
+     * @param object $doc document exposing getGeoInfo()
+     * @return array de-duplicated, sequentially indexed list of geonames ids
+     */
+    private function getGeonamesHierarchy($doc) {
+        $geoRes = $doc->getGeoInfo();
+        if(is_null($geoRes)) {
+            return [];
+        }
+        $res = [];
+        foreach($geoRes->getGeonamesLocs() as $gurl) {
+            $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
+            if(is_null($geonamesId)) {
+                continue; // no geonames id could be extracted from this url
+            }
+            $res = array_merge($res, $this->getGeonamesHierarchyArray($geonamesId));
+        }
+        // array_unique() keeps the original keys: reindex so that the result is
+        // serialized as a JSON list and not as a JSON object
+        return array_values(array_unique($res));
+    }
+
/**
* Index one document into Elasticsearch
*
* @return int (1 if sucess, 0 if error)
*/
- private function indexOne($doc)
+ private function indexOne($resultDoc)
{
+ $doc = $this->documentRepository->get($resultDoc->getId());
$query_data = [
- 'index' => conf('elasticsearch.index'),
+ 'index' => config('elasticsearch.index'),
'type' => 'document',
'id' => (string)$doc->getId(),
'body' => [
'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified()
+ 'date' => (string)$doc->getModified(),
+ 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
]
];
Es::index($query_data);
@@ -119,7 +184,7 @@
foreach($docs as $doc){
$query_data['body'][] = [
'index' => [
- '_index' => conf('elasticsearch.index'),
+ '_index' => config('elasticsearch.index'),
'_type' => 'document',
'_id' => (string)$doc->getId()
]