server/src/app/Console/Commands/IndexDocuments.php
changeset 308 e032d686d88e
parent 25 4ce76c9e7729
child 320 0fce13da58af
--- a/server/src/app/Console/Commands/IndexDocuments.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 02 11:49:00 2016 +0200
@@ -3,7 +3,10 @@
 namespace CorpusParole\Console\Commands;
 
 use Illuminate\Console\Command;
+use GuzzleHttp\Client;
 use CorpusParole\Repositories\DocumentRepository;
+use CorpusParole\Libraries\CocoonUtils;
+use CorpusParole\Models\GeonamesHierarchy;
 use Es;
 
 class IndexDocuments extends Command
@@ -31,9 +34,10 @@
      *
      * @return void
      */
-    public function __construct(DocumentRepository $documentRepository)
+    public function __construct(DocumentRepository $documentRepository, Client $httpClient)
     {
         $this->documentRepository = $documentRepository;
+        $this->httpClient = $httpClient; // Guzzle client, used to query the geonames hierarchy web service
         parent::__construct();
     }
 
@@ -55,10 +59,12 @@
                 return 0;
             }
         }
+        // Note: removed the "'store' => True" parameters on fields and use _source on record instead
+
         $indexParams['body'] = [
             'settings' => [
-                'number_of_shards' => conf('elasticsearch.shards'),
-                'number_of_replicas' => conf('elasticsearch.replicas'),
+                'number_of_shards' => config('elasticsearch.shards'),
+                'number_of_replicas' => config('elasticsearch.replicas'),
                 'index.mapping.ignore_malformed' => True
             ],
             'mappings' => [
@@ -66,7 +72,6 @@
                     'properties' => [
                         'title' => [
                             'type' => 'string',
-                            'store' => True,
                             'fields' => [
                                 'raw' => [
                                     'type' => 'string',
@@ -74,10 +79,10 @@
                                 ]
                             ]
                         ],
-                        'date' => [
-                            'type' => 'date',
-                            'store' => True
-                        ]
+                        'date' => [ 'type' => 'date' ],
+                        'geonames_hyerarchy' => [ 'type' => 'string' ],
+                        'location' => [ 'type' => 'geo_point' ]
+                        // TODO: add location information
                     ]
                 ]
             ]
@@ -89,20 +94,80 @@
         return 1;
     }
 
+
+    /**
+     * Continent, country and ADM1 ancestors of a geonames place. Answers of the
+     * hierarchy web service are cached in GeonamesHierarchy; answers without a
+     * 'geonames' list (invalid JSON, error or quota status) are not cached.
+     * @param mixed $geonamesid geonames id of the place
+     * @return array list of geonames ids (empty when no hierarchy was obtained)
+     */
+    private function getGeonamesHierarchyArray($geonamesid) {
+        // TODO: Manage this cache !!!
+        $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
+        if(is_null($hcache)) {
+            // TODO: add delay to respect geonames 2k request/hour
+            $apiBody = $this->httpClient->get(
+                config('corpusparole.geonames_hierarchy_webservice_url'),
+                [ 'query' => [ 'geonameId' => $geonamesid,
+                               'username' => config('corpusparole.geonames_username') ],
+                  // 'accept' is not a Guzzle request option: send a real header
+                  'headers' => [ 'Accept' => 'application/json' ] ]
+            )->getBody();
+            $hjson = json_decode((string)$apiBody);
+            if(!isset($hjson->geonames)) { return []; } // TODO: report the error
+            $hcache = new GeonamesHierarchy;
+            $hcache->geonamesid = $geonamesid;
+            $hcache->hierarchy = $hjson;
+            $hcache->save();
+        }
+        $fcodes = ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'];
+        $res = [];
+        foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
+            if(in_array($hierarchyElem['fcode'], $fcodes, true)) { $res[] = $hierarchyElem['geonameId']; }
+        }
+        return $res;
+    }
+
+    /**
+     * Get geonames hierarchy data for all the geonames locations of a document.
+     * @param object $doc document exposing getGeoInfo()
+     * @return array list (0-indexed, without duplicates) of geonames ids
+     */
+    private function getGeonamesHierarchy($doc) {
+        $geoRes = $doc->getGeoInfo();
+        if(is_null($geoRes)) {
+            return [];
+        }
+        // aggregate hierarchy list from geonames results
+        $res = [];
+        foreach($geoRes->getGeonamesLocs() as $gurl) {
+            $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
+            if(is_null($geonamesId)) {
+                continue;
+            }
+            $res = array_merge($res, $this->getGeonamesHierarchyArray($geonamesId));
+        }
+        // array_unique keeps original keys: reindex so json_encode emits a JSON array
+        return array_values(array_unique($res));
+    }
+
     /**
      * Index one document into Elasticsearch
      *
      * @return int (1 if sucess, 0 if error)
      */
-    private function indexOne($doc)
+    private function indexOne($resultDoc)
     {
+        $doc = $this->documentRepository->get($resultDoc->getId());
         $query_data = [
-            'index' => conf('elasticsearch.index'),
+            'index' => config('elasticsearch.index'),
             'type' => 'document',
             'id' => (string)$doc->getId(),
             'body' => [
                 'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified()
+                'date' => (string)$doc->getModified(),
+                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
             ]
         ];
         Es::index($query_data);
@@ -119,7 +184,7 @@
           foreach($docs as $doc){
               $query_data['body'][] = [
                   'index' => [
-                      '_index' => conf('elasticsearch.index'),
+                      '_index' => config('elasticsearch.index'),
                       '_type' => 'document',
                       '_id' => (string)$doc->getId()
                   ]