add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
author ymh <ymh.work@gmail.com>
Sun, 02 Oct 2016 11:49:00 +0200
changeset 308 e032d686d88e
parent 307 07b44a378ad8
child 309 6ab16926b675
add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
cms/app-client/mirage/fixtures/geonames.js
server/src/.env.example
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Http/Controllers/Api/GeoStatsController.php
server/src/app/Libraries/CocoonUtils.php
server/src/app/Models/GeoResource.php
server/src/app/Models/GeonamesHierarchy.php
server/src/app/Services/GeonamesResolver.php
server/src/config/constants.php
server/src/config/corpusparole.php
server/src/database/migrations/2016_09_30_132045_create_geonames_hierarchies_table.php
server/src/routes/api.php
server/src/tests/Controllers/GeoStatsControllerTest.php
--- a/cms/app-client/mirage/fixtures/geonames.js	Fri Sep 30 00:43:04 2016 +0200
+++ b/cms/app-client/mirage/fixtures/geonames.js	Sun Oct 02 11:49:00 2016 +0200
@@ -748,5 +748,44 @@
     { 'id': '6451977', 'label': 'Neuilly-sur-Seine' },
     { 'id': '6427502', 'label': 'Saon' },
     { 'id': '6455546', 'label': 'Le Breuil-en-Bessin' },
-    { 'id': '6427527', 'label': 'Tourni\u00e8res' }
-];
\ No newline at end of file
+    { 'id': '6427527', 'label': 'Tourni\u00e8res' },
+    { 'id': '11071619', 'label': 'Bourgogne-Franche-Comt\u00e9'},
+    { 'id': '11071620', 'label': 'Nouvelle-Aquitaine'},
+    { 'id': '11071621', 'label': 'Normandy'},
+    { 'id': '11071622', 'label': 'Grand-Est'},
+    { 'id': '11071623', 'label': 'Occitania'},
+    { 'id': '11071624', 'label': 'Hauts-de-France'},
+    { 'id': '11071625', 'label': 'Auvergne-Rh\u00f4ne-Alpes'},
+    { 'id': '11153151', 'label': 'Abidjan'},
+    { 'id': '2139685', 'label': 'Nouvelle-Cal\u00e9donie'},
+    { 'id': '2140464', 'label': 'South Province'},
+    { 'id': '2140685', 'label': 'North Province'},
+    { 'id': '2287781', 'label': 'C\u00f4te d\u2019Ivoire'},
+    { 'id': '2861876', 'label': 'North Rhine-Westphalia'},
+    { 'id': '2921044', 'label': 'Allemagne'},
+    { 'id': '2985244', 'label': 'Provence-Alpes-C\u00f4te d\'Azur'},
+    { 'id': '2988289', 'label': 'Pays de la Loire'},
+    { 'id': '3012874', 'label': '\u00cele-de-France'},
+    { 'id': '3017382', 'label': 'France'},
+    { 'id': '3023519', 'label': 'R\u00e9gion Corse'},
+    { 'id': '3027939', 'label': 'Centre'},
+    { 'id': '3030293', 'label': 'Bretagne'},
+    { 'id': '3381670', 'label': 'Guyane Fran\u00e7aise'},
+    { 'id': '3382998', 'label': 'Surinam'},
+    { 'id': '3383062', 'label': 'Sipaliwini'},
+    { 'id': '3383329', 'label': 'Paramaribo'},
+    { 'id': '3383560', 'label': 'Marowijne'},
+    { 'id': '3463504', 'label': 'Federal District'},
+    { 'id': '3469034', 'label': 'Brazil'},
+    { 'id': '4736286', 'label': 'Texas'},
+    { 'id': '6252001', 'label': '\u00c9tats-Unis'},
+    { 'id': '6255146', 'label': 'Africa'},
+    { 'id': '6255148', 'label': 'Europe'},
+    { 'id': '6255149', 'label': 'North America'},
+    { 'id': '6255150', 'label': 'South America'},
+    { 'id': '6255151', 'label': 'Oceania'},
+    { 'id': '6690605', 'label': 'Guyane'},
+    { 'id': '934166', 'label': 'Plaines Wilhems District'},
+    { 'id': '934292', 'label': '\u00cele Maurice'},
+    { 'id': '935317', 'label': 'La R\u00e9union'}
+];
--- a/server/src/.env.example	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/.env.example	Sun Oct 02 11:49:00 2016 +0200
@@ -59,3 +59,5 @@
 HANDLE_TEST_DSA_KEY=""
 HANDLE_TEST_DSA_PASSWORD=NULL
 HANDLE_TEST_DSA_ADMIN_HANDLE=""
+
+GEONAMES_USERNAME="demo"
--- a/server/src/app/Console/Commands/IndexDocuments.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 02 11:49:00 2016 +0200
@@ -3,7 +3,10 @@
 namespace CorpusParole\Console\Commands;
 
 use Illuminate\Console\Command;
+use GuzzleHttp\Client;
 use CorpusParole\Repositories\DocumentRepository;
+use CorpusParole\Libraries\CocoonUtils;
+use CorpusParole\Models\GeonamesHierarchy;
 use Es;
 
 class IndexDocuments extends Command
@@ -31,9 +34,10 @@
      *
      * @return void
      */
-    public function __construct(DocumentRepository $documentRepository)
+    public function __construct(DocumentRepository $documentRepository, Client $httpClient)
     {
         $this->documentRepository = $documentRepository;
+        $this->httpClient = $httpClient;
         parent::__construct();
     }
 
@@ -55,10 +59,12 @@
                 return 0;
             }
         }
+        // Note: removed the "'store' => True" parameters on fields and use _source on record instead
+
         $indexParams['body'] = [
             'settings' => [
-                'number_of_shards' => conf('elasticsearch.shards'),
-                'number_of_replicas' => conf('elasticsearch.replicas'),
+                'number_of_shards' => config('elasticsearch.shards'),
+                'number_of_replicas' => config('elasticsearch.replicas'),
                 'index.mapping.ignore_malformed' => True
             ],
             'mappings' => [
@@ -66,7 +72,6 @@
                     'properties' => [
                         'title' => [
                             'type' => 'string',
-                            'store' => True,
                             'fields' => [
                                 'raw' => [
                                     'type' => 'string',
@@ -74,10 +79,10 @@
                                 ]
                             ]
                         ],
-                        'date' => [
-                            'type' => 'date',
-                            'store' => True
-                        ]
+                        'date' => [ 'type' => 'date' ],
+                        'geonames_hierarchy' => [ 'type' => 'string' ], // same field name as the one written by indexOne
+                        'location' => [ 'type' => 'geo_point' ]
+                        // TODO: add location information (indexOne does not fill the 'location' field yet)
                     ]
                 ]
             ]
@@ -89,20 +94,80 @@
         return 1;
     }
 
+
+    /**
+     * Get the ids of the elements of the geonames hierarchy of $geonamesid whose feature code is
+     * a continent (CONT), a political entity (PCL*) or a first-order administrative division (ADM1).
+     * The webservice response is cached in database (GeonamesHierarchy) and reused on later calls.
+     * @return array list of geonames ids, empty when the webservice gives no usable hierarchy
+     */
+    private function getGeonamesHierarchyArray($geonamesid) {
+        // TODO: manage this cache (expiration) + add delay to respect geonames 2k request/hour
+        $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
+        if(is_null($hcache)) {
+            $apiBody = $this->httpClient->get(
+                config('corpusparole.geonames_hierarchy_webservice_url'),
+                [ 'query' => [ 'geonameId' => $geonamesid, 'username' => config('corpusparole.geonames_username') ],
+                  'headers' => [ 'Accept' => 'application/json' ] ]
+            )->getBody();
+            $hjson = json_decode((string)$apiBody, true);
+            if(!is_array($hjson) || !isset($hjson['geonames']) || !is_array($hjson['geonames'])) {
+                return []; // invalid JSON or error payload: not cached, so that it is retried later
+            }
+            $hcache = new GeonamesHierarchy;
+            $hcache->geonamesid = $geonamesid;
+            $hcache->hierarchy = $hjson;
+            $hcache->save();
+        }
+        $keptFcodes = ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'];
+        $res = [];
+        foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
+            if(isset($hierarchyElem['fcode']) && in_array($hierarchyElem['fcode'], $keptFcodes, true)) {
+                $res[] = $hierarchyElem['geonameId'];
+            }
+        }
+        return $res;
+    }
+
+    /**
+     * Get the geonames hierarchy ids of all the geonames locations of a document.
+     * @param object $doc document exposing getGeoInfo()
+     * @return array list (sequential keys) of unique geonames ids, empty if no geo info
+     */
+    private function getGeonamesHierarchy($doc) {
+        $geoRes = $doc->getGeoInfo();
+        if(is_null($geoRes)) {
+            return [];
+        }
+        // aggregate hierarchy list from geonames results
+        $res = [];
+        foreach($geoRes->getGeonamesLocs() as $gurl) {
+            $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
+            if(!is_null($geonamesId)) {
+                $res = array_merge($res, $this->getGeonamesHierarchyArray($geonamesId));
+            }
+        }
+        // array_unique keeps the original keys: reindex so that the result is always
+        // serialized as a JSON array (never as an object) in the Elasticsearch body
+        return array_values(array_unique($res));
+    }
+
     /**
      * Index one document into Elasticsearch
      *
      * @return int (1 if sucess, 0 if error)
      */
-    private function indexOne($doc)
+    private function indexOne($resultDoc)
     {
+        $doc = $this->documentRepository->get($resultDoc->getId());
         $query_data = [
-            'index' => conf('elasticsearch.index'),
+            'index' => config('elasticsearch.index'),
             'type' => 'document',
             'id' => (string)$doc->getId(),
             'body' => [
                 'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified()
+                'date' => (string)$doc->getModified(),
+                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
             ]
         ];
         Es::index($query_data);
@@ -119,7 +184,7 @@
           foreach($docs as $doc){
               $query_data['body'][] = [
                   'index' => [
-                      '_index' => conf('elasticsearch.index'),
+                      '_index' => config('elasticsearch.index'),
                       '_type' => 'document',
                       '_id' => (string)$doc->getId()
                   ]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Http/Controllers/Api/GeoStatsController.php	Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,44 @@
+<?php
+
+namespace CorpusParole\Http\Controllers\Api;
+
+use Illuminate\Http\Request;
+
+use CorpusParole\Http\Requests;
+use CorpusParole\Http\Controllers\Controller;
+use Es;
+
+class GeoStatsController extends Controller
+{
+    /**
+     * Count the documents of the index for each value of their geonames_hierarchy field.
+     * NOTE(review): env() returns null once the configuration is cached; consider reading
+     * config('elasticsearch.index') as IndexDocuments does - TODO confirm both resolve the same.
+     *
+     * @return \Illuminate\Http\Response json: { "geostats": { <geonames id>: <doc count>, ... } }
+     */
+    public function index(Request $request)
+    {
+        // request "size" 0: only the aggregation is of interest, not the hits
+        // terms "size" 0: presumably "all the buckets" for the targeted Elasticsearch version
+        $termsAgg = [ "size" => 0, "field" => "geonames_hierarchy" ];
+        $esRes = Es::search([
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'body' => [
+                "size" => 0,
+                "aggs" => [
+                    "geos" => [ "terms" => $termsAgg ]
+                ]
+            ]
+        ]);
+
+        // flatten the buckets into a map: key (as string) => doc_count
+        $countsByGeonamesId = [];
+        $buckets = $esRes['aggregations']['geos']['buckets'];
+        foreach($buckets as $geoBucket) {
+            $countsByGeonamesId[(string)($geoBucket['key'])] = $geoBucket['doc_count'];
+        }
+
+        return response()->json(['geostats' => $countsByGeonamesId ]);
+    }
+}
--- a/server/src/app/Libraries/CocoonUtils.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Libraries/CocoonUtils.php	Sun Oct 02 11:49:00 2016 +0200
@@ -76,4 +76,14 @@
         );
     }
 
+    /**
+     * Extract the geonames id from a geonames url.
+     * @param string $url
+     * @return string|null first group captured by corpusparole.geonames_url_regexp, null if no match
+     */
+    public static function getGeonamesidFromUrl($url) {
+        $found = preg_match(config('corpusparole.geonames_url_regexp'), $url, $groups);
+        return ($found === 1) ? $groups[1] : null;
+    }
+
 }
--- a/server/src/app/Models/GeoResource.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Models/GeoResource.php	Sun Oct 02 11:49:00 2016 +0200
@@ -105,7 +105,9 @@
         return is_null($long)?null:$long->getValue();
     }
 
-
+    public function getGeonamesLocs() { // entries of getRefLocs() matching the geonames url regexp
+        return preg_grep(config('corpusparole.geonames_url_regexp'), $this->getRefLocs());
+    }
 
     public function jsonSerialize() {
         $notes = array_map(
@@ -175,4 +177,4 @@
     }
 
 
-}
\ No newline at end of file
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Models/GeonamesHierarchy.php	Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,17 @@
+<?php
+
+namespace CorpusParole\Models;
+
+use Illuminate\Database\Eloquent\Model;
+
+/**
+ * Eloquent model holding, for one geonames id, the hierarchy data stored by the indexation command.
+ */
+class GeonamesHierarchy extends Model
+{
+    /**
+     * Attribute casts: the "hierarchy" attribute is read and written as a php array.
+     * @var array
+     */
+    protected $casts = [ 'hierarchy' => 'array' ];
+}
--- a/server/src/app/Services/GeonamesResolver.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Services/GeonamesResolver.php	Sun Oct 02 11:49:00 2016 +0200
@@ -65,7 +65,7 @@
         $label = isset($labels['fr']) ? $labels['fr'] : ( isset($labels[''])? $labels['']: null) ;
 
         if(is_null($label)) {
-            $labelLit = $graph->getLiteral("<$url>", "<http://www.geonames.org/ontology#name>");            
+            $labelLit = $graph->getLiteral("<$url>", "<http://www.geonames.org/ontology#name>");
             $label = (!is_null($labelLit)) ? $labelLit->getValue() : null;
         }
 
@@ -86,8 +86,9 @@
      */
     public function getLabel($id) {
         $geonamesid = $id;
-        if(strpos($id, config('corpusparole.geonames_base_url')) === 0) {
-            $geonamesid = substr($id, strlen(config('corpusparole.geonames_base_url')));
+        $matches = [];
+        if( preg_match(config('corpusparole.geonames_url_regexp'), $id, $matches) === 1) {
+            $geonamesid = $matches[1];
         }
         $geonamesid = rtrim($geonamesid, '/');
 
--- a/server/src/config/constants.php	Fri Sep 30 00:43:04 2016 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-<?php
-
-return [
-
-    'VERSION' => [0,0,0,'alpha',1],
-
-];
--- a/server/src/config/corpusparole.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/config/corpusparole.php	Sun Oct 02 11:49:00 2016 +0200
@@ -125,8 +125,11 @@
     'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
 
     'geonames_base_url' => 'http://sws.geonames.org/',
+    'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/',
     'geonames_cache_expiration' => 60*24*30,
     'geonames_max_ids' => 500,
+    'geonames_hierarchy_webservice_url' => 'http://api.geonames.org/hierarchyJSON',
+    'geonames_username' => env('GEONAMES_USERNAME'),
 
     'bo_client_environment' => [
         "modulePrefix" => "bo-client",
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/database/migrations/2016_09_30_132045_create_geonames_hierarchies_table.php	Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,33 @@
+<?php
+
+use Illuminate\Support\Facades\Schema;
+use Illuminate\Database\Schema\Blueprint;
+use Illuminate\Database\Migrations\Migration;
+
+class CreateGeonamesHierarchiesTable extends Migration
+{
+    /**
+     * Create the geonames_hierarchies table: a json hierarchy per unique geonames id.
+     *
+     * @return void
+     */
+    public function up()
+    {
+        $columns = function (Blueprint $hierarchies) {
+            $hierarchies->increments('id');
+            $hierarchies->timestamps();
+            $hierarchies->string('geonamesid')->unique();
+            $hierarchies->json('hierarchy');
+        };
+        Schema::create('geonames_hierarchies', $columns);
+    }
+
+    /**
+     * Drop the geonames_hierarchies table (nothing is done if it does not exist).
+     * @return void
+     */
+    public function down()
+    {
+        Schema::dropIfExists('geonames_hierarchies');
+    }
+}
--- a/server/src/routes/api.php	Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/routes/api.php	Sun Oct 02 11:49:00 2016 +0200
@@ -40,5 +40,7 @@
                         ['only' => ['index']]);
         Route::resource('datestats', 'Api\DateStatsController',
                         ['only' => ['index']]);
+        Route::resource('geostats', 'Api\GeoStatsController',
+                        ['only' => ['index']]);
     });
 });
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/tests/Controllers/GeoStatsControllerTest.php	Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,65 @@
+<?php
+
+use Es;
+
+class GeoStatsControllerTest extends TestCase
+{
+    public function testGetIndex()
+    {
+        $expectedQuery = [
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'body' => [
+                'size' => 0,
+                'aggs' => [
+                    'geos' => [
+                        'terms' => [
+                            'size' => 0,
+                            'field' => 'geonames_hierarchy'
+                        ]
+                    ]
+                ]
+            ]
+        ];
+        // canned Elasticsearch answer: 3 buckets in the "geos" terms aggregation
+        $esAnswer = json_decode('{
+  "took" : 17,
+  "timed_out" : false,
+  "_shards" : {
+    "total" : 1,
+    "successful" : 1,
+    "failed" : 0
+  },
+  "hits" : {
+    "total" : 3011,
+    "max_score" : 0.0,
+    "hits" : [ ]
+  },
+  "aggregations" : {
+    "geos" : {
+      "doc_count_error_upper_bound" : 0,
+      "sum_other_doc_count" : 0,
+      "buckets" : [ {
+        "key" : 6255148,
+        "doc_count" : 2684
+      }, {
+        "key" : 3017382,
+        "doc_count" : 2674
+      }, {
+        "key" : 3027939,
+        "doc_count" : 851
+      } ]
+    }
+  }
+}', true);
+
+        Es::shouldReceive('search')->once()->with($expectedQuery)->andReturn($esAnswer);
+
+        $this->get('/api/v1/stats/geostats/');
+        $this->assertTrue($this->response->isOk(), $this->response->content());
+        $this->seeJsonEquals(['geostats' => [
+            '6255148' => 2684,
+            '3017382' => 2674,
+            '3027939' => 851
+        ]]);
+    }
+}