# HG changeset patch # User ymh # Date 1475401740 -7200 # Node ID e032d686d88ed4746c8558670c36aa254a153975 # Parent 07b44a378ad8279218cfd437703f8a60ae7c4db3 add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver diff -r 07b44a378ad8 -r e032d686d88e cms/app-client/mirage/fixtures/geonames.js --- a/cms/app-client/mirage/fixtures/geonames.js Fri Sep 30 00:43:04 2016 +0200 +++ b/cms/app-client/mirage/fixtures/geonames.js Sun Oct 02 11:49:00 2016 +0200 @@ -748,5 +748,44 @@ { 'id': '6451977', 'label': 'Neuilly-sur-Seine' }, { 'id': '6427502', 'label': 'Saon' }, { 'id': '6455546', 'label': 'Le Breuil-en-Bessin' }, - { 'id': '6427527', 'label': 'Tourni\u00e8res' } -]; \ No newline at end of file + { 'id': '6427527', 'label': 'Tourni\u00e8res' }, + { 'id': '11071619', 'label': 'Bourgogne-Franche-Comt\u00e9'}, + { 'id': '11071620', 'label': 'Nouvelle-Aquitaine'}, + { 'id': '11071621', 'label': 'Normandy'}, + { 'id': '11071622', 'label': 'Grand-Est'}, + { 'id': '11071623', 'label': 'Occitania'}, + { 'id': '11071624', 'label': 'Hauts-de-France'}, + { 'id': '11071625', 'label': 'Auvergne-Rh\u00f4ne-Alpes'}, + { 'id': '11153151', 'label': 'Abidjan'}, + { 'id': '2139685', 'label': 'Nouvelle-Cal\u00e9donie'}, + { 'id': '2140464', 'label': 'South Province'}, + { 'id': '2140685', 'label': 'North Province'}, + { 'id': '2287781', 'label': 'C\u00f4te d\u2019Ivoire'}, + { 'id': '2861876', 'label': 'North Rhine-Westphalia'}, + { 'id': '2921044', 'label': 'Allemagne'}, + { 'id': '2985244', 'label': 'Provence-Alpes-C\u00f4te d\'Azur'}, + { 'id': '2988289', 'label': 'Pays de la Loire'}, + { 'id': '3012874', 'label': '\u00cele-de-France'}, + { 'id': '3017382', 'label': 'France'}, + { 'id': '3023519', 'label': 'R\u00e9gion Corse'}, + { 'id': '3027939', 'label': 'Centre'}, + { 'id': '3030293', 'label': 'Bretagne'}, + { 'id': '3381670', 'label': 'Guyane Fran\u00e7aise'}, + { 'id': '3382998', 'label': 'Surinam'}, + { 'id': '3383062', 'label': 'Sipaliwini'}, + { 'id': '3383329', 'label': 'Paramaribo'}, + { 'id': '3383560', 'label': 'Marowijne'}, + { 'id': '3463504', 'label': 'Federal District'}, + { 'id': '3469034', 'label': 'Brazil'}, + { 'id': '4736286', 'label': 'Texas'}, + { 'id': '6252001', 'label': '\u00c9tats-Unis'}, + { 'id': '6255146', 'label': 'Africa'}, + { 'id': '6255148', 'label': 'Europe'}, + { 'id': '6255149', 'label': 'North America'}, + { 'id': '6255150', 'label': 'South America'}, + { 'id': '6255151', 'label': 'Oceania'}, + { 'id': '6690605', 'label': 'Guyane'}, + { 'id': '934166', 'label': 'Plaines Wilhems District'}, + { 'id': '934292', 'label': '\u00cele Maurice'}, + { 'id': '935317', 'label': 'La R\u00e9union'} +]; diff -r 07b44a378ad8 -r e032d686d88e server/src/.env.example --- a/server/src/.env.example Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/.env.example Sun Oct 02 11:49:00 2016 +0200 @@ -59,3 +59,5 @@ HANDLE_TEST_DSA_KEY="" HANDLE_TEST_DSA_PASSWORD=NULL HANDLE_TEST_DSA_ADMIN_HANDLE="" + +GEONAMES_USERNAME="demo" diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 02 11:49:00 2016 +0200 @@ -3,7 +3,10 @@ namespace CorpusParole\Console\Commands; use Illuminate\Console\Command; +use GuzzleHttp\Client; use CorpusParole\Repositories\DocumentRepository; +use CorpusParole\Libraries\CocoonUtils; +use CorpusParole\Models\GeonamesHierarchy; use Es; class IndexDocuments extends Command @@ -31,9 +34,10 @@ * * @return void */ - public function __construct(DocumentRepository $documentRepository) + public function __construct(DocumentRepository $documentRepository, Client $httpClient) { $this->documentRepository = $documentRepository; + $this->httpClient = $httpClient; parent::__construct(); } @@ -55,10 +59,12 @@ return 0; } } + // Note: removed the "'store' => True" parameters on fields and use _source on record instead + $indexParams['body'] = [ 'settings' => [ - 'number_of_shards' => conf('elasticsearch.shards'), - 'number_of_replicas' => conf('elasticsearch.replicas'), + 'number_of_shards' => config('elasticsearch.shards'), + 'number_of_replicas' => config('elasticsearch.replicas'), 'index.mapping.ignore_malformed' => True ], 'mappings' => [ @@ -66,7 +72,6 @@ 'properties' => [ 'title' => [ 'type' => 'string', - 'store' => True, 'fields' => [ 'raw' => [ 'type' => 'string', @@ -74,10 +79,10 @@ ] ] ], - 'date' => [ - 'type' => 'date', - 'store' => True - ] + 'date' => [ 'type' => 'date' ], + 'geonames_hyerarchy' => [ 'type' => 'string' ], + 'location' => [ 'type' => 'geo_point' ] + // TODO: add location information ] ] ] @@ -89,20 +94,80 @@ return 1; } + + private function getGeonamesHierarchyArray($geonamesid) { + // TODO: Manage this cache !!! + $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first(); + if(is_null($hcache)) { + + // TODO: add delay to respect geonames 2k request/hour + // TODO: manage errors + + $apiBody = $this->httpClient->get( + config('corpusparole.geonames_hierarchy_webservice_url'), + [ 'query' => + [ 'geonameId' => $geonamesid, + 'username' => config('corpusparole.geonames_username') ], + 'accept' => 'application/json' // TODO: check this + ] + )->getBody(); + $hjson = json_decode($apiBody); + $hcache = new GeonamesHierarchy; + $hcache->geonamesid = $geonamesid; + $hcache->hierarchy = $hjson; + $hcache->save(); + } + + $res = []; + foreach($hcache->hierarchy['geonames'] as $hierarchyElem) { + if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) { + array_push($res, $hierarchyElem['geonameId']); + } + } + + return $res; + + } + + /** + * get geonames hierarchy data. + * @return array list of geonames ids + */ + private function getGeonamesHierarchy($doc) { + $geoRes = $doc->getGeoInfo(); + if(is_null($geoRes)) { + return []; + } + // aggregate hierachy list from geonames results + $res = []; + foreach($geoRes->getGeonamesLocs() as $gurl) { + $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl); + if(is_null($geonamesId)) { + continue; + } + $hierarchyIds = $this->getGeonamesHierarchyArray($geonamesId); + $res = array_unique(array_merge($res, $hierarchyIds)); + } + return $res; + + } + /** * Index one document into Elasticsearch * * @return int (1 if sucess, 0 if error) */ - private function indexOne($doc) + private function indexOne($resultDoc) { + $doc = $this->documentRepository->get($resultDoc->getId()); $query_data = [ - 'index' => conf('elasticsearch.index'), + 'index' => config('elasticsearch.index'), 'type' => 'document', 'id' => (string)$doc->getId(), 'body' => [ 'title' => (string)$doc->getTitle(), - 'date' => (string)$doc->getModified() + 'date' => (string)$doc->getModified(), + 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc) ] ]; Es::index($query_data); @@ -119,7 +184,7 @@ foreach($docs as $doc){ $query_data['body'][] = [ 'index' => [ - '_index' => conf('elasticsearch.index'), + '_index' => config('elasticsearch.index'), '_type' => 'document', '_id' => (string)$doc->getId() ] diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Http/Controllers/Api/GeoStatsController.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/server/src/app/Http/Controllers/Api/GeoStatsController.php Sun Oct 02 11:49:00 2016 +0200 @@ -0,0 +1,44 @@ + env('ELASTICSEARCH_INDEX'), + 'body' => [ + "size" => 0, + "aggs" => [ + "geos" => [ + "terms" => [ + "size" => 0, + "field" => "geonames_hierarchy" + ] + ] + ] + ] + ]; + $esRes = Es::search($query); + + $geosats = []; + + foreach($esRes['aggregations']['geos']['buckets'] as $bucket) { + $geosats[(string)($bucket['key'])] = $bucket['doc_count']; + } + + return response()->json(['geostats' => $geosats ]); + } +} diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Libraries/CocoonUtils.php --- a/server/src/app/Libraries/CocoonUtils.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/app/Libraries/CocoonUtils.php Sun Oct 02 11:49:00 2016 +0200 @@ -76,4 +76,14 @@ ); } + public static function getGeonamesidFromUrl($url) { + $matches = []; + if(preg_match(config('corpusparole.geonames_url_regexp'), $url, $matches) === 1) { + return $matches[1]; + } + else { + return null; + } + } + } diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Models/GeoResource.php --- a/server/src/app/Models/GeoResource.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/app/Models/GeoResource.php Sun Oct 02 11:49:00 2016 +0200 @@ -105,7 +105,9 @@ return is_null($long)?null:$long->getValue(); } - + public function getGeonamesLocs() { + return preg_grep(config('corpusparole.geonames_url_regexp'), $this->getRefLocs()); + } public function jsonSerialize() { $notes = array_map( @@ -175,4 +177,4 @@ } -} \ No newline at end of file +} diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Models/GeonamesHierarchy.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/server/src/app/Models/GeonamesHierarchy.php Sun Oct 02 11:49:00 2016 +0200 @@ -0,0 +1,17 @@ + 'array', + ]; +} diff -r 07b44a378ad8 -r e032d686d88e server/src/app/Services/GeonamesResolver.php --- a/server/src/app/Services/GeonamesResolver.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/app/Services/GeonamesResolver.php Sun Oct 02 11:49:00 2016 +0200 @@ -65,7 +65,7 @@ $label = isset($labels['fr']) ? $labels['fr'] : ( isset($labels[''])? $labels['']: null) ; if(is_null($label)) { - $labelLit = $graph->getLiteral("<$url>", ""); + $labelLit = $graph->getLiteral("<$url>", ""); $label = (!is_null($labelLit)) ? $labelLit->getValue() : null; } @@ -86,8 +86,9 @@ */ public function getLabel($id) { $geonamesid = $id; - if(strpos($id, config('corpusparole.geonames_base_url')) === 0) { - $geonamesid = substr($id, strlen(config('corpusparole.geonames_base_url'))); + $matches = []; + if( preg_match(config('corpusparole.geonames_url_regexp'), $id, $matches) === 1) { + $geonamesid = $matches[1]; } $geonamesid = rtrim($geonamesid, '/'); diff -r 07b44a378ad8 -r e032d686d88e server/src/config/constants.php --- a/server/src/config/constants.php Fri Sep 30 00:43:04 2016 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ - [0,0,0,'alpha',1], - -]; diff -r 07b44a378ad8 -r e032d686d88e server/src/config/corpusparole.php --- a/server/src/config/corpusparole.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/config/corpusparole.php Sun Oct 02 11:49:00 2016 +0200 @@ -125,8 +125,11 @@ 'bnf_completion_url' => 'http://data.bnf.fr/search-letter/', 'geonames_base_url' => 'http://sws.geonames.org/', + 'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/', 'geonames_cache_expiration' => 60*24*30, 'geonames_max_ids' => 500, + 'geonames_hierarchy_webservice_url' => 'http://api.geonames.org/hierarchyJSON', + 'geonames_username' => env('GEONAMES_USERNAME'), 'bo_client_environment' => [ "modulePrefix" => "bo-client", diff -r 07b44a378ad8 -r e032d686d88e server/src/database/migrations/2016_09_30_132045_create_geonames_hierarchies_table.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/server/src/database/migrations/2016_09_30_132045_create_geonames_hierarchies_table.php Sun Oct 02 11:49:00 2016 +0200 @@ -0,0 +1,33 @@ +increments('id'); + $table->timestamps(); + $table->string('geonamesid')->unique(); + $table->json('hierarchy'); + }); + } + + /** + * Reverse the migrations. + * + * @return void + */ + public function down() + { + Schema::dropIfExists('geonames_hierarchies'); + } +} diff -r 07b44a378ad8 -r e032d686d88e server/src/routes/api.php --- a/server/src/routes/api.php Fri Sep 30 00:43:04 2016 +0200 +++ b/server/src/routes/api.php Sun Oct 02 11:49:00 2016 +0200 @@ -40,5 +40,7 @@ ['only' => ['index']]); Route::resource('datestats', 'Api\DateStatsController', ['only' => ['index']]); + Route::resource('geostats', 'Api\GeoStatsController', + ['only' => ['index']]); }); }); diff -r 07b44a378ad8 -r e032d686d88e server/src/tests/Controllers/GeoStatsControllerTest.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/server/src/tests/Controllers/GeoStatsControllerTest.php Sun Oct 02 11:49:00 2016 +0200 @@ -0,0 +1,65 @@ + env('ELASTICSEARCH_INDEX'), + 'body' => [ + "size" => 0, + "aggs" => [ + "geos" => [ + "terms" => [ + "size" => 0, + "field" => "geonames_hierarchy" + ] + ] + ] + ] + ]; + + Es::shouldReceive('search') + ->once() + ->with($query) + ->andReturn(json_decode("{ + \"took\" : 17, + \"timed_out\" : false, + \"_shards\" : { + \"total\" : 1, + \"successful\" : 1, + \"failed\" : 0 + }, + \"hits\" : { + \"total\" : 3011, + \"max_score\" : 0.0, + \"hits\" : [ ] + }, + \"aggregations\" : { + \"geos\" : { + \"doc_count_error_upper_bound\" : 0, + \"sum_other_doc_count\" : 0, + \"buckets\" : [ { + \"key\" : 6255148, + \"doc_count\" : 2684 + }, { + \"key\" : 3017382, + \"doc_count\" : 2674 + }, { + \"key\" : 3027939, + \"doc_count\" : 851 + } ] + } + } +}", true)); + + $this->get('/api/v1/stats/geostats/')->assertTrue($this->response->isOk(), $this->response->content()); + $this->seeJsonEquals(["geostats" => [ + '6255148' => 2684, + '3017382' => 2674, + '3027939' => 851 + ]]); + } +}