add hierarchy info in document indexation + geostats api controllers + add some keys to geonames resolver
--- a/cms/app-client/mirage/fixtures/geonames.js Fri Sep 30 00:43:04 2016 +0200
+++ b/cms/app-client/mirage/fixtures/geonames.js Sun Oct 02 11:49:00 2016 +0200
@@ -748,5 +748,44 @@
{ 'id': '6451977', 'label': 'Neuilly-sur-Seine' },
{ 'id': '6427502', 'label': 'Saon' },
{ 'id': '6455546', 'label': 'Le Breuil-en-Bessin' },
- { 'id': '6427527', 'label': 'Tourni\u00e8res' }
-];
\ No newline at end of file
+ { 'id': '6427527', 'label': 'Tourni\u00e8res' },
+ { 'id': '11071619', 'label': 'Bourgogne-Franche-Comt\u00e9'},
+ { 'id': '11071620', 'label': 'Nouvelle-Aquitaine'},
+ { 'id': '11071621', 'label': 'Normandy'},
+ { 'id': '11071622', 'label': 'Grand-Est'},
+ { 'id': '11071623', 'label': 'Occitania'},
+ { 'id': '11071624', 'label': 'Hauts-de-France'},
+ { 'id': '11071625', 'label': 'Auvergne-Rh\u00f4ne-Alpes'},
+ { 'id': '11153151', 'label': 'Abidjan'},
+ { 'id': '2139685', 'label': 'Nouvelle-Cal\u00e9donie'},
+ { 'id': '2140464', 'label': 'South Province'},
+ { 'id': '2140685', 'label': 'North Province'},
+ { 'id': '2287781', 'label': 'C\u00f4te d\u2019Ivoire'},
+ { 'id': '2861876', 'label': 'North Rhine-Westphalia'},
+ { 'id': '2921044', 'label': 'Allemagne'},
+ { 'id': '2985244', 'label': 'Provence-Alpes-C\u00f4te d\'Azur'},
+ { 'id': '2988289', 'label': 'Pays de la Loire'},
+ { 'id': '3012874', 'label': '\u00cele-de-France'},
+ { 'id': '3017382', 'label': 'France'},
+ { 'id': '3023519', 'label': 'R\u00e9gion Corse'},
+ { 'id': '3027939', 'label': 'Centre'},
+ { 'id': '3030293', 'label': 'Bretagne'},
+ { 'id': '3381670', 'label': 'Guyane Fran\u00e7aise'},
+ { 'id': '3382998', 'label': 'Surinam'},
+ { 'id': '3383062', 'label': 'Sipaliwini'},
+ { 'id': '3383329', 'label': 'Paramaribo'},
+ { 'id': '3383560', 'label': 'Marowijne'},
+ { 'id': '3463504', 'label': 'Federal District'},
+ { 'id': '3469034', 'label': 'Brazil'},
+ { 'id': '4736286', 'label': 'Texas'},
+ { 'id': '6252001', 'label': '\u00c9tats-Unis'},
+ { 'id': '6255146', 'label': 'Africa'},
+ { 'id': '6255148', 'label': 'Europe'},
+ { 'id': '6255149', 'label': 'North America'},
+ { 'id': '6255150', 'label': 'South America'},
+ { 'id': '6255151', 'label': 'Oceania'},
+ { 'id': '6690605', 'label': 'Guyane'},
+ { 'id': '934166', 'label': 'Plaines Wilhems District'},
+ { 'id': '934292', 'label': '\u00cele Maurice'},
+ { 'id': '935317', 'label': 'La R\u00e9union'}
+];
--- a/server/src/.env.example Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/.env.example Sun Oct 02 11:49:00 2016 +0200
@@ -59,3 +59,5 @@
HANDLE_TEST_DSA_KEY=""
HANDLE_TEST_DSA_PASSWORD=NULL
HANDLE_TEST_DSA_ADMIN_HANDLE=""
+
+GEONAMES_USERNAME="demo"
--- a/server/src/app/Console/Commands/IndexDocuments.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 02 11:49:00 2016 +0200
@@ -3,7 +3,10 @@
namespace CorpusParole\Console\Commands;
use Illuminate\Console\Command;
+use GuzzleHttp\Client;
use CorpusParole\Repositories\DocumentRepository;
+use CorpusParole\Libraries\CocoonUtils;
+use CorpusParole\Models\GeonamesHierarchy;
use Es;
class IndexDocuments extends Command
@@ -31,9 +34,10 @@
*
* @return void
*/
- public function __construct(DocumentRepository $documentRepository)
+ public function __construct(DocumentRepository $documentRepository, Client $httpClient)
{
$this->documentRepository = $documentRepository;
+ $this->httpClient = $httpClient;
parent::__construct();
}
@@ -55,10 +59,12 @@
return 0;
}
}
+ // Note: removed the "'store' => True" parameters on fields and use _source on record instead
+
$indexParams['body'] = [
'settings' => [
- 'number_of_shards' => conf('elasticsearch.shards'),
- 'number_of_replicas' => conf('elasticsearch.replicas'),
+ 'number_of_shards' => config('elasticsearch.shards'),
+ 'number_of_replicas' => config('elasticsearch.replicas'),
'index.mapping.ignore_malformed' => True
],
'mappings' => [
@@ -66,7 +72,6 @@
'properties' => [
'title' => [
'type' => 'string',
- 'store' => True,
'fields' => [
'raw' => [
'type' => 'string',
@@ -74,10 +79,10 @@
]
]
],
- 'date' => [
- 'type' => 'date',
- 'store' => True
- ]
+                        'date' => [ 'type' => 'date' ],
+                        'geonames_hierarchy' => [ 'type' => 'string' ], // same key as the field sent by indexOne()
+                        'location' => [ 'type' => 'geo_point' ]
+                        // TODO: add location information (indexOne() does not send a 'location' value yet)
]
]
]
@@ -89,20 +94,80 @@
return 1;
}
+
+    /**
+     * List the geonames ids of the continent (CONT), political entity (PCL*) and
+     * ADM1 entries found in the hierarchy of a place. Hierarchies are cached through
+     * the GeonamesHierarchy model; a reply without a 'geonames' list is never cached.
+     * @param string $geonamesid geonames id of the place
+     * @return array list of geonames ids
+     * @throws \RuntimeException when the webservice returns no hierarchy
+     */
+    private function getGeonamesHierarchyArray($geonamesid) {
+        // TODO: manage cache expiration, add delay to respect geonames 2k request/hour
+        $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
+        $hierarchy = is_null($hcache) ? null : $hcache->hierarchy;
+        if(!isset($hierarchy['geonames'])) { // cache miss, or unusable row cached earlier
+            $apiBody = (string)$this->httpClient->get(
+                config('corpusparole.geonames_hierarchy_webservice_url'),
+                [ 'query' => [ 'geonameId' => $geonamesid,
+                               'username' => config('corpusparole.geonames_username') ],
+                  'headers' => [ 'Accept' => 'application/json' ] ]
+            )->getBody();
+            $hierarchy = json_decode($apiBody, true);
+            if(!isset($hierarchy['geonames'])) {
+                throw new \RuntimeException("No geonames hierarchy for $geonamesid: $apiBody");
+            }
+            $hcache = is_null($hcache) ? new GeonamesHierarchy : $hcache;
+            $hcache->geonamesid = $geonamesid;
+            $hcache->hierarchy = $hierarchy;
+            $hcache->save();
+        }
+        $fcodes = ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'];
+        return array_column(array_filter($hierarchy['geonames'], function($elem) use ($fcodes) {
+            return isset($elem['fcode']) && in_array($elem['fcode'], $fcodes, true);
+        }), 'geonameId');
+    }
+
+    /**
+     * Collect the geonames hierarchy ids of every geonames location of a document.
+     * Locations whose url yields no geonames id are ignored.
+     * @param mixed $doc document exposing getGeoInfo()
+     * @return array de-duplicated, sequentially indexed list of geonames ids
+     */
+    private function getGeonamesHierarchy($doc) {
+        $geoRes = $doc->getGeoInfo();
+        if(is_null($geoRes)) {
+            return [];
+        }
+        $res = [];
+        foreach($geoRes->getGeonamesLocs() as $gurl) {
+            $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
+            if(!is_null($geonamesId)) {
+                $res = array_merge($res, $this->getGeonamesHierarchyArray($geonamesId));
+            }
+        }
+        // array_unique() keeps the original keys: without array_values() the gaps would
+        // make json_encode() emit an object instead of a list in the indexed document
+        return array_values(array_unique($res));
+    }
+
/**
* Index one document into Elasticsearch
*
* @return int (1 if sucess, 0 if error)
*/
- private function indexOne($doc)
+ private function indexOne($resultDoc)
{
+ $doc = $this->documentRepository->get($resultDoc->getId());
$query_data = [
- 'index' => conf('elasticsearch.index'),
+ 'index' => config('elasticsearch.index'),
'type' => 'document',
'id' => (string)$doc->getId(),
'body' => [
'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified()
+ 'date' => (string)$doc->getModified(),
+ 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
]
];
Es::index($query_data);
@@ -119,7 +184,7 @@
foreach($docs as $doc){
$query_data['body'][] = [
'index' => [
- '_index' => conf('elasticsearch.index'),
+ '_index' => config('elasticsearch.index'),
'_type' => 'document',
'_id' => (string)$doc->getId()
]
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Http/Controllers/Api/GeoStatsController.php Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,44 @@
+<?php
+
+namespace CorpusParole\Http\Controllers\Api;
+
+use Illuminate\Http\Request;
+
+use CorpusParole\Http\Requests;
+use CorpusParole\Http\Controllers\Controller;
+use Es;
+
+class GeoStatsController extends Controller
+{
+    /**
+     * Count the indexed documents attached to each geonames id (terms aggregation
+     * on the "geonames_hierarchy" field of the Elasticsearch index).
+     *
+     * @param \Illuminate\Http\Request $request not read by this action
+     * @return \Illuminate\Http\Response json {"geostats": {"<geonames id>": <document count>}}
+     */
+    public function index(Request $request)
+    {
+        $query = [
+            // config() like the indexing command: env() returns null once the config is cached
+            'index' => config('elasticsearch.index'),
+            'body' => [
+                "size" => 0, // only the aggregation is needed, no hits
+                "aggs" => [
+                    // NOTE(review): terms size 0 presumably means "all buckets" for the ES version in use
+                    "geos" => [
+                        "terms" => [ "size" => 0, "field" => "geonames_hierarchy" ]
+                    ]
+                ]
+            ]
+        ];
+        $esRes = Es::search($query);
+
+        $geostats = [];
+        foreach($esRes['aggregations']['geos']['buckets'] as $bucket) {
+            $geostats[(string)($bucket['key'])] = $bucket['doc_count'];
+        }
+
+        return response()->json(['geostats' => $geostats ]);
+    }
+}
--- a/server/src/app/Libraries/CocoonUtils.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Libraries/CocoonUtils.php Sun Oct 02 11:49:00 2016 +0200
@@ -76,4 +76,14 @@
);
}
+    /**
+     * Extract the geonames id from a url, using 'corpusparole.geonames_url_regexp'.
+     * @param string $url
+     * @return string|null first capture group of the regexp, null when the url does not match
+     */
+    public static function getGeonamesidFromUrl($url) {
+        $isGeonames = preg_match(config('corpusparole.geonames_url_regexp'), $url, $groups) === 1;
+        return $isGeonames ? $groups[1] : null;
+    }
+
}
--- a/server/src/app/Models/GeoResource.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Models/GeoResource.php Sun Oct 02 11:49:00 2016 +0200
@@ -105,7 +105,9 @@
return is_null($long)?null:$long->getValue();
}
-
+    public function getGeonamesLocs() { // keeps only the getRefLocs() entries matching the geonames url regexp
+        return preg_grep(config('corpusparole.geonames_url_regexp'), $this->getRefLocs()); // preg_grep preserves keys
+    }
public function jsonSerialize() {
$notes = array_map(
@@ -175,4 +177,4 @@
}
-}
\ No newline at end of file
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Models/GeonamesHierarchy.php Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,17 @@
+<?php
+
+namespace CorpusParole\Models;
+
+use Illuminate\Database\Eloquent\Model;
+
+/**
+ * Eloquent model holding the geonames hierarchy stored for one geonamesid
+ * (rows are written by the document indexing command).
+ */
+class GeonamesHierarchy extends Model
+{
+    /**
+     * @var array native type casts: 'hierarchy' is handled as a PHP array
+     */
+    protected $casts = [ 'hierarchy' => 'array' ];
+}
--- a/server/src/app/Services/GeonamesResolver.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/app/Services/GeonamesResolver.php Sun Oct 02 11:49:00 2016 +0200
@@ -65,7 +65,7 @@
$label = isset($labels['fr']) ? $labels['fr'] : ( isset($labels[''])? $labels['']: null) ;
if(is_null($label)) {
- $labelLit = $graph->getLiteral("<$url>", "<http://www.geonames.org/ontology#name>");
+ $labelLit = $graph->getLiteral("<$url>", "<http://www.geonames.org/ontology#name>");
$label = (!is_null($labelLit)) ? $labelLit->getValue() : null;
}
@@ -86,8 +86,9 @@
*/
public function getLabel($id) {
$geonamesid = $id;
- if(strpos($id, config('corpusparole.geonames_base_url')) === 0) {
- $geonamesid = substr($id, strlen(config('corpusparole.geonames_base_url')));
+ $matches = [];
+ if( preg_match(config('corpusparole.geonames_url_regexp'), $id, $matches) === 1) {
+ $geonamesid = $matches[1];
}
$geonamesid = rtrim($geonamesid, '/');
--- a/server/src/config/constants.php Fri Sep 30 00:43:04 2016 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-<?php
-
-return [
-
- 'VERSION' => [0,0,0,'alpha',1],
-
-];
--- a/server/src/config/corpusparole.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/config/corpusparole.php Sun Oct 02 11:49:00 2016 +0200
@@ -125,8 +125,11 @@
'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
'geonames_base_url' => 'http://sws.geonames.org/',
+ 'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/',
'geonames_cache_expiration' => 60*24*30,
'geonames_max_ids' => 500,
+ 'geonames_hierarchy_webservice_url' => 'http://api.geonames.org/hierarchyJSON',
+ 'geonames_username' => env('GEONAMES_USERNAME'),
'bo_client_environment' => [
"modulePrefix" => "bo-client",
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/database/migrations/2016_09_30_132045_create_geonames_hierarchies_table.php Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,33 @@
+<?php
+
+use Illuminate\Support\Facades\Schema;
+use Illuminate\Database\Schema\Blueprint;
+use Illuminate\Database\Migrations\Migration;
+
+class CreateGeonamesHierarchiesTable extends Migration
+{
+    /**
+     * Create the geonames_hierarchies table.
+     *
+     * @return void
+     */
+    public function up()
+    {
+        Schema::create('geonames_hierarchies', function (Blueprint $table) {
+            $table->increments('id');
+            $table->timestamps();
+            $table->string('geonamesid')->unique(); // at most one row per geonames id
+            $table->json('hierarchy'); // cast to an array by the GeonamesHierarchy model
+        });
+    }
+
+    /**
+     * Drop the geonames_hierarchies table if it exists.
+     *
+     * @return void
+     */
+    public function down()
+    {
+        Schema::dropIfExists('geonames_hierarchies');
+    }
+}
--- a/server/src/routes/api.php Fri Sep 30 00:43:04 2016 +0200
+++ b/server/src/routes/api.php Sun Oct 02 11:49:00 2016 +0200
@@ -40,5 +40,7 @@
['only' => ['index']]);
Route::resource('datestats', 'Api\DateStatsController',
['only' => ['index']]);
+ Route::resource('geostats', 'Api\GeoStatsController',
+ ['only' => ['index']]);
});
});
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/tests/Controllers/GeoStatsControllerTest.php Sun Oct 02 11:49:00 2016 +0200
@@ -0,0 +1,65 @@
+<?php
+
+use Es;
+
+class GeoStatsControllerTest extends TestCase
+{
+    /**
+     * GET /api/v1/stats/geostats/ must send the expected terms aggregation to
+     * Elasticsearch (mocked here) and map each bucket to "geonames id => doc count".
+     */
+    public function testGetIndex()
+    {
+        $expectedQuery = [
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'body' => [
+                "size" => 0,
+                "aggs" => [
+                    "geos" => [ "terms" => [ "size" => 0, "field" => "geonames_hierarchy" ] ]
+                ]
+            ]
+        ];
+
+        Es::shouldReceive('search')
+            ->once()
+            ->with($expectedQuery)
+            ->andReturn(json_decode("{
+ \"took\" : 17,
+ \"timed_out\" : false,
+ \"_shards\" : {
+ \"total\" : 1,
+ \"successful\" : 1,
+ \"failed\" : 0
+ },
+ \"hits\" : {
+ \"total\" : 3011,
+ \"max_score\" : 0.0,
+ \"hits\" : [ ]
+ },
+ \"aggregations\" : {
+ \"geos\" : {
+ \"doc_count_error_upper_bound\" : 0,
+ \"sum_other_doc_count\" : 0,
+ \"buckets\" : [ {
+ \"key\" : 6255148,
+ \"doc_count\" : 2684
+ }, {
+ \"key\" : 3017382,
+ \"doc_count\" : 2674
+ }, {
+ \"key\" : 3027939,
+ \"doc_count\" : 851
+ } ]
+ }
+ }
+}", true));
+
+        $this->get('/api/v1/stats/geostats/');
+        $this->assertTrue($this->response->isOk(), $this->response->content());
+        $this->seeJsonEquals(["geostats" => [
+            '6255148' => 2684,
+            '3017382' => 2674,
+            '3027939' => 851
+        ]]);
+    }
+}