--- a/dev/provisioning/modules/sysconfig/manifests/php.pp Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/manifests/php.pp Sun Oct 09 11:44:18 2016 +0200
@@ -29,7 +29,7 @@
augeas { "php_ini/memory_limit":
lens => "PHP.lns",
incl => "/opt/remi/php56/root/etc/php.ini",
- changes => "set PHP/memory_limit 128M",
+ changes => "set PHP/memory_limit 512M",
notify => Service['httpd'],
require => Package["php56"]
}->
--- a/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb Sun Oct 09 11:44:18 2016 +0200
@@ -4,7 +4,7 @@
SetEnv APP_URL http://<%= @vhost %>/corpus-back/
SetEnv DATABASE_DRIVER mysql
-SetEnv DB_HOST localhost
+SetEnv DB_HOST <%= @db_host %>
SetEnv DB_DATABASE <%= @db_name %>
SetEnv DB_USERNAME <%= @db_user %>
SetEnv DB_PASSWORD <%= @db_pw %>
@@ -21,9 +21,41 @@
SetEnv CORPUSPAROLE_COCOON_RDF_BASE_URI http://cocoon.huma-num.fr/exist/crdo/rdf/
SetEnv CORPUSPAROLE_COCOON_OAIPMH_URL http://cocoon.huma-num.fr/crdo_servlet/oai-pmh
-SetEnv CORPUSPAROLE_SESAME_BASE_URL http://127.0.0.1:8080/openrdf-sesame
+SetEnv CORPUSPAROLE_SESAME_BASE_URL http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame
SetEnv CORPUSPAROLE_SESAME_REPOSITORY corpus
SetEnv CORPUSPAROLE_SESAME_REPOSITORY_RAW corpus_raw
SetEnv CORPUSPAROLE_SESAME_LEXVO_REPOSITORY lexvo
SetEnv EASYRDF_HTTP_CLIENT_TIMEOUT 5000
+
+SetEnv ELASTICSEARCH_URL <%= @elasticsearch_host %>:<%= @elasticsearch_port %>
+SetEnv ELASTICSEARCH_LOG_PATH 'logs/elasticsearch.log'
+SetEnv ELASTICSEARCH_INDEX 'corpus'
+SetEnv ELASTICSEARCH_SHARDS 1
+SetEnv ELASTICSEARCH_REPLICAS 1
+
+SetEnv HANDLE_HOST <%= @handle_host %>
+SetEnv HANDLE_PORT <%= @handle_port %>
+SetEnv HANDLE_PREFIX <%= @handle_prefix %>
+SetEnv HANDLE_ADMIN_ID <%= @handle_admin_id %>
+
+SetEnv HANDLE_CERT_OR_PKEY "<%= @handle_cert_or_pkey %>"
+SetEnv HANDLE_PASSWORD <%= @handle_password %>
+
+SetEnv HANDLE_TEST_PREFIX <%= @handle_test_prefix %>
+
+
+SetEnv HANDLE_TEST_CERT "corpusadmpkeycrt.pem"
+SetEnv HANDLE_TEST_CERT_PASSWORD NULL
+SetEnv HANDLE_TEST_CERT_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN"
+
+SetEnv HANDLE_TEST_RSA_KEY "corpusadmpriv.pem"
+SetEnv HANDLE_TEST_RSA_PASSWORD NULL
+SetEnv HANDLE_TEST_RSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA"
+
+
+SetEnv HANDLE_TEST_DSA_KEY "corpusadmdsapriv.pem"
+SetEnv HANDLE_TEST_DSA_PASSWORD NULL
+SetEnv HANDLE_TEST_DSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA"
+
+SetEnv GEONAMES_USERNAME "<%= @geonames_username %>"
--- a/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb Sun Oct 09 11:44:18 2016 +0200
@@ -4,7 +4,7 @@
APP_URL=http://<%= @vhost %>/corpus-back/
DATABASE_DRIVER=mysql
-DB_HOST=localhost
+DB_HOST=<%= @db_host %>
DB_DATABASE=<%= @db_name %>
DB_USERNAME=<%= @db_user %>
DB_PASSWORD=<%= @db_pw %>
@@ -21,9 +21,42 @@
CORPUSPAROLE_COCOON_RDF_BASE_URI=http://cocoon.huma-num.fr/exist/crdo/rdf/
CORPUSPAROLE_COCOON_OAIPMH_URL=http://cocoon.huma-num.fr/crdo_servlet/oai-pmh
-CORPUSPAROLE_SESAME_BASE_URL=http://172.16.1.6:8080/openrdf-sesame/
+CORPUSPAROLE_SESAME_BASE_URL=http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame
CORPUSPAROLE_SESAME_REPOSITORY=corpus
CORPUSPAROLE_SESAME_REPOSITORY_RAW=corpus_raw
CORPUSPAROLE_SESAME_LEXVO_REPOSITORY=lexvo
EASYRDF_HTTP_CLIENT_TIMEOUT=5000
+
+ELASTICSEARCH_URL=<%= @elasticsearch_host %>:<%= @elasticsearch_port %>
+ELASTICSEARCH_LOG_PATH='logs/elasticsearch.log'
+ELASTICSEARCH_INDEX='corpus'
+ELASTICSEARCH_SHARDS=1
+ELASTICSEARCH_REPLICAS=1
+
+HANDLE_HOST=<%= @handle_host %>
+HANDLE_PORT=<%= @handle_port %>
+HANDLE_PREFIX=<%= @handle_prefix %>
+HANDLE_ADMIN_ID=<%= @handle_admin_id %>
+
+HANDLE_CERT_OR_PKEY="<%= @handle_cert_or_pkey %>"
+HANDLE_PASSWORD=<%= @handle_password %>
+
+HANDLE_TEST_PREFIX=<%= @handle_test_prefix %>
+
+
+HANDLE_TEST_CERT="corpusadmpkeycrt.pem"
+HANDLE_TEST_CERT_PASSWORD=NULL
+HANDLE_TEST_CERT_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN"
+
+HANDLE_TEST_RSA_KEY="corpusadmpriv.pem"
+HANDLE_TEST_RSA_PASSWORD=NULL
+HANDLE_TEST_RSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA"
+
+
+HANDLE_TEST_DSA_KEY="corpusadmdsapriv.pem"
+HANDLE_TEST_DSA_PASSWORD=NULL
+HANDLE_TEST_DSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA"
+
+GEONAMES_USERNAME="<%= @geonames_username %>"
+
--- a/server/src/app/Console/Commands/IndexDocuments.php Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 09 11:44:18 2016 +0200
@@ -2,11 +2,19 @@
namespace CorpusParole\Console\Commands;
+
+
use Illuminate\Console\Command;
use EasyRdf\Resource;
use EasyRdf\Literal;
+use EasyRdf\Graph;
+
+use Carbon\Carbon;
use GuzzleHttp\Client;
+use GuzzleHttp\Exception\TransferException;
+use GuzzleHttp\Psr7;
+
use CorpusParole\Libraries\Utils;
use CorpusParole\Repositories\DocumentRepository;
use CorpusParole\Libraries\CocoonUtils;
@@ -14,6 +22,8 @@
use CorpusParole\Services\BnfResolverInterface;
use CorpusParole\Services\LexvoResolverInterface;
use Es;
+use Log;
+use Cache;
class IndexDocuments extends Command
{
@@ -96,6 +106,9 @@
'date' => [ 'type' => 'date' ],
'geonames_hyerarchy' => [ 'type' => 'string' ],
'location' => [ 'type' => 'geo_point' ],
+ 'creation_date' => ['type' => 'date'],
+ 'language' => ['type' => 'string'],
+ 'discourse_types' => ['type' => 'string'],
'subject' => [
'type' => 'nested',
'properties' => [
@@ -118,7 +131,7 @@
private function getGeonamesHierarchyArray($geonamesid) {
- // TODO: Manage this cache !!!
+
$hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
if(is_null($hcache)) {
@@ -146,7 +159,6 @@
array_push($res, $hierarchyElem['geonameId']);
}
}
-
return $res;
}
@@ -210,7 +222,6 @@
'type' => 'txt'
]);
}
-
return $res;
}, []);
@@ -248,6 +259,227 @@
);
}
+ // Reads the WGS84 latitude and longitude literals attached to $loc in $graph.
+ // Returns [lat, long], or null as soon as one literal is missing or empty().
+ private function graphResolvCoordinate($loc, $graph) {
+     $predicates = [
+         "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>",
+         "<http://www.w3.org/2003/01/geo/wgs84_pos#long>",
+     ];
+     $pair = [];
+     foreach($predicates as $predicate) {
+         $literal = $graph->getLiteral($loc, $predicate);
+         if(is_null($literal) || empty($literal->getValue())) { return null; }
+         $pair[] = $literal->getValue();
+     }
+     return $pair;
+ }
+
+ // Downloads $url and parses the body as an EasyRdf Graph of format $type; returns null (after reporting) on failure.
+ // The EasyRdf exception classes are fully qualified so they resolve whatever this file's use statements are.
+ private function loadGraph($url, $type) {
+     try {
+         $r = $this->httpClient->get($url);
+     } catch (TransferException $e) {
+         $this->error("loadGraph : Error Loading $url");
+         Log::error("loadGraph : Error Loading $url");
+         Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
+         if ($e->hasResponse()) {
+             $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+             Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+         }
+         return null;
+     }
+     $message = (string)$r->getBody();
+     try {
+         return new Graph($url, $message, $type);
+     } catch (\EasyRdf\Exception $e) {
+         $this->error("loadGraph : Error parsing $url");
+         Log::error("loadGraph : Error parsing $url");
+         if($e instanceof \EasyRdf\Parser\Exception) {
+             Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
+         }
+         $this->error("loadGraph : Error exception message ".$e->getMessage());
+         Log::error("loadGraph : Error exception message ".$e->getMessage());
+         Log::error("loadGraph : Error content $message");
+         return null;
+     }
+ }
+
+ // Coordinates of a geonames location, read from "{$loc}about.rdf"; the outcome (false for a miss) is cached 20 minutes.
+ private function geonamesResolveCoordinates($loc) {
+     $cached = cache("corpus.geonames.coord.$loc");
+     if(!is_null($cached)) { return ($cached === false) ? null : $cached; }
+     $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
+     $fresh = is_null($graph) ? null : $this->graphResolvCoordinate($loc, $graph);
+     cache(["corpus.geonames.coord.$loc" => is_null($fresh) ? false : $fresh], Carbon::now()->addMinutes(20));
+     return $fresh;
+ }
+
+ // Coordinates of a dbpedia location, read from "$loc.rdf"; the outcome (false for a miss) is cached 20 minutes.
+ private function dbpediaResolveCoordinates($loc) {
+     $known = cache("corpus.dbpedia.coord.$loc");
+     if(!is_null($known)) { return ($known === false) ? null : $known; }
+     $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
+     $found = is_null($graph) ? null : $this->graphResolvCoordinate($loc, $graph);
+     cache(["corpus.dbpedia.coord.$loc" => is_null($found) ? false : $found], Carbon::now()->addMinutes(20));
+     return $found;
+ }
+
+ // Returns the [latitude, longitude] floats to index for $doc, or null when none is usable.
+ // The first referenced geonames/dbpedia location that resolves wins; otherwise the
+ // latitude/longitude values carried by the document's geo info are used.
+ private function getLocation($doc) {
+
+     $geoRes = $doc->getGeoInfo();
+
+     if(is_null($geoRes)) {
+         return null;
+     }
+
+     // Bucket the referenced locations by the resolver able to dereference them.
+     $locUrls = [];
+     foreach($geoRes->getRefLocs() as $loc) {
+         if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
+             $locUrls['geonames'][] = "http://sws.geonames.org/$m[1]/";
+         } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
+             $locUrls['dbpedia'][] = "http://$md[1]/data/$md[4]";
+         }
+     }
+
+     // Try every candidate, one resolver type after the other, until one answers.
+     $coordinates = null;
+     foreach($locUrls as $locType => $locList) {
+         foreach($locList as $locationUrl) {
+             $coordinates = call_user_func([$this, "{$locType}ResolveCoordinates"], $locationUrl);
+             if(!is_null($coordinates)) {
+                 // Leave both loops: a plain break only ends the inner one, and the
+                 // next resolver type would then overwrite what was just found.
+                 break 2;
+             }
+         }
+     }
+
+     // Nothing could be dereferenced: use the coordinates stored with the document.
+     if(is_null($coordinates)) {
+         $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
+     }
+
+     // NOTE(review): empty() also rejects a 0 coordinate (equator, Greenwich meridian) - confirm this is intended.
+     if(empty($coordinates[0]) || empty($coordinates[1])) {
+         return null;
+     } else {
+         return [floatval($coordinates[0]), floatval($coordinates[1])];
+     }
+
+ }
+
+ // Converts the dcterms:created literal of $doc into the value indexed as creation_date:
+ // the processPeriod() result for a dcterms:Period, the processDate() result for W3CDTF, null otherwise.
+ private function getCreationDate($doc) {
+
+     $created = $doc->getCreated();
+     if(is_null($created)) {
+         return null;
+     }
+
+     $datatype = $created->getDatatypeUri();
+     if($datatype === "http://purl.org/dc/terms/Period") {
+         return $this->processPeriod($created->getValue());
+     }
+     if($datatype === "http://purl.org/dc/terms/W3CDTF") {
+         return $this->processDate($created->getValue());
+     }
+
+     return null;
+ }
+
+ // Parses $dateStr with date_create(); a bare 4-digit year is first turned into "<year>-1-1".
+ // Returns null and logs a warning (now naming this method, not DateStatsController) when parsing fails.
+ private function extractDate($dateStr) {
+     if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
+         $dateStr = "$dateStr-1-1";
+     }
+     $date = date_create($dateStr);
+     if($date !== false) { return $date; }
+     Log::warning("IndexDocuments:extractDate bad format for date $dateStr");
+     return null;
+ }
+
+ // Expands a period string made of ";"-separated parts, among which "start=<date>"
+ // and "end=<date>", into one W3C-formatted date per year from start to end included.
+ // Returns null when a bound is missing or unparsable, or when start is after end.
+ private function processPeriod($periodStr) {
+
+     $bounds = ['start' => null, 'end' => null];
+     foreach(explode(";", $periodStr) as $elem) {
+         $elem = trim($elem);
+         foreach(array_keys($bounds) as $name) {
+             if(strpos($elem, "$name=") !== 0) {
+                 continue;
+             }
+             $date = $this->extractDate(trim(substr($elem, strlen($name) + 1)));
+             if(is_null($date)) {
+                 return null;
+             }
+             // intval() never yields false, so the year needs no further check.
+             $bounds[$name] = intval($date->format("Y"));
+         }
+     }
+
+     $start = $bounds['start'];
+     $end = $bounds['end'];
+
+     if(is_null($start) || is_null($end) || $start>$end ) {
+         Log::warning("Bad format for $periodStr");
+         return null;
+     }
+
+     return array_map(function($y) {
+         // "!" resets the fields absent from the format to the Unix epoch; without
+         // it month, day and time are taken from the clock at indexing time.
+         return \DateTime::createFromFormat("!Y", "$y")->format(\DateTime::W3C);
+     }, range($start, $end));
+ }
+
+ // Formats $dateStr as a W3C date string, or returns null when extractDate() cannot parse it.
+ private function processDate($dateStr) {
+     $parsed = $this->extractDate($dateStr);
+     if(!is_null($parsed)) {
+         return $parsed->format(\DateTime::W3C);
+     }
+     return null;
+ }
+
+ // Flattens the discourse types of $doc into a list of strings: the URI of a
+ // Resource, or the value of a Literal prefixed by "<datatype>#" when it is typed.
+ // Entries of any other kind, or yielding an empty() string, are left out.
+ private function getDiscourseTypes($doc) {
+     return array_reduce($doc->getDiscourseTypes(), function($acc, $item) {
+         $label = null;
+         if($item instanceof Resource) {
+             $label = $item->getUri();
+         } elseif($item instanceof Literal) {
+             $datatype = $item->getDatatypeURI();
+             $label = (empty($datatype) ? "" : "$datatype#").$item->getValue();
+         }
+         return empty($label) ? $acc : array_merge($acc, [$label]);
+     }, []);
+ }
+
+ // Assembles the body indexed in Elasticsearch for $doc (used by both the single and the bulk indexing paths).
+ // NOTE(review): the mapping context in this patch spells the field 'geonames_hyerarchy' - confirm which key is intended.
+ private function getDocBody($doc) {
+     $body = ['title' => (string)$doc->getTitle(), 'date' => (string)$doc->getModified()];
+     $body['location'] = $this->getLocation($doc);
+     $body['creation_date'] = $this->getCreationDate($doc);
+     $body['language'] = $doc->getLanguageValue();
+     $body['discourse_types'] = $this->getDiscourseTypes($doc);
+     $body['geonames_hierarchy'] = $this->getGeonamesHierarchy($doc);
+     $body['subject'] = $this->getSubjects($doc);
+     return $body;
+ }
+
/**
* Index one document into Elasticsearch
*
@@ -260,12 +492,7 @@
'index' => config('elasticsearch.index'),
'type' => 'document',
'id' => (string)$doc->getId(),
- 'body' => [
- 'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified(),
- 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
- 'subject' => $this->getSubjects($doc)
- ]
+ 'body' => $this->getDocBody($doc)
];
Es::index($query_data);
}
@@ -287,12 +514,7 @@
'_id' => (string)$doc->getId()
]
];
- $query_data['body'][] = [
- 'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified(),
- 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
- 'subject' => $this->getSubjects($doc)
- ];
+ $query_data['body'][] = $this->getDocBody($doc);
}
Es::bulk($query_data);
}
@@ -367,16 +589,16 @@
if ($page==$lastPage && $i>=$lastPageEntryCount){
break;
}
- $this->indexOne($doc);
$progressBar->setMessage($doc->getId());
$progressBar->advance();
+ $this->indexOne($doc);
}
}
else
{
- $this->indexBulk($docs);
$progressBar->setMessage('Page '.$page);
$progressBar->advance();
+ $this->indexBulk($docs);
}
}
$progressBar->finish();
--- a/server/src/app/Models/DocumentBase.php Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Models/DocumentBase.php Sun Oct 09 11:44:18 2016 +0200
@@ -28,8 +28,9 @@
private $title = false;
private $lang = null;
private $langResolved = null;
- private $issued = null;
- private $modified = null;
+ private $issued = false;
+ private $modified = false;
+ private $created = false;
public function getProvidedCHO() {
@@ -44,8 +45,9 @@
$this->title = false;
$this->lang = null;
$this->langResolved = null;
- $this->issued = null;
- $this->modified = null;
+ $this->issued = false;
+ $this->modified = false;
+ $this->created = false;
}
public function getId() {
@@ -125,7 +127,7 @@
}
public function getIssued() {
- if(is_null($this->issued)) {
+ if($this->issued === false) {
try {
$this->issued = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/issued>");
} catch(\Exception $e) {
@@ -140,8 +142,19 @@
return is_null($issued)?null:$issued->getValue();
}
+ // Lazily reads the dcterms:created literal: false means "not looked up yet", null is stored when the lookup throws.
+ public function getCreated() {
+     if($this->created !== false) { return $this->created; }
+     try {
+         $this->created = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/created>");
+     } catch(\Exception $e) {
+         $this->created = null;
+     }
+     return $this->created;
+ }
+
public function getModified() {
- if(is_null($this->modified)) {
+ if($this->modified === false) {
try {
$this->modified = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/modified>");
if(is_null($this->modified)) {
--- a/server/src/config/corpusparole.php Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/config/corpusparole.php Sun Oct 09 11:44:18 2016 +0200
@@ -132,6 +132,8 @@
'bnf_query_url' => 'http://data.bnf.fr/sparql',
'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
+ 'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/',
+
'geonames_base_url' => 'http://sws.geonames.org/',
'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/',
'geonames_cache_expiration' => 60*24*30,