# HG changeset patch # User ymh # Date 1476006258 -7200 # Node ID 31a4987f6017d1b2e07d3d480aafe1baa45ac18b # Parent 92fc9d077f9538cb56281857b8e4f1c24d135d04 Add fields to document index diff -r 92fc9d077f95 -r 31a4987f6017 dev/provisioning/modules/sysconfig/manifests/php.pp --- a/dev/provisioning/modules/sysconfig/manifests/php.pp Fri Oct 07 02:07:34 2016 +0200 +++ b/dev/provisioning/modules/sysconfig/manifests/php.pp Sun Oct 09 11:44:18 2016 +0200 @@ -29,7 +29,7 @@ augeas { "php_ini/memory_limit": lens => "PHP.lns", incl => "/opt/remi/php56/root/etc/php.ini", - changes => "set PHP/memory_limit 128M", + changes => "set PHP/memory_limit 512M", notify => Service['httpd'], require => Package["php56"] }-> diff -r 92fc9d077f95 -r 31a4987f6017 dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb --- a/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb Fri Oct 07 02:07:34 2016 +0200 +++ b/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb Sun Oct 09 11:44:18 2016 +0200 @@ -4,7 +4,7 @@ SetEnv APP_URL http://<%= @vhost %>/corpus-back/ SetEnv DATABASE_DRIVER mysql -SetEnv DB_HOST localhost +SetEnv DB_HOST <%= @db_host %> SetEnv DB_DATABASE <%= @db_name %> SetEnv DB_USERNAME <%= @db_user %> SetEnv DB_PASSWORD <%= @db_pw %> @@ -21,9 +21,41 @@ SetEnv CORPUSPAROLE_COCOON_RDF_BASE_URI http://cocoon.huma-num.fr/exist/crdo/rdf/ SetEnv CORPUSPAROLE_COCOON_OAIPMH_URL http://cocoon.huma-num.fr/crdo_servlet/oai-pmh -SetEnv CORPUSPAROLE_SESAME_BASE_URL http://127.0.0.1:8080/openrdf-sesame +SetEnv CORPUSPAROLE_SESAME_BASE_URL http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame SetEnv CORPUSPAROLE_SESAME_REPOSITORY corpus SetEnv CORPUSPAROLE_SESAME_REPOSITORY_RAW corpus_raw SetEnv CORPUSPAROLE_SESAME_LEXVO_REPOSITORY lexvo SetEnv EASYRDF_HTTP_CLIENT_TIMEOUT 5000 + +SetEnv ELASTICSEARCH_URL <%= @elasticsearch_host %>:<%= @elasticsearch_port %> +SetEnv ELASTICSEARCH_LOG_PATH 'logs/elasticsearch.log' +SetEnv 
ELASTICSEARCH_INDEX 'corpus' +SetEnv ELASTICSEARCH_SHARDS 1 +SetEnv ELASTICSEARCH_REPLICAS 1 + +SetEnv HANDLE_HOST <%= @handle_host %> +SetEnv HANDLE_PORT <%= @handle_port %> +SetEnv HANDLE_PREFIX <%= @handle_prefix %> +SetEnv HANDLE_ADMIN_ID <%= @handle_admin_id %> + +SetEnv HANDLE_CERT_OR_PKEY "<%= @handle_cert_or_pkey %>" +SetEnv HANDLE_PASSWORD <%= @handle_password %> + +SetEnv HANDLE_TEST_PREFIX <%= @handle_test_prefix %> + + +SetEnv HANDLE_TEST_CERT "corpusadmpkeycrt.pem" +SetEnv HANDLE_TEST_CERT_PASSWORD NULL +SetEnv HANDLE_TEST_CERT_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN" + +SetEnv HANDLE_TEST_RSA_KEY "corpusadmpriv.pem" +SetEnv HANDLE_TEST_RSA_PASSWORD NULL +SetEnv HANDLE_TEST_RSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA" + + +SetEnv HANDLE_TEST_DSA_KEY "corpusadmdsapriv.pem" +SetEnv HANDLE_TEST_DSA_PASSWORD NULL +SetEnv HANDLE_TEST_DSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA" + +SetEnv GEONAMES_USERNAME "<%= @geonames_username %>" diff -r 92fc9d077f95 -r 31a4987f6017 dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb --- a/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb Fri Oct 07 02:07:34 2016 +0200 +++ b/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb Sun Oct 09 11:44:18 2016 +0200 @@ -4,7 +4,7 @@ APP_URL=http://<%= @vhost %>/corpus-back/ DATABASE_DRIVER=mysql -DB_HOST=localhost +DB_HOST=<%= @db_host %> DB_DATABASE=<%= @db_name %> DB_USERNAME=<%= @db_user %> DB_PASSWORD=<%= @db_pw %> @@ -21,9 +21,42 @@ CORPUSPAROLE_COCOON_RDF_BASE_URI=http://cocoon.huma-num.fr/exist/crdo/rdf/ CORPUSPAROLE_COCOON_OAIPMH_URL=http://cocoon.huma-num.fr/crdo_servlet/oai-pmh -CORPUSPAROLE_SESAME_BASE_URL=http://172.16.1.6:8080/openrdf-sesame/ +CORPUSPAROLE_SESAME_BASE_URL=http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame CORPUSPAROLE_SESAME_REPOSITORY=corpus CORPUSPAROLE_SESAME_REPOSITORY_RAW=corpus_raw CORPUSPAROLE_SESAME_LEXVO_REPOSITORY=lexvo 
EASYRDF_HTTP_CLIENT_TIMEOUT=5000 + +ELASTICSEARCH_URL=<%= @elasticsearch_host %>:<%= @elasticsearch_port %> +ELASTICSEARCH_LOG_PATH='logs/elasticsearch.log' +ELASTICSEARCH_INDEX='corpus' +ELASTICSEARCH_SHARDS=1 +ELASTICSEARCH_REPLICAS=1 + +HANDLE_HOST=<%= @handle_host %> +HANDLE_PORT=<%= @handle_port %> +HANDLE_PREFIX=<%= @handle_prefix %> +HANDLE_ADMIN_ID=<%= @handle_admin_id %> + +HANDLE_CERT_OR_PKEY="<%= @handle_cert_or_pkey %>" +HANDLE_PASSWORD=<%= @handle_password %> + +HANDLE_TEST_PREFIX=<%= @handle_test_prefix %> + + +HANDLE_TEST_CERT="corpusadmpkeycrt.pem" +HANDLE_TEST_CERT_PASSWORD=NULL +HANDLE_TEST_CERT_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN" + +HANDLE_TEST_RSA_KEY="corpusadmpriv.pem" +HANDLE_TEST_RSA_PASSWORD=NULL +HANDLE_TEST_RSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA" + + +HANDLE_TEST_DSA_KEY="corpusadmdsapriv.pem" +HANDLE_TEST_DSA_PASSWORD=NULL +HANDLE_TEST_DSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA" + +GEONAMES_USERNAME="<%= @geonames_username %>" + diff -r 92fc9d077f95 -r 31a4987f6017 server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Fri Oct 07 02:07:34 2016 +0200 +++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 09 11:44:18 2016 +0200 @@ -2,11 +2,19 @@ namespace CorpusParole\Console\Commands; + + use Illuminate\Console\Command; use EasyRdf\Resource; use EasyRdf\Literal; +use EasyRdf\Graph; + +use Carbon\Carbon; use GuzzleHttp\Client; +use GuzzleHttp\Exception\TransferException; +use GuzzleHttp\Psr7; + use CorpusParole\Libraries\Utils; use CorpusParole\Repositories\DocumentRepository; use CorpusParole\Libraries\CocoonUtils; @@ -14,6 +22,8 @@ use CorpusParole\Services\BnfResolverInterface; use CorpusParole\Services\LexvoResolverInterface; use Es; +use Log; +use Cache; class IndexDocuments extends Command { @@ -96,6 +106,9 @@ 'date' => [ 'type' => 'date' ], 'geonames_hyerarchy' => [ 'type' => 'string' ], 'location' 
=> [ 'type' => 'geo_point' ], + 'creation_date' => ['type' => 'date'], + 'language' => ['type' => 'string'], + 'discourse_types' => ['type' => 'string'], 'subject' => [ 'type' => 'nested', 'properties' => [ @@ -118,7 +131,7 @@ private function getGeonamesHierarchyArray($geonamesid) { - // TODO: Manage this cache !!! + $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first(); if(is_null($hcache)) { @@ -146,7 +159,6 @@ array_push($res, $hierarchyElem['geonameId']); } } - return $res; } @@ -210,7 +222,6 @@ 'type' => 'txt' ]); } - return $res; }, []); @@ -248,6 +259,227 @@ ); } + private function graphResolvCoordinate($loc, $graph) { /* returns [lat, long] read from $graph for resource $loc, or null when either literal is missing or empty */ + $latLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>"); /* NOTE(review): WGS84 lat/long property URIs were blank in this patch text and are assumed here - confirm */ + if(is_null($latLit) || empty($latLit->getValue())) { + return null; + } + $lat = $latLit->getValue(); + + $longLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#long>"); + if(is_null($longLit) || empty($longLit->getValue())) { + return null; + } + $long = $longLit->getValue(); + + return [ $lat, $long ]; + } + + private function loadGraph($url, $type) { + try { + $r = $this->httpClient->get($url); + } catch (TransferException $e) { + $this->error("loadGraph : Error Loading $url"); + Log::error("loadGraph : Error Loading $url"); + Log::error("loadGraph : Error request " . Psr7\str($e->getRequest())); + if ($e->hasResponse()) { + $this->error("loadGraph : Error response " . Psr7\str($e->getResponse())); + Log::error("loadGraph : Error response " . 
Psr7\str($e->getResponse())); + } + return null; + } + try { + $message = (string)$r->getBody(); + $graph = new Graph($url, $message, $type); + return $graph; + } catch (\EasyRdf\Exception $e) { /* fully qualified: a relative EasyRdf\... name would resolve inside CorpusParole\Console\Commands and never match */ + $this->error("loadGraph : Error parsing $url"); + Log::error("loadGraph : Error parsing $url"); + if($e instanceof \EasyRdf\Parser\Exception) { + Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn()); /* position in the parsed document, not the PHP throw site; NOTE(review): confirm accessors against the installed EasyRdf version */ + } + $this->error("loadGraph : Error exception message ".$e->getMessage()); + Log::error("loadGraph : Error exception message ".$e->getMessage()); + Log::error("loadGraph : Error content $message"); + return null; + } + + } + + private function geonamesResolveCoordinates($loc) { + $coords = cache("corpus.geonames.coord.$loc"); + if(is_null($coords)) { + $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml'); + $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph); + cache(["corpus.geonames.coord.$loc" => is_null($coords)?false:$coords], Carbon::now()->addMinutes(20)); + } + return ($coords===false)?null:$coords; + } + + private function dbpediaResolveCoordinates($loc) { + $coords = cache("corpus.dbpedia.coord.$loc"); + if(is_null($coords)) { + $graph = $this->loadGraph("$loc.rdf", 'rdfxml'); + $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph); + cache(["corpus.dbpedia.coord.$loc"=> is_null($coords)?false:$coords], Carbon::now()->addMinutes(20)); + } + return ($coords===false)?null:$coords; + } + + private function getLocation($doc) { + + $geoRes = $doc->getGeoInfo(); + + if(is_null($geoRes)) { + return null; + } + + $locUrls = []; + foreach($geoRes->getRefLocs() as $loc) { + if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) { + + if(!array_key_exists('geonames', $locUrls)) { + $locUrls['geonames'] = []; + } + array_push($locUrls['geonames'], "http://sws.geonames.org/$m[1]/"); + + } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) { + 
if(!array_key_exists('dbpedia', $locUrls)) { + $locUrls['dbpedia'] = []; + } + //$this->line("DBPEDIA MATCH $loc ".print_r($md,true)); + array_push($locUrls['dbpedia'], "http://$md[1]/data/$md[4]"); + } + } + + $coordinates = null; + foreach($locUrls as $locType => $locList) { + foreach($locList as $locationUrl) { + $coordinates = call_user_func([$this, "{$locType}ResolveCoordinates"], $locationUrl); + if(!is_null($coordinates)) { + break 2; /* leave both loops: a later URL type must not overwrite coordinates that were already resolved */ + } + } + } + + if(is_null($coordinates)) { + $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()]; + } + + if(empty($coordinates[0]) || empty($coordinates[1])) { + return null; + } else { + return [floatval($coordinates[0]), floatval($coordinates[1])]; + } + + } + + private function getCreationDate($doc) { + + $created = $doc->getCreated(); + if(is_null($created)) { + return null; + } + $dateType = $created->getDatatypeUri(); + $res = null; + + if($dateType === "http://purl.org/dc/terms/Period") { + $res = $this->processPeriod($created->getValue()); + } + elseif($dateType === "http://purl.org/dc/terms/W3CDTF") { + $res = $this->processDate($created->getValue()); + } + + return $res; + + } + + private function extractDate($dateStr) { + if(preg_match("/^\\d{4}$/", $dateStr) === 1) { + $dateStr = "$dateStr-1-1"; + } + $date = date_create($dateStr); + if($date === false ) { + Log::warning("IndexDocuments:extractDate bad format for date $dateStr"); + return null; + } + return $date; + } + + private function processPeriod($periodStr) { + $start = null; + $end = null; + foreach(explode(";", $periodStr) as $elem) { + $elem = trim($elem); + if(strpos($elem, 'start=') === 0) { + $startDate = $this->extractDate(trim(substr($elem, 6))); + if(is_null($startDate)) { + return null; + } + $start = intval($startDate->format("Y")); + if($start === false) { + return null; + } + } elseif(strpos($elem, 'end=') === 0) { + $endDate = $this->extractDate(trim(substr($elem, 4))); + if(is_null($endDate)) { + return null; + } + 
$end = intval($endDate->format("Y")); + if($end === false) { + return null; + } + } + } + + if(is_null($start) || is_null($end) || $start>$end ) { + Log::warning("Bad format for $periodStr"); + return null; + } + + return array_map(function($y) { + return \DateTime::createFromFormat("!Y", "$y")->format(\DateTime::W3C); /* "!" resets the unspecified fields, so each year maps to Jan 1st 00:00:00 instead of inheriting the current month, day and time */ + }, range($start, $end)); + } + + private function processDate($dateStr) { + $date = $this->extractDate($dateStr); + if(is_null($date)) { + return null; + } else { + return $date->format(\DateTime::W3C); + } + } + + private function getDiscourseTypes($doc) { + return array_reduce($doc->getDiscourseTypes(), function($res, $d) { + $val = null; + if($d instanceof Resource) { + $val = $d->getUri(); + } elseif($d instanceof Literal) { + $datatype = $d->getDatatypeURI(); + $val = (!empty($datatype)?"$datatype#":"").$d->getValue(); + } + if(!empty($val)) { + array_push($res,$val); + } + return $res; + }, []); + } + + private function getDocBody($doc) { + return [ + 'title' => (string)$doc->getTitle(), + 'date' => (string)$doc->getModified(), + 'location' => $this->getLocation($doc), + 'creation_date' => $this->getCreationDate($doc), + 'language' => $doc->getLanguageValue(), + 'discourse_types' => $this->getDiscourseTypes($doc), + 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), + 'subject' => $this->getSubjects($doc), + ]; + } + /** * Index one document into Elasticsearch * @@ -260,12 +492,7 @@ 'index' => config('elasticsearch.index'), 'type' => 'document', 'id' => (string)$doc->getId(), - 'body' => [ - 'title' => (string)$doc->getTitle(), - 'date' => (string)$doc->getModified(), - 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), - 'subject' => $this->getSubjects($doc) - ] + 'body' => $this->getDocBody($doc) ]; Es::index($query_data); } @@ -287,12 +514,7 @@ '_id' => (string)$doc->getId() ] ]; - $query_data['body'][] = [ - 'title' => (string)$doc->getTitle(), - 'date' => (string)$doc->getModified(), - 'geonames_hierarchy' => 
$this->getGeonamesHierarchy($doc), - 'subject' => $this->getSubjects($doc) - ]; + $query_data['body'][] = $this->getDocBody($doc); } Es::bulk($query_data); } @@ -367,16 +589,16 @@ if ($page==$lastPage && $i>=$lastPageEntryCount){ break; } - $this->indexOne($doc); $progressBar->setMessage($doc->getId()); $progressBar->advance(); + $this->indexOne($doc); } } else { - $this->indexBulk($docs); $progressBar->setMessage('Page '.$page); $progressBar->advance(); + $this->indexBulk($docs); } } $progressBar->finish(); diff -r 92fc9d077f95 -r 31a4987f6017 server/src/app/Models/DocumentBase.php --- a/server/src/app/Models/DocumentBase.php Fri Oct 07 02:07:34 2016 +0200 +++ b/server/src/app/Models/DocumentBase.php Sun Oct 09 11:44:18 2016 +0200 @@ -28,8 +28,9 @@ private $title = false; private $lang = null; private $langResolved = null; - private $issued = null; - private $modified = null; + private $issued = false; + private $modified = false; + private $created = false; public function getProvidedCHO() { @@ -44,8 +45,9 @@ $this->title = false; $this->lang = null; $this->langResolved = null; - $this->issued = null; - $this->modified = null; + $this->issued = false; + $this->modified = false; + $this->created = false; } public function getId() { @@ -125,7 +127,7 @@ } public function getIssued() { - if(is_null($this->issued)) { + if($this->issued === false) { try { $this->issued = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/issued>"); } catch(\Exception $e) { @@ -140,8 +142,19 @@ return is_null($issued)?null:$issued->getValue(); } + public function getCreated() { /* lazily loads and memoises the literal; false means "not loaded yet", null means "no value" */ + if($this->created === false) { + try { + $this->created = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/created>"); /* NOTE(review): dcterms property URIs were blank in this patch text and are assumed here and in getIssued/getModified - confirm */ + } catch(\Exception $e) { + $this->created = null; + } + } + return $this->created; + } + public function getModified() { - if(is_null($this->modified)) { + if($this->modified === false) { try { $this->modified = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/modified>"); if(is_null($this->modified)) { diff -r 92fc9d077f95 -r 31a4987f6017 
server/src/config/corpusparole.php --- a/server/src/config/corpusparole.php Fri Oct 07 02:07:34 2016 +0200 +++ b/server/src/config/corpusparole.php Sun Oct 09 11:44:18 2016 +0200 @@ -132,6 +132,8 @@ 'bnf_query_url' => 'http://data.bnf.fr/sparql', 'bnf_completion_url' => 'http://data.bnf.fr/search-letter/', + 'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/', /* capture groups: 1 = full host, 2 = optional alphabetic subdomain with trailing dot, 3 = "resource" or "page", 4 = final path segment */ + 'geonames_base_url' => 'http://sws.geonames.org/', 'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/', 'geonames_cache_expiration' => 60*24*30,