Add fields to document index
authorymh <ymh.work@gmail.com>
Sun, 09 Oct 2016 11:44:18 +0200
changeset 325 31a4987f6017
parent 324 92fc9d077f95
child 326 226d5b17a119
Add fields to document index
dev/provisioning/modules/sysconfig/manifests/php.pp
dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb
dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Models/DocumentBase.php
server/src/config/corpusparole.php
--- a/dev/provisioning/modules/sysconfig/manifests/php.pp	Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/manifests/php.pp	Sun Oct 09 11:44:18 2016 +0200
@@ -29,7 +29,7 @@
     augeas { "php_ini/memory_limit":
         lens    => "PHP.lns",
         incl    => "/opt/remi/php56/root/etc/php.ini",
-        changes => "set PHP/memory_limit 128M",
+        changes => "set PHP/memory_limit 512M",
         notify => Service['httpd'],
         require => Package["php56"]
     }->
--- a/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb	Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/templates/corpus/corpus_env.conf.erb	Sun Oct 09 11:44:18 2016 +0200
@@ -4,7 +4,7 @@
 SetEnv APP_URL http://<%= @vhost %>/corpus-back/
 
 SetEnv DATABASE_DRIVER mysql
-SetEnv DB_HOST localhost
+SetEnv DB_HOST <%= @db_host %>
 SetEnv DB_DATABASE <%= @db_name %>
 SetEnv DB_USERNAME <%= @db_user %>
 SetEnv DB_PASSWORD <%= @db_pw %>
@@ -21,9 +21,41 @@
 
 SetEnv CORPUSPAROLE_COCOON_RDF_BASE_URI http://cocoon.huma-num.fr/exist/crdo/rdf/
 SetEnv CORPUSPAROLE_COCOON_OAIPMH_URL http://cocoon.huma-num.fr/crdo_servlet/oai-pmh
-SetEnv CORPUSPAROLE_SESAME_BASE_URL http://127.0.0.1:8080/openrdf-sesame
+SetEnv CORPUSPAROLE_SESAME_BASE_URL http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame
 SetEnv CORPUSPAROLE_SESAME_REPOSITORY corpus
 SetEnv CORPUSPAROLE_SESAME_REPOSITORY_RAW corpus_raw
 SetEnv CORPUSPAROLE_SESAME_LEXVO_REPOSITORY lexvo
 
 SetEnv EASYRDF_HTTP_CLIENT_TIMEOUT 5000
+
+SetEnv ELASTICSEARCH_URL <%= @elasticsearch_host %>:<%= @elasticsearch_port %>
+SetEnv ELASTICSEARCH_LOG_PATH 'logs/elasticsearch.log'
+SetEnv ELASTICSEARCH_INDEX 'corpus'
+SetEnv ELASTICSEARCH_SHARDS 1
+SetEnv ELASTICSEARCH_REPLICAS 1
+
+SetEnv HANDLE_HOST <%= @handle_host %>
+SetEnv HANDLE_PORT <%= @handle_port %>
+SetEnv HANDLE_PREFIX <%= @handle_prefix %>
+SetEnv HANDLE_ADMIN_ID <%= @handle_admin_id %>
+
+SetEnv HANDLE_CERT_OR_PKEY "<%= @handle_cert_or_pkey %>"
+SetEnv HANDLE_PASSWORD <%= @handle_password %>
+
+SetEnv HANDLE_TEST_PREFIX <%= @handle_test_prefix %>
+
+
+SetEnv HANDLE_TEST_CERT "corpusadmpkeycrt.pem"
+SetEnv HANDLE_TEST_CERT_PASSWORD NULL
+SetEnv HANDLE_TEST_CERT_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN"
+
+SetEnv HANDLE_TEST_RSA_KEY "corpusadmpriv.pem"
+SetEnv HANDLE_TEST_RSA_PASSWORD NULL
+SetEnv HANDLE_TEST_RSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA"
+
+
+SetEnv HANDLE_TEST_DSA_KEY "corpusadmdsapriv.pem"
+SetEnv HANDLE_TEST_DSA_PASSWORD NULL
+SetEnv HANDLE_TEST_DSA_ADMIN_HANDLE "300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA"
+
+SetEnv GEONAMES_USERNAME "<%= @geonames_username %>"
--- a/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb	Fri Oct 07 02:07:34 2016 +0200
+++ b/dev/provisioning/modules/sysconfig/templates/corpus/local.env.erb	Sun Oct 09 11:44:18 2016 +0200
@@ -4,7 +4,7 @@
 APP_URL=http://<%= @vhost %>/corpus-back/
 
 DATABASE_DRIVER=mysql
-DB_HOST=localhost
+DB_HOST=<%= @db_host %>
 DB_DATABASE=<%= @db_name %>
 DB_USERNAME=<%= @db_user %>
 DB_PASSWORD=<%= @db_pw %>
@@ -21,9 +21,42 @@
 
 CORPUSPAROLE_COCOON_RDF_BASE_URI=http://cocoon.huma-num.fr/exist/crdo/rdf/
 CORPUSPAROLE_COCOON_OAIPMH_URL=http://cocoon.huma-num.fr/crdo_servlet/oai-pmh
-CORPUSPAROLE_SESAME_BASE_URL=http://172.16.1.6:8080/openrdf-sesame/
+CORPUSPAROLE_SESAME_BASE_URL=http://<%= @sesame_host %>:<%= @sesame_port %>/openrdf-sesame
 CORPUSPAROLE_SESAME_REPOSITORY=corpus
 CORPUSPAROLE_SESAME_REPOSITORY_RAW=corpus_raw
 CORPUSPAROLE_SESAME_LEXVO_REPOSITORY=lexvo
 
 EASYRDF_HTTP_CLIENT_TIMEOUT=5000
+
+ELASTICSEARCH_URL=<%= @elasticsearch_host %>:<%= @elasticsearch_port %>
+ELASTICSEARCH_LOG_PATH='logs/elasticsearch.log'
+ELASTICSEARCH_INDEX='corpus'
+ELASTICSEARCH_SHARDS=1
+ELASTICSEARCH_REPLICAS=1
+
+HANDLE_HOST=<%= @handle_host %>
+HANDLE_PORT=<%= @handle_port %>
+HANDLE_PREFIX=<%= @handle_prefix %>
+HANDLE_ADMIN_ID=<%= @handle_admin_id %>
+
+HANDLE_CERT_OR_PKEY="<%= @handle_cert_or_pkey %>"
+HANDLE_PASSWORD=<%= @handle_password %>
+
+HANDLE_TEST_PREFIX=<%= @handle_test_prefix %>
+
+
+HANDLE_TEST_CERT="corpusadmpkeycrt.pem"
+HANDLE_TEST_CERT_PASSWORD=NULL
+HANDLE_TEST_CERT_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN"
+
+HANDLE_TEST_RSA_KEY="corpusadmpriv.pem"
+HANDLE_TEST_RSA_PASSWORD=NULL
+HANDLE_TEST_RSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_RSA"
+
+
+HANDLE_TEST_DSA_KEY="corpusadmdsapriv.pem"
+HANDLE_TEST_DSA_PASSWORD=NULL
+HANDLE_TEST_DSA_ADMIN_HANDLE="300:<%= @handle_test_prefix %>/CORPUS_ADMIN_DSA"
+
+GEONAMES_USERNAME="<%= @geonames_username %>"
+
--- a/server/src/app/Console/Commands/IndexDocuments.php	Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 09 11:44:18 2016 +0200
@@ -2,11 +2,19 @@
 
 namespace CorpusParole\Console\Commands;
 
+
+
 use Illuminate\Console\Command;
 use EasyRdf\Resource;
 use EasyRdf\Literal;
+use EasyRdf\Graph;
+
+use Carbon\Carbon;
 
 use GuzzleHttp\Client;
+use GuzzleHttp\Exception\TransferException;
+use GuzzleHttp\Psr7;
+
 use CorpusParole\Libraries\Utils;
 use CorpusParole\Repositories\DocumentRepository;
 use CorpusParole\Libraries\CocoonUtils;
@@ -14,6 +22,8 @@
 use CorpusParole\Services\BnfResolverInterface;
 use CorpusParole\Services\LexvoResolverInterface;
 use Es;
+use Log;
+use Cache;
 
 class IndexDocuments extends Command
 {
@@ -96,6 +106,9 @@
                         'date' => [ 'type' => 'date' ],
                         'geonames_hyerarchy' => [ 'type' => 'string' ],
                         'location' => [ 'type' => 'geo_point' ],
+                        'creation_date' => ['type' => 'date'],
+                        'language' => ['type' => 'string'],
+                        'discourse_types' => ['type' => 'string'],
                         'subject' => [
                             'type' => 'nested',
                             'properties' => [
@@ -118,7 +131,7 @@
 
 
     private function getGeonamesHierarchyArray($geonamesid) {
-        // TODO: Manage this cache !!!
+
         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
         if(is_null($hcache)) {
 
@@ -146,7 +159,6 @@
                 array_push($res, $hierarchyElem['geonameId']);
             }
         }
-
         return $res;
 
     }
@@ -210,7 +222,6 @@
                     'type' => 'txt'
                 ]);
             }
-
             return $res;
         }, []);
 
@@ -248,6 +259,227 @@
         );
     }
 
+    /** Read the WGS84 lat/long literals of $loc from $graph; returns [lat, long], or null when either is missing or not numeric. */
+    private function graphResolvCoordinate($loc, $graph) {
+        $latLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>");
+        $longLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#long>");
+        if(is_null($latLit) || is_null($longLit)) {
+            return null;
+        }
+        $lat = $latLit->getValue();
+        $long = $longLit->getValue();
+        // is_numeric() instead of empty(): a coordinate of 0 (equator, prime meridian) is a valid value.
+        if(!is_numeric($lat) || !is_numeric($long)) {
+            return null;
+        }
+        return [ $lat, $long ];
+    }
+
+    /** Fetch $url and parse the response body as an EasyRdf graph of format $type; logs and returns null on transfer or parse failure. */
+    private function loadGraph($url, $type) {
+        try {
+            $r = $this->httpClient->get($url);
+        } catch (TransferException $e) {
+            $this->error("loadGraph : Error Loading $url");
+            Log::error("loadGraph : Error Loading $url");
+            Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
+            if ($e->hasResponse()) {
+                $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+            }
+            return null;
+        }
+        try {
+            $message = (string)$r->getBody();
+            $graph = new Graph($url, $message, $type);
+            return $graph;
+        } catch (\EasyRdf\Exception $e) {
+            $this->error("loadGraph : Error parsing $url");
+            Log::error("loadGraph : Error parsing $url");
+            if($e instanceof \EasyRdf\Parser\Exception) {
+                Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
+            }
+            $this->error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error content $message");
+            return null;
+        }
+    }
+
+    private function geonamesResolveCoordinates($loc) {
+        $coords = cache("corpus.geonames.coord.$loc");
+        if(is_null($coords)) {
+            $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
+            $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
+            cache(["corpus.geonames.coord.$loc" => is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
+        }
+        return ($coords===false)?null:$coords;
+    }
+
+    private function dbpediaResolveCoordinates($loc) {
+        $coords = cache("corpus.dbpedia.coord.$loc");
+        if(is_null($coords)) {
+            $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
+            $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
+            cache(["corpus.dbpedia.coord.$loc"=> is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
+        }
+        return ($coords===false)?null:$coords;
+    }
+
+    /**
+     * Resolve the coordinates of $doc for the 'location' geo_point field.
+     * Referenced geonames/dbpedia locations are tried first, then the document's own lat/long.
+     * Returns ['lat' => float, 'lon' => float], or null when no usable coordinates are found.
+     */
+    private function getLocation($doc) {
+        $geoRes = $doc->getGeoInfo();
+
+        if(is_null($geoRes)) {
+            return null;
+        }
+
+        // Collect the resolvable reference URLs, grouped by resolver type.
+        $locUrls = [];
+        foreach($geoRes->getRefLocs() as $loc) {
+            if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
+                $locUrls['geonames'][] = "http://sws.geonames.org/$m[1]/";
+            } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
+                $locUrls['dbpedia'][] = "http://$md[1]/data/$md[4]";
+            }
+        }
+
+        // Try each candidate until one resolver yields coordinates.
+        $coordinates = null;
+        foreach($locUrls as $locType => $locList) {
+            foreach($locList as $locationUrl) {
+                $coordinates = $this->{"{$locType}ResolveCoordinates"}($locationUrl);
+                if(!is_null($coordinates)) {
+                    // Leave both loops: a plain "break" would let the next location type overwrite the result.
+                    break 2;
+                }
+            }
+        }
+
+        // Fall back to the coordinates carried by the document itself.
+        if(is_null($coordinates)) {
+            $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
+        }
+        list($lat, $lon) = $coordinates;
+
+        // is_numeric() rather than empty(): 0 is a valid latitude/longitude.
+        if(!is_numeric($lat) || !is_numeric($lon)) {
+            return null;
+        }
+        // Object form: Elasticsearch reads a bare geo_point array as [lon, lat], which would swap the axes.
+        return ['lat' => floatval($lat), 'lon' => floatval($lon)];
+    }
+
+    private function getCreationDate($doc) {
+
+        $created = $doc->getCreated();
+        if(is_null($created)) {
+            return null;
+        }
+        $dateType = $created->getDatatypeUri();
+        $res = null;
+
+        if($dateType === "http://purl.org/dc/terms/Period") {
+            $res = $this->processPeriod($created->getValue());
+        }
+        elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
+            $res = $this->processDate($created->getValue());
+        }
+
+        return $res;
+
+    }
+
+    private function extractDate($dateStr) { // returns a DateTime, or null (with a warning logged) when $dateStr cannot be parsed
+        if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
+            $dateStr = "$dateStr-1-1"; // a bare 4-digit year is read as January 1st of that year
+        }
+        $date = date_create($dateStr);
+        if($date === false ) {
+            Log::warning("IndexDocuments:extractDate bad format for date $dateStr");
+            return null;
+        }
+        return $date;
+    }
+
+    /**
+     * Expand a period string ("start=...; end=...") into one W3C-formatted date per year of the range.
+     * Each year is rendered as January 1st at midnight of that year.
+     * Returns null when a bound is missing or unparseable, or when start is after end.
+     */
+    private function processPeriod($periodStr) {
+        $start = null;
+        $end = null;
+        foreach(explode(";", $periodStr) as $elem) {
+            $elem = trim($elem);
+            if(strpos($elem, 'start=') === 0) {
+                $startDate = $this->extractDate(trim(substr($elem, 6)));
+                if(is_null($startDate)) {
+                    return null;
+                }
+                $start = intval($startDate->format("Y"));
+            } elseif(strpos($elem, 'end=') === 0) {
+                $endDate = $this->extractDate(trim(substr($elem, 4)));
+                if(is_null($endDate)) {
+                    return null;
+                }
+                $end = intval($endDate->format("Y"));
+            }
+        }
+
+        if(is_null($start) || is_null($end) || $start>$end ) {
+            Log::warning("Bad format for $periodStr");
+            return null;
+        }
+
+        return array_map(function($y) {
+            // "!Y" resets the unparsed fields; a plain "Y" would fill them from the current date and time.
+            return \DateTime::createFromFormat("!Y", "$y")->format(\DateTime::W3C);
+        }, range($start, $end));
+    }
+
+    private function processDate($dateStr) {
+        $date = $this->extractDate($dateStr);
+        if(is_null($date))  {
+            return null;
+        } else {
+            return $date->format(\DateTime::W3C);
+        }
+    }
+
+    private function getDiscourseTypes($doc) {
+        return array_reduce($doc->getDiscourseTypes(), function($res, $d) {
+            $val = null;
+            if($d instanceof Resource) {
+                $val = $d->getUri();
+            } elseif($d instanceof Literal) {
+                $datatype = $d->getDatatypeURI();
+                $val = (!empty($datatype)?"$datatype#":"").$d->getValue();
+            }
+            if(!empty($val)) {
+                array_push($res,$val);
+            }
+            return $res;
+        }, []);
+    }
+
+    private function getDocBody($doc) {
+        return [
+            'title' => (string)$doc->getTitle(),
+            'date' => (string)$doc->getModified(),
+            'location' => $this->getLocation($doc),
+            'creation_date' => $this->getCreationDate($doc),
+            'language' => $doc->getLanguageValue(),
+            'discourse_types' => $this->getDiscourseTypes($doc),
+            'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+            'subject' => $this->getSubjects($doc),
+        ];
+    }
+
     /**
      * Index one document into Elasticsearch
      *
@@ -260,12 +492,7 @@
             'index' => config('elasticsearch.index'),
             'type' => 'document',
             'id' => (string)$doc->getId(),
-            'body' => [
-                'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                'subject' => $this->getSubjects($doc)
-            ]
+            'body' => $this->getDocBody($doc)
         ];
         Es::index($query_data);
     }
@@ -287,12 +514,7 @@
                       '_id' => (string)$doc->getId()
                   ]
               ];
-              $query_data['body'][] = [
-                  'title' => (string)$doc->getTitle(),
-                  'date' => (string)$doc->getModified(),
-                  'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                  'subject' => $this->getSubjects($doc)
-              ];
+              $query_data['body'][] = $this->getDocBody($doc);
           }
           Es::bulk($query_data);
      }
@@ -367,16 +589,16 @@
                     if ($page==$lastPage && $i>=$lastPageEntryCount){
                         break;
                     }
-                    $this->indexOne($doc);
                     $progressBar->setMessage($doc->getId());
                     $progressBar->advance();
+                    $this->indexOne($doc);
                 }
             }
             else
             {
-                $this->indexBulk($docs);
                 $progressBar->setMessage('Page '.$page);
                 $progressBar->advance();
+                $this->indexBulk($docs);
             }
         }
         $progressBar->finish();
--- a/server/src/app/Models/DocumentBase.php	Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Models/DocumentBase.php	Sun Oct 09 11:44:18 2016 +0200
@@ -28,8 +28,9 @@
     private $title = false;
     private $lang = null;
     private $langResolved = null;
-    private $issued = null;
-    private $modified = null;
+    private $issued = false;
+    private $modified = false;
+    private $created = false;
 
 
     public function getProvidedCHO() {
@@ -44,8 +45,9 @@
         $this->title = false;
         $this->lang = null;
         $this->langResolved = null;
-        $this->issued = null;
-        $this->modified = null;
+        $this->issued = false;
+        $this->modified = false;
+        $this->created = false;
     }
 
     public function getId() {
@@ -125,7 +127,7 @@
     }
 
     public function getIssued() {
-        if(is_null($this->issued)) {
+        if($this->issued === false) {
             try {
                 $this->issued = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/issued>");
             } catch(\Exception $e) {
@@ -140,8 +142,19 @@
         return is_null($issued)?null:$issued->getValue();
     }
 
+    public function getCreated() {
+        if($this->created === false) {
+            try {
+                $this->created = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/created>");
+            } catch(\Exception $e) {
+                $this->created = null;
+            }
+        }
+        return $this->created;
+    }
+
     public function getModified() {
-        if(is_null($this->modified)) {
+        if($this->modified === false) {
             try {
                 $this->modified = $this->getProvidedCHO()->getLiteral("<http://purl.org/dc/terms/modified>");
                 if(is_null($this->modified)) {
--- a/server/src/config/corpusparole.php	Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/config/corpusparole.php	Sun Oct 09 11:44:18 2016 +0200
@@ -132,6 +132,8 @@
     'bnf_query_url' => 'http://data.bnf.fr/sparql',
     'bnf_completion_url' => 'http://data.bnf.fr/search-letter/',
 
+    'dbpedia_url_regexp' => '/^https?\:\/\/(([[:alpha:]]+\.)?dbpedia\.org)\/(resource|page)\/([^\/]+)\/?$/',
+
     'geonames_base_url' => 'http://sws.geonames.org/',
     'geonames_url_regexp' => '/http[s]?\:\/\/(?:sws|www)\.geonames\.org\/(\d+)\/?/',
     'geonames_cache_expiration' => 60*24*30,