--- a/server/src/app/Console/Commands/IndexDocuments.php Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 09 11:44:18 2016 +0200
@@ -2,11 +2,19 @@
namespace CorpusParole\Console\Commands;
+
+
use Illuminate\Console\Command;
use EasyRdf\Resource;
use EasyRdf\Literal;
+use EasyRdf\Graph;
+
+use Carbon\Carbon;
use GuzzleHttp\Client;
+use GuzzleHttp\Exception\TransferException;
+use GuzzleHttp\Psr7;
+
use CorpusParole\Libraries\Utils;
use CorpusParole\Repositories\DocumentRepository;
use CorpusParole\Libraries\CocoonUtils;
@@ -14,6 +22,8 @@
use CorpusParole\Services\BnfResolverInterface;
use CorpusParole\Services\LexvoResolverInterface;
use Es;
+use Log;
+use Cache;
class IndexDocuments extends Command
{
@@ -96,6 +106,9 @@
'date' => [ 'type' => 'date' ],
'geonames_hyerarchy' => [ 'type' => 'string' ],
'location' => [ 'type' => 'geo_point' ],
+ 'creation_date' => ['type' => 'date'],
+ 'language' => ['type' => 'string'],
+ 'discourse_types' => ['type' => 'string'],
'subject' => [
'type' => 'nested',
'properties' => [
@@ -118,7 +131,7 @@
private function getGeonamesHierarchyArray($geonamesid) {
- // TODO: Manage this cache !!!
+
$hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
if(is_null($hcache)) {
@@ -146,7 +159,6 @@
array_push($res, $hierarchyElem['geonameId']);
}
}
-
return $res;
}
@@ -210,7 +222,6 @@
'type' => 'txt'
]);
}
-
return $res;
}, []);
@@ -248,6 +259,227 @@
);
}
+ /**
+  * Extract the WGS84 latitude and longitude literals of a resource from a graph.
+  *
+  * @param string $loc URI of the resource whose coordinates are wanted
+  * @param \EasyRdf\Graph $graph graph to query
+  * @return array|null [latitude, longitude] literal values, or null as soon as
+  *                    one of the two literals is missing or has an empty() value
+  */
+ private function graphResolvCoordinate($loc, $graph) {
+     // Latitude is looked up first; longitude is only queried when latitude is usable.
+     $properties = [
+         "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>",
+         "<http://www.w3.org/2003/01/geo/wgs84_pos#long>",
+     ];
+
+     $point = [];
+     foreach($properties as $property) {
+         $literal = $graph->getLiteral($loc, $property);
+         $value = is_null($literal) ? null : $literal->getValue();
+         if(empty($value)) {
+             return null;
+         }
+         $point[] = $value;
+     }
+
+     return $point;
+ }
+
+ /**
+  * Download an RDF document and parse it into an EasyRdf graph.
+  *
+  * Transfer and parsing failures are reported on the console and in the
+  * log, then turned into a null result instead of being propagated.
+  *
+  * @param string $url address of the RDF document, also used as the graph URI
+  * @param string $type serialisation format handed to the EasyRdf parser
+  * @return \EasyRdf\Graph|null the parsed graph, or null when the download or the parsing failed
+  */
+ private function loadGraph($url, $type) {
+     try {
+         $r = $this->httpClient->get($url);
+     } catch (TransferException $e) {
+         $this->error("loadGraph : Error Loading $url");
+         Log::error("loadGraph : Error Loading $url");
+         // getRequest()/hasResponse() are defined on RequestException, not on the
+         // TransferException base class, so only use them when they are available.
+         if ($e instanceof \GuzzleHttp\Exception\RequestException) {
+             Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
+             if ($e->hasResponse()) {
+                 $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                 Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+             }
+         }
+         return null;
+     }
+
+     // Read the body outside the parsing try block so it is always defined for the error log.
+     $message = (string)$r->getBody();
+     try {
+         return new Graph($url, $message, $type);
+     } catch (\EasyRdf\Exception $e) {
+         // Fully qualified on purpose: a relative "EasyRdf\Exception" is resolved against
+         // the current namespace unless an "EasyRdf" alias is imported, and would never match.
+         $this->error("loadGraph : Error parsing $url");
+         Log::error("loadGraph : Error parsing $url");
+         if($e instanceof \EasyRdf\Parser\Exception) {
+             // getLine() is the PHP source line of the throw; the position inside the
+             // parsed document is exposed by the parser-specific accessors.
+             Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
+         }
+         $this->error("loadGraph : Error exception message ".$e->getMessage());
+         Log::error("loadGraph : Error exception message ".$e->getMessage());
+         Log::error("loadGraph : Error content $message");
+         return null;
+     }
+ }
+
+ /**
+  * Resolve the coordinates of a Geonames resource, going through the cache.
+  *
+  * The outcome is cached for 20 minutes; a failed lookup is stored as
+  * false so that it is not retried while the entry lives.
+  *
+  * @param string $loc base URL of the Geonames resource ("about.rdf" is appended to it)
+  * @return array|null [latitude, longitude], or null when they cannot be resolved
+  */
+ private function geonamesResolveCoordinates($loc) {
+     $cacheKey = "corpus.geonames.coord.$loc";
+
+     $cached = cache($cacheKey);
+     if(!is_null($cached)) {
+         // false is the marker of a previously failed lookup
+         return ($cached === false) ? null : $cached;
+     }
+
+     $resolved = null;
+     $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
+     if(!is_null($graph)) {
+         $resolved = $this->graphResolvCoordinate($loc, $graph);
+     }
+
+     $toStore = is_null($resolved) ? false : $resolved;
+     cache([$cacheKey => $toStore], Carbon::now()->addMinutes(20));
+
+     return $resolved;
+ }
+
+ /**
+  * Resolve the coordinates of a DBpedia resource, going through the cache.
+  *
+  * The outcome is cached for 20 minutes; a failed lookup is stored as
+  * false so that it is not retried while the entry lives.
+  *
+  * @param string $loc URL of the DBpedia data resource (".rdf" is appended to it)
+  * @return array|null [latitude, longitude], or null when they cannot be resolved
+  */
+ private function dbpediaResolveCoordinates($loc) {
+     $cacheKey = "corpus.dbpedia.coord.$loc";
+
+     $cached = cache($cacheKey);
+     if(!is_null($cached)) {
+         // false is the marker of a previously failed lookup
+         return ($cached === false) ? null : $cached;
+     }
+
+     $resolved = null;
+     $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
+     if(!is_null($graph)) {
+         $resolved = $this->graphResolvCoordinate($loc, $graph);
+     }
+
+     $toStore = is_null($resolved) ? false : $resolved;
+     cache([$cacheKey => $toStore], Carbon::now()->addMinutes(20));
+
+     return $resolved;
+ }
+
+ /**
+  * Compute the coordinates to index for a document.
+  *
+  * The reference locations of the document's geo info are tried first
+  * (Geonames or DBpedia URLs, resolved through the matching
+  * "<type>ResolveCoordinates" method); the latitude/longitude carried by
+  * the geo info itself are the fallback.
+  *
+  * @param mixed $doc document exposing getGeoInfo()
+  * @return array|null [latitude, longitude] as floats, or null when no usable pair is found
+  */
+ private function getLocation($doc) {
+     $geoRes = $doc->getGeoInfo();
+     if(is_null($geoRes)) {
+         return null;
+     }
+
+     // Group the candidate URLs by resolver type, in order of first appearance.
+     $locUrls = [];
+     foreach($geoRes->getRefLocs() as $loc) {
+         if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
+             $locUrls['geonames'][] = "http://sws.geonames.org/$m[1]/";
+         } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
+             $locUrls['dbpedia'][] = "http://$md[1]/data/$md[4]";
+         }
+     }
+
+     $coordinates = null;
+     foreach($locUrls as $locType => $locList) {
+         $resolver = "{$locType}ResolveCoordinates";
+         foreach($locList as $locationUrl) {
+             $coordinates = $this->$resolver($locationUrl);
+             if(!is_null($coordinates)) {
+                 // Leave both loops: otherwise the next resolver type would
+                 // overwrite the coordinates just found, possibly with null.
+                 break 2;
+             }
+         }
+     }
+
+     if(is_null($coordinates)) {
+         $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
+     }
+
+     // NOTE(review): empty() also rejects a coordinate equal to 0 - kept as is, confirm whether intended.
+     if(empty($coordinates[0]) || empty($coordinates[1])) {
+         return null;
+     }
+
+     return [floatval($coordinates[0]), floatval($coordinates[1])];
+ }
+
+ /**
+  * Convert the "created" literal of a document into indexable date(s).
+  *
+  * Only the dcterms Period and W3CDTF datatypes are handled; any other
+  * datatype yields null.
+  *
+  * @param mixed $doc document exposing getCreated()
+  * @return array|string|null result of processPeriod() or processDate(), or null
+  */
+ private function getCreationDate($doc) {
+     $created = $doc->getCreated();
+     if(is_null($created)) {
+         return null;
+     }
+
+     $dateType = $created->getDatatypeUri();
+
+     if($dateType === "http://purl.org/dc/terms/W3CDTF") {
+         return $this->processDate($created->getValue());
+     }
+
+     if($dateType === "http://purl.org/dc/terms/Period") {
+         return $this->processPeriod($created->getValue());
+     }
+
+     return null;
+ }
+
+ /**
+  * Parse a date string into a DateTime.
+  *
+  * A bare four-digit year is completed to the first of January of that year.
+  *
+  * @param string $dateStr date to parse, in any format understood by date_create()
+  * @return \DateTime|null the parsed date, or null (with a warning logged) when
+  *                        the string is empty or cannot be parsed
+  */
+ private function extractDate($dateStr) {
+     $dateStr = trim((string)$dateStr);
+
+     // date_create('') silently returns the current time: an empty value must
+     // be rejected explicitly or it would be indexed as "now".
+     if($dateStr === '') {
+         Log::warning("IndexDocuments:extractDate empty date");
+         return null;
+     }
+
+     if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
+         $dateStr = "$dateStr-1-1";
+     }
+
+     $date = date_create($dateStr);
+     if($date === false) {
+         Log::warning("IndexDocuments:extractDate bad format for date $dateStr");
+         return null;
+     }
+
+     return $date;
+ }
+
+ /**
+  * Expand a period string into one date per year it covers.
+  *
+  * The string is split on ";" and the "start=" and "end=" components are
+  * read; other components are ignored. When a component is repeated the
+  * last occurrence wins.
+  *
+  * @param string $periodStr period description, e.g. "start=1970; end=1972"
+  * @return array|null W3C formatted dates (first of January, midnight) for every
+  *                    year from start to end inclusive, or null when a bound is
+  *                    missing, unparsable, or when start is after end
+  */
+ private function processPeriod($periodStr) {
+     $bounds = ['start' => null, 'end' => null];
+
+     foreach(explode(";", $periodStr) as $elem) {
+         $elem = trim($elem);
+         foreach(array_keys($bounds) as $name) {
+             $prefix = "$name=";
+             if(strpos($elem, $prefix) !== 0) {
+                 continue;
+             }
+             $date = $this->extractDate(trim(substr($elem, strlen($prefix))));
+             if(is_null($date)) {
+                 return null;
+             }
+             $bounds[$name] = intval($date->format("Y"));
+             break;
+         }
+     }
+
+     $start = $bounds['start'];
+     $end = $bounds['end'];
+     if(is_null($start) || is_null($end) || $start > $end) {
+         Log::warning("Bad format for $periodStr");
+         return null;
+     }
+
+     return array_map(function($y) {
+         // Pin every year to 1 January 00:00:00. Building the date from the year
+         // alone would take month, day and time from the current clock, making
+         // the indexed value depend on when the command is run.
+         $date = new \DateTime();
+         $date->setDate($y, 1, 1);
+         $date->setTime(0, 0, 0);
+         return $date->format(\DateTime::W3C);
+     }, range($start, $end));
+ }
+
+ /**
+  * Normalise a single date string to the W3C date format.
+  *
+  * @param string $dateStr date to parse with extractDate()
+  * @return string|null the W3C formatted date, or null when extractDate() rejects the input
+  */
+ private function processDate($dateStr) {
+     $parsed = $this->extractDate($dateStr);
+
+     return is_null($parsed) ? null : $parsed->format(\DateTime::W3C);
+ }
+
+ /**
+  * List the discourse types of a document as plain strings.
+  *
+  * A Resource contributes its URI; a Literal contributes its value,
+  * prefixed by "<datatype>#" when it has a datatype. Entries of any other
+  * kind, and entries whose string is empty(), are left out.
+  *
+  * @param mixed $doc document exposing getDiscourseTypes()
+  * @return array list of strings
+  */
+ private function getDiscourseTypes($doc) {
+     $types = [];
+
+     foreach($doc->getDiscourseTypes() as $d) {
+         if($d instanceof Resource) {
+             $label = $d->getUri();
+         } elseif($d instanceof Literal) {
+             $datatype = $d->getDatatypeURI();
+             $prefix = empty($datatype) ? "" : "$datatype#";
+             $label = $prefix.$d->getValue();
+         } else {
+             continue;
+         }
+
+         if(!empty($label)) {
+             $types[] = $label;
+         }
+     }
+
+     return $types;
+ }
+
+ /**
+  * Assemble the fields indexed for a document.
+  *
+  * @param mixed $doc document to read the fields from
+  * @return array associative array with the keys title, date, location,
+  *               creation_date, language, discourse_types,
+  *               geonames_hierarchy and subject
+  */
+ private function getDocBody($doc) {
+     $body = [];
+
+     $body['title'] = (string)$doc->getTitle();
+     $body['date'] = (string)$doc->getModified();
+     $body['location'] = $this->getLocation($doc);
+     $body['creation_date'] = $this->getCreationDate($doc);
+     $body['language'] = $doc->getLanguageValue();
+     $body['discourse_types'] = $this->getDiscourseTypes($doc);
+     $body['geonames_hierarchy'] = $this->getGeonamesHierarchy($doc);
+     $body['subject'] = $this->getSubjects($doc);
+
+     return $body;
+ }
+
/**
* Index one document into Elasticsearch
*
@@ -260,12 +492,7 @@
'index' => config('elasticsearch.index'),
'type' => 'document',
'id' => (string)$doc->getId(),
- 'body' => [
- 'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified(),
- 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
- 'subject' => $this->getSubjects($doc)
- ]
+ 'body' => $this->getDocBody($doc)
];
Es::index($query_data);
}
@@ -287,12 +514,7 @@
'_id' => (string)$doc->getId()
]
];
- $query_data['body'][] = [
- 'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified(),
- 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
- 'subject' => $this->getSubjects($doc)
- ];
+ $query_data['body'][] = $this->getDocBody($doc);
}
Es::bulk($query_data);
}
@@ -367,16 +589,16 @@
if ($page==$lastPage && $i>=$lastPageEntryCount){
break;
}
- $this->indexOne($doc);
$progressBar->setMessage($doc->getId());
$progressBar->advance();
+ $this->indexOne($doc);
}
}
else
{
- $this->indexBulk($docs);
$progressBar->setMessage('Page '.$page);
$progressBar->advance();
+ $this->indexBulk($docs);
}
}
$progressBar->finish();