diff -r 92fc9d077f95 -r 31a4987f6017 server/src/app/Console/Commands/IndexDocuments.php
--- a/server/src/app/Console/Commands/IndexDocuments.php	Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 09 11:44:18 2016 +0200
@@ -2,11 +2,19 @@
 
 namespace CorpusParole\Console\Commands;
 
+
+
 use Illuminate\Console\Command;
 use EasyRdf\Resource;
 use EasyRdf\Literal;
+use EasyRdf\Graph;
+
+use Carbon\Carbon;
 use GuzzleHttp\Client;
+use GuzzleHttp\Exception\TransferException;
+use GuzzleHttp\Psr7;
+
 
 use CorpusParole\Libraries\Utils;
 use CorpusParole\Repositories\DocumentRepository;
 use CorpusParole\Libraries\CocoonUtils;
@@ -14,6 +22,8 @@
 use CorpusParole\Services\BnfResolverInterface;
 use CorpusParole\Services\LexvoResolverInterface;
 use Es;
+use Log;
+use Cache;
 
 class IndexDocuments extends Command
 {
@@ -96,6 +106,9 @@
                         'date' => [ 'type' => 'date' ],
                         'geonames_hyerarchy' => [ 'type' => 'string' ],
                         'location' => [ 'type' => 'geo_point' ],
+                        'creation_date' => ['type' => 'date'],
+                        'language' => ['type' => 'string'],
+                        'discourse_types' => ['type' => 'string'],
                         'subject' => [
                             'type' => 'nested',
                             'properties' => [
@@ -118,7 +131,7 @@
 
 
     private function getGeonamesHierarchyArray($geonamesid) {
-        // TODO: Manage this cache !!!
+
         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
 
         if(is_null($hcache)) {
@@ -146,7 +159,6 @@
                 array_push($res, $hierarchyElem['geonameId']);
             }
         }
-
         return $res;
     }
 
@@ -210,7 +222,6 @@
                     'type' => 'txt'
                 ]);
             }
-
             return $res;
         }, []);
 
@@ -248,6 +259,317 @@
         );
     }
 
+    /**
+     * Tell whether a coordinate value is missing.
+     *
+     * empty() alone is not enough: it would also reject 0, which is a valid
+     * latitude or longitude.
+     *
+     * @param mixed $value raw latitude or longitude value
+     * @return bool true if there is no usable value
+     */
+    private function isMissingCoordinate($value) {
+        return empty($value) && !is_numeric($value);
+    }
+
+    /**
+     * Read the WGS84 position of a resource from an RDF graph.
+     *
+     * @param string $loc URI of the resource to look up
+     * @param \EasyRdf\Graph $graph graph in which the resource is looked up
+     * @return array|null [latitude, longitude], or null if one of them is missing
+     */
+    private function graphResolvCoordinate($loc, $graph) {
+        // "<uri>" is the EasyRdf property path syntax for a full property URI.
+        $latLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>");
+        if(is_null($latLit) || $this->isMissingCoordinate($latLit->getValue())) {
+            return null;
+        }
+        $lat = $latLit->getValue();
+
+        $longLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#long>");
+        if(is_null($longLit) || $this->isMissingCoordinate($longLit->getValue())) {
+            return null;
+        }
+        $long = $longLit->getValue();
+
+        return [ $lat, $long ];
+    }
+
+    /**
+     * Download an RDF document and parse it into a graph.
+     *
+     * Download and parse errors are reported on the console and in the log.
+     *
+     * @param string $url address of the RDF document
+     * @param string $type format of the document, as expected by EasyRdf (e.g. 'rdfxml')
+     * @return \EasyRdf\Graph|null the parsed graph, or null on download or parse error
+     */
+    private function loadGraph($url, $type) {
+        try {
+            $r = $this->httpClient->get($url);
+        } catch (TransferException $e) {
+            $this->error("loadGraph : Error Loading $url");
+            Log::error("loadGraph : Error Loading $url");
+            // Only request exceptions give access to the request and the response.
+            if($e instanceof \GuzzleHttp\Exception\RequestException) {
+                Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
+                if ($e->hasResponse()) {
+                    $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                    Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                }
+            }
+            return null;
+        }
+        try {
+            $message = (string)$r->getBody();
+            $graph = new Graph($url, $message, $type);
+            return $graph;
+        } catch (\EasyRdf\Exception $e) {
+            // The EasyRdf class names are fully qualified on purpose: a relative name
+            // would be resolved inside the current namespace and never match.
+            $this->error("loadGraph : Error parsing $url");
+            Log::error("loadGraph : Error parsing $url");
+            if($e instanceof \EasyRdf\Parser\Exception) {
+                Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
+            }
+            $this->error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error content $message");
+            return null;
+        }
+    }
+
+    /**
+     * Resolve the coordinates of a resource, going through the cache.
+     *
+     * A lookup that fails is cached as false, which is turned back into null for the caller.
+     *
+     * @param string $cacheKey cache key of the resource
+     * @param string $rdfUrl address of the RDF description of the resource
+     * @param string $loc URI of the resource
+     * @return array|null [latitude, longitude], or null if they can not be resolved
+     */
+    private function resolveCoordinatesWithCache($cacheKey, $rdfUrl, $loc) {
+        $coords = cache($cacheKey);
+        if(is_null($coords)) {
+            $graph = $this->loadGraph($rdfUrl, 'rdfxml');
+            $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
+            cache([$cacheKey => is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
+        }
+        return ($coords===false)?null:$coords;
+    }
+
+    /**
+     * Resolve the coordinates of a geonames resource.
+     *
+     * @param string $loc URI of the resource, with its trailing slash
+     * @return array|null [latitude, longitude], or null if they can not be resolved
+     */
+    private function geonamesResolveCoordinates($loc) {
+        return $this->resolveCoordinatesWithCache("corpus.geonames.coord.$loc", "{$loc}about.rdf", $loc);
+    }
+
+    /**
+     * Resolve the coordinates of a dbpedia resource.
+     *
+     * @param string $loc URI of the resource
+     * @return array|null [latitude, longitude], or null if they can not be resolved
+     */
+    private function dbpediaResolveCoordinates($loc) {
+        return $this->resolveCoordinatesWithCache("corpus.dbpedia.coord.$loc", "$loc.rdf", $loc);
+    }
+
+    /**
+     * Compute the position of a document.
+     *
+     * The locations referenced by the document (geonames or dbpedia) are resolved
+     * first; the coordinates carried by the document itself are only a fallback.
+     *
+     * @param mixed $doc the document being indexed
+     * @return array|null [latitude, longitude] as floats, or null if unknown
+     */
+    private function getLocation($doc) {
+
+        $geoRes = $doc->getGeoInfo();
+
+        if(is_null($geoRes)) {
+            return null;
+        }
+
+        $locUrls = [];
+        foreach($geoRes->getRefLocs() as $loc) {
+            if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
+
+                if(!array_key_exists('geonames', $locUrls)) {
+                    $locUrls['geonames'] = [];
+                }
+                array_push($locUrls['geonames'], "http://sws.geonames.org/$m[1]/");
+
+            } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
+                if(!array_key_exists('dbpedia', $locUrls)) {
+                    $locUrls['dbpedia'] = [];
+                }
+                array_push($locUrls['dbpedia'], "http://$md[1]/data/$md[4]");
+            }
+        }
+
+        $coordinates = null;
+        foreach($locUrls as $locType => $locList) {
+            $resolver = "{$locType}ResolveCoordinates";
+            foreach($locList as $locationUrl) {
+                $coordinates = $this->$resolver($locationUrl);
+                if(!is_null($coordinates)) {
+                    // Leave both loops: going on with the next location type would
+                    // overwrite the coordinates that have just been found.
+                    break 2;
+                }
+            }
+        }
+
+        if(is_null($coordinates)) {
+            $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
+        }
+
+        if($this->isMissingCoordinate($coordinates[0]) || $this->isMissingCoordinate($coordinates[1])) {
+            return null;
+        }
+        return [floatval($coordinates[0]), floatval($coordinates[1])];
+    }
+
+    /**
+     * Compute the creation date(s) of a document.
+     *
+     * @param mixed $doc the document being indexed
+     * @return array|string|null a list of dates for a dcterms:Period, a single date
+     *         for a dcterms:W3CDTF value, null in every other case
+     */
+    private function getCreationDate($doc) {
+        $created = $doc->getCreated();
+        if(is_null($created)) {
+            return null;
+        }
+
+        $dateType = $created->getDatatypeUri();
+        if($dateType === "http://purl.org/dc/terms/Period") {
+            return $this->processPeriod($created->getValue());
+        }
+        if($dateType === "http://purl.org/dc/terms/W3CDTF") {
+            return $this->processDate($created->getValue());
+        }
+        return null;
+    }
+
+    /**
+     * Parse a date string.
+     *
+     * @param string $dateStr the date; a bare 4 digit year stands for January 1st of that year
+     * @return \DateTime|null the parsed date, or null if the string can not be parsed
+     */
+    private function extractDate($dateStr) {
+        if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
+            $dateStr = "$dateStr-1-1";
+        }
+        $date = date_create($dateStr);
+        if($date === false ) {
+            Log::warning("IndexDocuments:extractDate bad format for date $dateStr");
+            return null;
+        }
+        return $date;
+    }
+
+    /**
+     * Expand a dcterms:Period value into one date per year.
+     *
+     * @param string $periodStr the period, made of ";" separated "start=..." and "end=..." parts
+     * @return array|null one W3C formatted date (January 1st) per year from start to end,
+     *         or null if a bound is missing or invalid, or if start is after end
+     */
+    private function processPeriod($periodStr) {
+        $start = null;
+        $end = null;
+        foreach(explode(";", $periodStr) as $elem) {
+            $elem = trim($elem);
+            if(strpos($elem, 'start=') === 0) {
+                $startDate = $this->extractDate(trim(substr($elem, 6)));
+                if(is_null($startDate)) {
+                    return null;
+                }
+                $start = intval($startDate->format("Y"));
+            } elseif(strpos($elem, 'end=') === 0) {
+                $endDate = $this->extractDate(trim(substr($elem, 4)));
+                if(is_null($endDate)) {
+                    return null;
+                }
+                $end = intval($endDate->format("Y"));
+            }
+        }
+
+        if(is_null($start) || is_null($end) || $start>$end ) {
+            Log::warning("Bad format for $periodStr");
+            return null;
+        }
+
+        return array_map(function($y) {
+            // "!" resets the fields missing from the format: without it they would
+            // be taken from the current date and time.
+            return \DateTime::createFromFormat("!Y", "$y")->format(\DateTime::W3C);
+        }, range($start, $end));
+    }
+
+    /**
+     * Format a date string as a W3C date.
+     *
+     * @param string $dateStr the date to format
+     * @return string|null the formatted date, or null if the string can not be parsed
+     */
+    private function processDate($dateStr) {
+        $date = $this->extractDate($dateStr);
+        return is_null($date) ? null : $date->format(\DateTime::W3C);
+    }
+
+    /**
+     * List the discourse types of a document as strings.
+     *
+     * @param mixed $doc the document being indexed
+     * @return array the URI of each resource, and the value of each literal
+     *         (prefixed with its datatype and "#" when it has one); empty values are left out
+     */
+    private function getDiscourseTypes($doc) {
+        return array_reduce($doc->getDiscourseTypes(), function($res, $d) {
+            $val = null;
+            if($d instanceof Resource) {
+                $val = $d->getUri();
+            } elseif($d instanceof Literal) {
+                $datatype = $d->getDatatypeURI();
+                $val = (!empty($datatype)?"$datatype#":"").$d->getValue();
+            }
+            if(!empty($val)) {
+                array_push($res,$val);
+            }
+            return $res;
+        }, []);
+    }
+
+    /**
+     * Build the Elasticsearch body of a document.
+     *
+     * @param mixed $doc the document being indexed
+     * @return array the fields to index
+     */
+    private function getDocBody($doc) {
+        return [
+            'title' => (string)$doc->getTitle(),
+            'date' => (string)$doc->getModified(),
+            'location' => $this->getLocation($doc),
+            'creation_date' => $this->getCreationDate($doc),
+            'language' => $doc->getLanguageValue(),
+            'discourse_types' => $this->getDiscourseTypes($doc),
+            'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+            'subject' => $this->getSubjects($doc),
+        ];
+    }
+
     /**
      * Index one document into Elasticsearch
      *
@@ -260,12 +582,7 @@
             'index' => config('elasticsearch.index'),
             'type' => 'document',
             'id' => (string)$doc->getId(),
-            'body' => [
-                'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                'subject' => $this->getSubjects($doc)
-            ]
+            'body' => $this->getDocBody($doc)
         ];
         Es::index($query_data);
     }
@@ -287,12 +604,7 @@
                     '_id' => (string)$doc->getId()
                 ]
             ];
-            $query_data['body'][] = [
-                'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                'subject' => $this->getSubjects($doc)
-            ];
+            $query_data['body'][] = $this->getDocBody($doc);
         }
         Es::bulk($query_data);
     }
@@ -367,16 +679,16 @@
                     if ($page==$lastPage && $i>=$lastPageEntryCount){
                         break;
                     }
-                    $this->indexOne($doc);
                     $progressBar->setMessage($doc->getId());
                     $progressBar->advance();
+                    $this->indexOne($doc);
                 }
             }
             else
             {
-                $this->indexBulk($docs);
                 $progressBar->setMessage('Page '.$page);
                 $progressBar->advance();
+                $this->indexBulk($docs);
             }
         }
         $progressBar->finish();