server/src/app/Console/Commands/IndexDocuments.php
changeset 325 31a4987f6017
parent 323 47f0611cc57d
child 326 226d5b17a119
--- a/server/src/app/Console/Commands/IndexDocuments.php	Fri Oct 07 02:07:34 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 09 11:44:18 2016 +0200
@@ -2,11 +2,19 @@
 
 namespace CorpusParole\Console\Commands;
 
+
+
 use Illuminate\Console\Command;
 use EasyRdf\Resource;
 use EasyRdf\Literal;
+use EasyRdf\Graph;
+
+use Carbon\Carbon;
 
 use GuzzleHttp\Client;
+use GuzzleHttp\Exception\TransferException;
+use GuzzleHttp\Psr7;
+
 use CorpusParole\Libraries\Utils;
 use CorpusParole\Repositories\DocumentRepository;
 use CorpusParole\Libraries\CocoonUtils;
@@ -14,6 +22,8 @@
 use CorpusParole\Services\BnfResolverInterface;
 use CorpusParole\Services\LexvoResolverInterface;
 use Es;
+use Log;
+use Cache;
 
 class IndexDocuments extends Command
 {
@@ -96,6 +106,9 @@
                         'date' => [ 'type' => 'date' ],
                         'geonames_hyerarchy' => [ 'type' => 'string' ],
                         'location' => [ 'type' => 'geo_point' ],
+                        'creation_date' => ['type' => 'date'],
+                        'language' => ['type' => 'string'],
+                        'discourse_types' => ['type' => 'string'],
                         'subject' => [
                             'type' => 'nested',
                             'properties' => [
@@ -118,7 +131,7 @@
 
 
     private function getGeonamesHierarchyArray($geonamesid) {
-        // TODO: Manage this cache !!!
+
         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
         if(is_null($hcache)) {
 
@@ -146,7 +159,6 @@
                 array_push($res, $hierarchyElem['geonameId']);
             }
         }
-
         return $res;
 
     }
@@ -210,7 +222,6 @@
                     'type' => 'txt'
                 ]);
             }
-
             return $res;
         }, []);
 
@@ -248,6 +259,227 @@
         );
     }
 
+    /**
+     * Read the WGS84 latitude and longitude literals attached to a resource.
+     *
+     * @param string $loc URI of the resource whose coordinates are looked up
+     * @param \EasyRdf\Graph $graph graph queried for the literals
+     * @return array|null [latitude, longitude] with the raw literal values, or
+     *                    null when one of the two literals is missing or blank
+     */
+    private function graphResolvCoordinate($loc, $graph) {
+        $coords = [];
+        foreach(['lat', 'long'] as $axis) {
+            $literal = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#$axis>");
+            $value = is_null($literal)?null:$literal->getValue();
+            // No empty() here: it would also reject 0, which is a valid
+            // latitude (equator) or longitude (Greenwich meridian).
+            if(in_array($value, [null, false, ''], true)) {
+                return null;
+            }
+            $coords[] = $value;
+        }
+        return $coords;
+    }
+
+    /**
+     * Download an RDF document and parse it into an EasyRdf graph.
+     *
+     * Guzzle transfer errors and EasyRdf errors are reported on the console
+     * and in the application log, then turned into a null result.
+     *
+     * @param string $url address of the document, also given to the graph as its URI
+     * @param string $type format name handed to the EasyRdf graph constructor
+     * @return \EasyRdf\Graph|null the parsed graph, or null when loading or parsing failed
+     */
+    private function loadGraph($url, $type) {
+        try {
+            $r = $this->httpClient->get($url);
+        } catch (TransferException $e) {
+            $this->error("loadGraph : Error Loading $url");
+            Log::error("loadGraph : Error Loading $url");
+            // getRequest()/hasResponse() are declared on RequestException,
+            // not on the TransferException base class caught here.
+            if ($e instanceof \GuzzleHttp\Exception\RequestException) {
+                Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
+                if ($e->hasResponse()) {
+                    $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                    Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
+                }
+            }
+            return null;
+        }
+
+        // Read the body before parsing so that it is always available for the error log.
+        $message = (string)$r->getBody();
+        try {
+            return new Graph($url, $message, $type);
+        // The leading backslash matters: a relative "EasyRdf\Exception" would be
+        // resolved inside the current namespace and never match.
+        } catch (\EasyRdf\Exception $e) {
+            $this->error("loadGraph : Error parsing $url");
+            Log::error("loadGraph : Error parsing $url");
+            if($e instanceof \EasyRdf\Parser\Exception) {
+                // Position inside the parsed document, not the PHP source line of the throw.
+                Log::error("loadGraph : Error exception line ".$e->getParserLine().", column: ".$e->getParserColumn());
+            }
+            $this->error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error exception message ".$e->getMessage());
+            Log::error("loadGraph : Error content $message");
+            return null;
+        }
+
+    }
+
+    /**
+     * Resolve the coordinates of a Geonames resource, going through the cache.
+     *
+     * The outcome of a lookup is stored with an expiry date 20 minutes ahead;
+     * a failed lookup is stored as false so that it can be told apart from a
+     * cache miss.
+     *
+     * @param string $loc base URL of the Geonames resource ("about.rdf" is appended to it)
+     * @return array|null the value given by graphResolvCoordinate(), or null when unresolved
+     */
+    private function geonamesResolveCoordinates($loc) {
+        $cacheKey = "corpus.geonames.coord.$loc";
+        $coords = cache($cacheKey);
+
+        if(is_null($coords)) {
+            // Cache miss: fetch the RDF description and read the coordinates from it.
+            $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
+            $coords = null;
+            if(!is_null($graph)) {
+                $coords = $this->graphResolvCoordinate($loc, $graph);
+            }
+            $expiresAt = Carbon::now()->addMinutes(20);
+            cache([$cacheKey => is_null($coords)?false:$coords], $expiresAt);
+        }
+
+        if($coords === false) {
+            return null;
+        }
+        return $coords;
+    }
+
+    /**
+     * Resolve the coordinates of a DBpedia resource, going through the cache.
+     *
+     * The outcome of a lookup is stored with an expiry date 20 minutes ahead;
+     * a failed lookup is stored as false so that it can be told apart from a
+     * cache miss.
+     *
+     * @param string $loc URL of the DBpedia resource (".rdf" is appended to it)
+     * @return array|null the value given by graphResolvCoordinate(), or null when unresolved
+     */
+    private function dbpediaResolveCoordinates($loc) {
+        $cacheKey = "corpus.dbpedia.coord.$loc";
+        $coords = cache($cacheKey);
+
+        if(is_null($coords)) {
+            // Cache miss: fetch the RDF description and read the coordinates from it.
+            $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
+            $coords = null;
+            if(!is_null($graph)) {
+                $coords = $this->graphResolvCoordinate($loc, $graph);
+            }
+            $expiresAt = Carbon::now()->addMinutes(20);
+            cache([$cacheKey => is_null($coords)?false:$coords], $expiresAt);
+        }
+
+        if($coords === false) {
+            return null;
+        }
+        return $coords;
+    }
+
+    /**
+     * Compute the geographic point indexed for a document.
+     *
+     * The location references of the document geo info that match the
+     * configured Geonames or DBpedia patterns are resolved one after the
+     * other, and the first one that yields coordinates wins. When none does,
+     * the latitude/longitude values of the geo info itself are used.
+     *
+     * @param mixed $doc document exposing getGeoInfo()
+     * @return array|null [latitude, longitude] as floats, or null when the
+     *                    document has no geo info or no usable coordinates
+     */
+    private function getLocation($doc) {
+
+        $geoRes = $doc->getGeoInfo();
+
+        if(is_null($geoRes)) {
+            return null;
+        }
+
+        // Group the resolvable URLs by source type ('geonames', 'dbpedia').
+        $locUrls = [];
+        foreach($geoRes->getRefLocs() as $loc) {
+            if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
+                $locUrls['geonames'][] = "http://sws.geonames.org/$m[1]/";
+            } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
+                $locUrls['dbpedia'][] = "http://$md[1]/data/$md[4]";
+            }
+        }
+
+        $coordinates = null;
+        foreach($locUrls as $locType => $locList) {
+            foreach($locList as $locationUrl) {
+                // Dispatches to geonamesResolveCoordinates() or dbpediaResolveCoordinates().
+                $coordinates = call_user_func([$this, "{$locType}ResolveCoordinates"], $locationUrl);
+                if(!is_null($coordinates)) {
+                    // Leave both loops: a plain break would let the next source
+                    // type overwrite the coordinates just found, possibly with null.
+                    break 2;
+                }
+            }
+        }
+
+        if(is_null($coordinates)) {
+            $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
+        }
+
+        $point = [];
+        foreach([0, 1] as $i) {
+            $value = isset($coordinates[$i])?$coordinates[$i]:null;
+            // No empty() here: it would also reject 0, which is a valid
+            // latitude (equator) or longitude (Greenwich meridian).
+            if(in_array($value, [null, false, ''], true)) {
+                return null;
+            }
+            $point[] = floatval($value);
+        }
+
+        // NOTE(review): the 'location' field is mapped as geo_point, and Elasticsearch
+        // reads a geo_point given as an array as [lon, lat], while this is
+        // [latitude, longitude]. Confirm against the consumers before changing.
+        return $point;
+
+    }
+
+    /**
+     * Compute the creation date(s) indexed for a document.
+     *
+     * The datatype of the "created" literal selects the conversion: a
+     * dcterms Period goes through processPeriod(), a dcterms W3CDTF value
+     * through processDate(). Any other datatype is ignored.
+     *
+     * @param mixed $doc document exposing getCreated()
+     * @return array|string|null result of the selected conversion, or null
+     *                           when there is no creation date or its datatype is not handled
+     */
+    private function getCreationDate($doc) {
+
+        $created = $doc->getCreated();
+        if(is_null($created)) {
+            return null;
+        }
+
+        $datatypeUri = $created->getDatatypeUri();
+
+        if($datatypeUri === "http://purl.org/dc/terms/Period") {
+            return $this->processPeriod($created->getValue());
+        }
+        if($datatypeUri === "http://purl.org/dc/terms/W3CDTF") {
+            return $this->processDate($created->getValue());
+        }
+
+        return null;
+
+    }
+
+    /**
+     * Parse a date string into a DateTime object.
+     *
+     * A bare four digit year is completed to January 1st of that year before
+     * being parsed.
+     *
+     * @param string $dateStr date to parse
+     * @return \DateTime|null the parsed date, or null (with a warning in the
+     *                        log) when the string is blank or cannot be parsed
+     */
+    private function extractDate($dateStr) {
+        // date_create('') does not fail, it returns the current date: a blank
+        // value must be rejected before it gets there.
+        if(trim((string)$dateStr) === '') {
+            Log::warning("IndexDocuments:extractDate empty date");
+            return null;
+        }
+        if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
+            $dateStr = "$dateStr-1-1";
+        }
+        $date = date_create($dateStr);
+        if($date === false ) {
+            Log::warning("IndexDocuments:extractDate bad format for date $dateStr");
+            return null;
+        }
+        return $date;
+    }
+
+    /**
+     * Expand a period string into one date per year it covers.
+     *
+     * The string is a list of ";" separated components; only the "start="
+     * and "end=" ones are read, and only the year of each is kept.
+     *
+     * @param string $periodStr period to expand, e.g. "start=1970; end=1972"
+     * @return array|null one W3C formatted date (January 1st, midnight) per
+     *                    year from start to end included, or null when a bound
+     *                    is missing or unparsable, or when start is after end
+     */
+    private function processPeriod($periodStr) {
+        $start = null;
+        $end = null;
+        foreach(explode(";", $periodStr) as $elem) {
+            $elem = trim($elem);
+            if(strpos($elem, 'start=') === 0) {
+                $startDate = $this->extractDate(trim(substr($elem, 6)));
+                if(is_null($startDate)) {
+                    return null;
+                }
+                $start = intval($startDate->format("Y"));
+            } elseif(strpos($elem, 'end=') === 0) {
+                $endDate = $this->extractDate(trim(substr($elem, 4)));
+                if(is_null($endDate)) {
+                    return null;
+                }
+                $end = intval($endDate->format("Y"));
+            }
+        }
+
+        if(is_null($start) || is_null($end) || $start>$end ) {
+            Log::warning("Bad format for $periodStr");
+            return null;
+        }
+
+        return array_map(function($y) {
+            // The leading "!" resets the fields missing from the format; without
+            // it they are taken from the current time and the indexed dates
+            // would change from one run to the next.
+            return \DateTime::createFromFormat("!Y", "$y")->format(\DateTime::W3C);
+        }, range($start, $end));
+    }
+
+    /**
+     * Convert a date string to its W3C representation.
+     *
+     * @param string $dateStr date to convert
+     * @return string|null the W3C formatted date, or null when extractDate() rejects the string
+     */
+    private function processDate($dateStr) {
+        $parsed = $this->extractDate($dateStr);
+        return is_null($parsed)?null:$parsed->format(\DateTime::W3C);
+    }
+
+    /**
+     * List the discourse types of a document as plain strings.
+     *
+     * A resource gives its URI; a literal gives its value, prefixed with
+     * "<datatype>#" when it has a datatype. Entries of any other kind, and
+     * entries producing an empty string, are left out.
+     *
+     * @param mixed $doc document exposing getDiscourseTypes()
+     * @return array list of strings
+     */
+    private function getDiscourseTypes($doc) {
+        $labels = array_map(function($type) {
+            if($type instanceof Resource) {
+                return $type->getUri();
+            }
+            if($type instanceof Literal) {
+                $datatype = $type->getDatatypeURI();
+                $prefix = empty($datatype)?"":"$datatype#";
+                return $prefix.$type->getValue();
+            }
+            return null;
+        }, $doc->getDiscourseTypes());
+
+        // Without a callback array_filter() drops the empty entries;
+        // array_values() then restores consecutive keys.
+        return array_values(array_filter($labels));
+    }
+
+    /**
+     * Build the body sent to Elasticsearch for a document.
+     *
+     * @param mixed $doc document to index
+     * @return array field name => value
+     */
+    private function getDocBody($doc) {
+        $body = [];
+
+        $body['title'] = (string)$doc->getTitle();
+        $body['date'] = (string)$doc->getModified();
+        $body['location'] = $this->getLocation($doc);
+        $body['creation_date'] = $this->getCreationDate($doc);
+        $body['language'] = $doc->getLanguageValue();
+        $body['discourse_types'] = $this->getDiscourseTypes($doc);
+        $body['geonames_hierarchy'] = $this->getGeonamesHierarchy($doc);
+        $body['subject'] = $this->getSubjects($doc);
+
+        return $body;
+    }
+
     /**
      * Index one document into Elasticsearch
      *
@@ -260,12 +492,7 @@
             'index' => config('elasticsearch.index'),
             'type' => 'document',
             'id' => (string)$doc->getId(),
-            'body' => [
-                'title' => (string)$doc->getTitle(),
-                'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                'subject' => $this->getSubjects($doc)
-            ]
+            'body' => $this->getDocBody($doc)
         ];
         Es::index($query_data);
     }
@@ -287,12 +514,7 @@
                       '_id' => (string)$doc->getId()
                   ]
               ];
-              $query_data['body'][] = [
-                  'title' => (string)$doc->getTitle(),
-                  'date' => (string)$doc->getModified(),
-                  'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
-                  'subject' => $this->getSubjects($doc)
-              ];
+              $query_data['body'][] = $this->getDocBody($doc);
           }
           Es::bulk($query_data);
      }
@@ -367,16 +589,16 @@
                     if ($page==$lastPage && $i>=$lastPageEntryCount){
                         break;
                     }
-                    $this->indexOne($doc);
                     $progressBar->setMessage($doc->getId());
                     $progressBar->advance();
+                    $this->indexOne($doc);
                 }
             }
             else
             {
-                $this->indexBulk($docs);
                 $progressBar->setMessage('Page '.$page);
                 $progressBar->advance();
+                $this->indexBulk($docs);
             }
         }
         $progressBar->finish();