change datestats to use elasticsearch
author ymh <ymh.work@gmail.com>
Thu, 20 Oct 2016 11:24:45 +0200
changeset 375 145561ff51ff
parent 374 c622fa18eb32
child 376 02f113d43f18
change datestats to use elasticsearch
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Http/Controllers/Api/DateStatsController.php
server/src/tests/Controllers/DateStatsControllerTest.php
--- a/server/src/app/Console/Commands/IndexDocuments.php	Thu Oct 20 12:56:24 2016 +0530
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Thu Oct 20 11:24:45 2016 +0200
@@ -109,6 +109,13 @@
                         'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'],
                         'language' => ['type' => 'string', 'index' => 'not_analyzed'],
                         'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'],
+                        'creation_years' => [
+                            'type' => 'nested',
+                            'properties' => [
+                                'year' => [ 'type' => 'short', 'index' => 'not_analyzed'],
+                                'weight' => [ 'type' => 'float', 'index' => 'not_analyzed'],
+                            ]
+                        ] ,
                         'subject' => [
                             'type' => 'nested',
                             'properties' => [
@@ -405,7 +412,7 @@
         return $date;
     }
 
-    private function processPeriod($periodStr) {
+    private function processPeriod($periodStr, $asDate=false) {
         $start = null;
         $end = null;
         foreach(explode(";", $periodStr) as $elem) {
@@ -436,18 +443,59 @@
             return null;
         }
 
-        return array_map(function($y) {
-            return \DateTime::createFromFormat("Y", "$y")->format(\DateTime::W3C);
+        return array_map(function($y) use ($asDate){
+            $date = \DateTime::createFromFormat("Y", "$y");
+            if($asDate) {
+                return $date;
+            } else {
+                return $date->format(\DateTime::W3C);
+            }
+
         }, range($start, $end));
     }
 
-    private function processDate($dateStr) {
+    private function processDate($dateStr, $asDate=false) {
         $date = $this->extractDate($dateStr);
         if(is_null($date))  {
             return null;
         } else {
-            return $date->format(\DateTime::W3C);
+            if($asDate) {
+                return $date;
+            } else {
+                return $date->format(\DateTime::W3C);
+            }
+
+        }
+    }
+
+    private function getCreationYears($doc) {
+        $created = $doc->getCreated();
+        if(is_null($created)) {
+            return [];
         }
+        $dateType = $created->getDatatypeUri();
+        $dates = null;
+
+        if($dateType === "http://purl.org/dc/terms/Period") {
+            $dates = $this->processPeriod($created->getValue(), true);
+        }
+        elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
+            $dates = $this->processDate($created->getValue(), true);
+            if(!is_null($dates)) {
+                $dates = [ $dates, ];
+            }
+        }
+        if(is_null($dates)) {
+            return [];
+        }
+        $count = count($dates);
+        return array_map(function($d) use ($count) {
+            return [
+                'year' => intval($d->format("Y")),
+                'weight' => 1/$count
+            ];
+
+        }, $dates);
     }
 
     private function getDiscourseTypes($doc) {
@@ -472,6 +520,7 @@
             'date' => (string)$doc->getModified(),
             'location' => $this->getLocation($doc),
             'creation_date' => $this->getCreationDate($doc),
+            'creation_years' => $this->getCreationYears($doc),
             'language' => $doc->getLanguagesValue(),
             'discourse_types' => $this->getDiscourseTypes($doc),
             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
--- a/server/src/app/Http/Controllers/Api/DateStatsController.php	Thu Oct 20 12:56:24 2016 +0530
+++ b/server/src/app/Http/Controllers/Api/DateStatsController.php	Thu Oct 20 11:24:45 2016 +0200
@@ -5,6 +5,7 @@
 // use CorpusParole\Http\Requests;
 use Illuminate\Http\Request;
 use Log;
+use Es;
 
 use CorpusParole\Libraries\Sparql\SparqlClient;
 
@@ -26,101 +27,56 @@
      */
     public function index(Request $request)
     {
-        $query =  preg_replace('/\s+/', ' ', "SELECT (?d as ?date) (COUNT(?d) AS ?count)
-            WHERE {
-                ?_ a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.
-                ?_ <http://purl.org/dc/terms/created> ?d
-            }
-            GROUP BY ?d
-            ORDER BY ?d");
 
-        $res = $this->sparqlClient->query($query);
-
-        $dates = [];
-
-        foreach ($res as $row) {
+        // No filtering: the statistics are computed over every indexed
+        // document that carries nested creation_years entries.
+        $query = [ "match_all" => []];
 
-            $count = intval($row->count->getValue());
-            $date = $row->date;
-            $dateType = $date->getDatatypeUri();
+        // Bucket the nested creation_years entries by year (ascending) and
+        // sum their weights; a period spread over N years contributes 1/N each.
+        $esQuery = [
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'body' => [
+                "size" => 0,
+                "query" => $query,
+                "aggs" => [
+                    "datestats" => [
+                        "nested"=> [
+                            "path" => "creation_years"
+                        ],
+                        "aggs" => [
+                            "years" => [
+                                "terms"=> [
+                                    "field" => "creation_years.year",
+                                    "size" => 0,
+                                    "order" => [
+                                        "_term" => "asc"
+                                    ]
+                                ],
+                                "aggs" => [
+                                    "year_count" => [
+                                        "sum" => [
+                                            "field" => "creation_years.weight"
+                                        ]
+                                    ]
+                                ]
+                            ]
+                        ]
+                    ]
+                ]
+            ]
+        ];
+        $esRes = Es::search($esQuery);
 
-            $processedDates = [];
-            if($dateType === "http://purl.org/dc/terms/Period") {
-                $processedDates = $this->processPeriod($date->getValue(), $count);
-            }
-            elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
-                $processedDates = $this->processDate($date->getValue(), $count);
-            }
+        $datestats = [];
 
-            $dates = array_reduce(array_keys($processedDates), function($datesArray, $item) use ($processedDates) {
-                if(!isset($datesArray[$item])) {
-                    $datesArray[$item] = 0;
-                }
-                $datesArray[$item] += $processedDates[$item];
-                return $datesArray;
-            }, $dates);
+        // round() returns a float; cast to int so the JSON payload carries
+        // integer counts (3, not 3.0) whatever the PHP serialize_precision is.
+        foreach($esRes['aggregations']['datestats']['years']['buckets'] as $bucket) {
+            $datestats[(string)($bucket['key'])] = (int) round($bucket['year_count']['value']);
         }
 
-        ksort($dates);
-
-        return response()->json(['datestats' => $dates ]);
+        return response()->json(['datestats' => $datestats ]);
     }
 
-    private function extractYear($dateStr) {
-        if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
-            $dateStr = "$dateStr-1-1";
-        }
-        $date = date_create($dateStr);
-        if($date === false ) {
-            Log::warning("DateStatsController:extractYear bad format for date $dateStr");
-        }
-        return $date?$date->format("Y"):false;
-    }
-
-    private function processPeriod($periodStr, $count) {
-        $start = null;
-        $end = null;
-        foreach(explode(";", $periodStr) as $elem) {
-            $elem = trim($elem);
-            if(strpos($elem, 'start=') === 0) {
-                $start = intval($this->extractYear(trim(substr($elem, 6))));
-                if($start === false) {
-                    return [];
-                }
-            } elseif(strpos($elem, 'end=') === 0) {
-                $end = intval($this->extractYear(trim(substr($elem, 4))));
-                if($end === false) {
-                    return [];
-                }
-            }
-        }
-
-        if(is_null($start) || is_null($end) || $start>$end ) {
-            // TODO: log problem
-            return [];
-        }
-
-        $res = [];
-        $mean = (int)($count/($end+1-$start));
-        $remains = $count%($end+1-$start);
-        for($d=$start; $d<=$end; $d++) {
-            $nb = $mean + ((($remains--)>0)?1:0);
-            if($nb !== 0) {
-                $res[strval($d)] = $nb;
-            }
-        }
-
-        return $res;
-    }
-
-    private function processDate($dateStr, $count) {
-        $date = $this->extractYear($dateStr);
-        if($date === false)  {
-            return [];
-        } else {
-            return [ $this->extractYear($dateStr) => $count ];
-        }
-    }
-
-
 }
--- a/server/src/tests/Controllers/DateStatsControllerTest.php	Thu Oct 20 12:56:24 2016 +0530
+++ b/server/src/tests/Controllers/DateStatsControllerTest.php	Thu Oct 20 11:24:45 2016 +0200
@@ -2,20 +2,50 @@
 
 use Mockery as m;
 
+use Es;
+
 use EasyRdf\Literal;
 
 class DateStatsControllerTest extends TestCase
 {
     private $sparqlClient;
 
+    const ES_QUERY = [
+        'index' => 'corpus',
+        'body' => [
+            "size" => 0,
+            "query" => [ "match_all" => []],
+            "aggs" => [
+                "datestats" => [
+                    "nested"=> [
+                        "path" => "creation_years"
+                    ],
+                    "aggs" => [
+                        "years" => [
+                            "terms"=> [
+                                "field" => "creation_years.year",
+                                "size" => 0,
+                                "order" => [
+                                    "_term" => "asc"
+                                ]
+                            ],
+                            "aggs" => [
+                                "year_count" => [
+                                    "sum" => [
+                                        "field" => "creation_years.weight"
+                                    ]
+                                ]
+                            ]
+                        ]
+                    ]
+                ]
+            ]
+        ]
+    ];
+
     public function setUp() {
 
         parent::setup();
-
-        // create a mock of the post repository interface and inject it into the
-        // IoC container
-        $this->sparqlClient = m::mock('CorpusParole\Libraries\Sparql\SparqlClient');
-        $this->app->instance('CorpusParole\Libraries\Sparql\SparqlClient', $this->sparqlClient);
     }
 
     public function tearDown() {
@@ -26,243 +56,99 @@
 
     public function testIndexQuery() {
 
-        $query =  preg_replace('/\s+/', ' ', "SELECT (?d as ?date) (COUNT(?d) AS ?count)
-            WHERE {
-                ?_ a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.
-                ?_ <http://purl.org/dc/terms/created> ?d
-            }
-            GROUP BY ?d
-            ORDER BY ?d");
-
-
-        $this->sparqlClient
-            ->shouldReceive('query')
-            ->with($query)
-            ->once()
-            ->andReturn(new \ArrayIterator([]));
-        $this->get('/api/v1/stats/datestats/');
-        $this->seeJsonEquals(["datestats" => []]);
-    }
-
-    public function testIndexMultiple() {
-
-         $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('1965', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)],
-                 (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(3)],
-             ]));
-         $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-         $this->seeJsonEquals(["datestats" => [
-             "1955" => 3,
-             "1965" => 2,
-             "1975" => 1,
-         ]]);
-    }
-
-    public function testIndexSimple() {
-
-         $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('1965', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-             ]));
-         $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-         $this->seeJsonEquals(["datestats" => [
-             "1955" => 1,
-             "1965" => 1,
-             "1975" => 1,
-         ]]);
-    }
-
-    public function testIndexPeriod() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-            "1955" => 1,
-            "1956" => 1,
-            "1957" => 1,
-            "1958" => 1,
-            "1959" => 1,
-            "1960" => 1,
-            "1961" => 1,
-            "1962" => 1,
-            "1963" => 1,
-            "1964" => 1,
-            "1965" => 1,
-        ]]);
-    }
-
-    public function testIndexPeriodRemainMore() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(15)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-            "1955" => 2,
-            "1956" => 2,
-            "1957" => 2,
-            "1958" => 2,
-            "1959" => 1,
-            "1960" => 1,
-            "1961" => 1,
-            "1962" => 1,
-            "1963" => 1,
-            "1964" => 1,
-            "1965" => 1,
-        ]]);
+        Es::shouldReceive('search')
+                ->once()
+                ->with(self::ES_QUERY)
+                ->andReturn(json_decode('{
+                    "took" : 132,
+                    "timed_out" : false,
+                    "_shards" : {
+                        "total" : 1,
+                        "successful" : 1,
+                        "failed" : 0
+                    },
+                    "hits" : {
+                        "total" : 3373,
+                        "max_score" : 0.0,
+                        "hits" : [ ]
+                    },
+                    "aggregations" : {
+                        "datestats" : {
+                            "doc_count" : 3725,
+                            "years" : {
+                                "doc_count_error_upper_bound" : 0,
+                                "sum_other_doc_count" : 0,
+                                "buckets" : []
+                            }
+                        }
+                    }
+                }', true));
+            $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
+            $this->seeJsonEquals(["datestats" => []]);
     }
 
-    public function testIndexPeriodRemainLess() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(10)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-            "1955" => 1,
-            "1956" => 1,
-            "1957" => 1,
-            "1958" => 1,
-            "1959" => 1,
-            "1960" => 1,
-            "1961" => 1,
-            "1962" => 1,
-            "1963" => 1,
-            "1964" => 1,
-        ]]);
-    }
-
-    public function testIndexMix() {
+    public function testIndexResult() {
 
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)],
-                 (object)['date'=>new Literal('1960', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-            "1955" => 1,
-            "1956" => 1,
-            "1957" => 1,
-            "1958" => 1,
-            "1959" => 1,
-            "1960" => 3,
-            "1961" => 1,
-            "1962" => 1,
-            "1963" => 1,
-            "1964" => 1,
-            "1965" => 1,
-        ]]);
-    }
-
-    public function testIndexBadDate() {
-
-         $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('HELLO', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-             ]));
-         $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-         $this->seeJsonEquals(["datestats" => [
-             "1955" => 1,
-             "1975" => 1,
-         ]]);
+        Es::shouldReceive('search')
+                ->once()
+                ->with(self::ES_QUERY)
+                ->andReturn(json_decode('{
+                    "took" : 132,
+                    "timed_out" : false,
+                    "_shards" : {
+                        "total" : 1,
+                        "successful" : 1,
+                        "failed" : 0
+                    },
+                    "hits" : {
+                        "total" : 3373,
+                        "max_score" : 0.0,
+                        "hits" : [ ]
+                    },
+                    "aggregations" : {
+                        "datestats" : {
+                            "doc_count" : 3725,
+                            "years" : {
+                                "doc_count_error_upper_bound" : 0,
+                                "sum_other_doc_count" : 0,
+                                "buckets" : [ {
+                                    "key" : 1948,
+                                    "doc_count" : 3,
+                                    "year_count" : { "value" : 3.0 }
+                                }, {
+                                    "key" : 1957,
+                                    "doc_count" : 29,
+                                    "year_count" : { "value" : 29.0 }
+                                }, {
+                                    "key" : 1963,
+                                    "doc_count" : 22,
+                                    "year_count" : { "value" : 21.5 }
+                                },  {
+                                    "key" : 1970,
+                                    "doc_count" : 411,
+                                    "year_count" : { "value" : 403.68333334475756 }
+                                },  {
+                                    "key" : 1986,
+                                    "doc_count" : 68,
+                                    "year_count" : { "value" : 14.133333388715982 }
+                                }, {
+                                    "key" : 1996,
+                                    "doc_count" : 40,
+                                    "year_count" : { "value" : 36.05000001564622 }
+                                } ]
+                            }
+                        }
+                    }
+                }', true));
+            $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
+            $this->seeJsonEquals(["datestats" => [
+                "1948" => 3,
+                "1957" => 29,
+                "1963" => 22,
+                "1970" => 404,
+                "1986" => 14,
+                "1996" => 36
+            ]]);
     }
 
-    public function testIndexBadPeriod() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=FOO', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-        ]]);
-    }
-
-    public function testIndexBadPeriodMissing() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-        ]]);
-    }
-
-    public function testIndexFullPeriod() {
-
-        $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('start=1955; end=1965; scheme=v3; name=v4;', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)],
-             ]));
-        $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-        $this->seeJsonEquals(["datestats" => [
-            "1955" => 1,
-            "1956" => 1,
-            "1957" => 1,
-            "1958" => 1,
-            "1959" => 1,
-            "1960" => 1,
-            "1961" => 1,
-            "1962" => 1,
-            "1963" => 1,
-            "1964" => 1,
-            "1965" => 1,
-        ]]);
-    }
-
-    public function testIndexMultipleFormat() {
-
-         $this->sparqlClient
-             ->shouldReceive('query')
-             ->once()
-             ->andReturn(new \ArrayIterator([
-                 (object)['date'=>new Literal('1975-02-05', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)],
-                 (object)['date'=>new Literal('1965-03', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)],
-                 (object)['date'=>new Literal('1955-02-12T08:30:00+00:00', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(3)],
-                 (object)['date'=>new Literal('1950-08-18T08:30:00Z', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(4)],
-             ]));
-         $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content());
-         $this->seeJsonEquals(["datestats" => [
-             "1950" => 4,
-             "1955" => 3,
-             "1965" => 2,
-             "1975" => 1,
-         ]]);
-    }
-
-
-
 }