# HG changeset patch # User ymh # Date 1476955485 -7200 # Node ID 145561ff51ff040b6e4a86b4e971cbfa0a5ef745 # Parent c622fa18eb32278392ed4618baa8eb91024977cd change datestats to use elasticsearch diff -r c622fa18eb32 -r 145561ff51ff server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Thu Oct 20 12:56:24 2016 +0530 +++ b/server/src/app/Console/Commands/IndexDocuments.php Thu Oct 20 11:24:45 2016 +0200 @@ -109,6 +109,13 @@ 'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'], 'language' => ['type' => 'string', 'index' => 'not_analyzed'], 'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'], + 'creation_years' => [ + 'type' => 'nested', + 'properties' => [ + 'year' => [ 'type' => 'short', 'index' => 'not_analyzed'], + 'weight' => [ 'type' => 'float', 'index' => 'not_analyzed'], + ] + ] , 'subject' => [ 'type' => 'nested', 'properties' => [ @@ -405,7 +412,7 @@ return $date; } - private function processPeriod($periodStr) { + private function processPeriod($periodStr, $asDate=false) { $start = null; $end = null; foreach(explode(";", $periodStr) as $elem) { @@ -436,18 +443,59 @@ return null; } - return array_map(function($y) { - return \DateTime::createFromFormat("Y", "$y")->format(\DateTime::W3C); + return array_map(function($y) use ($asDate){ + $date = \DateTime::createFromFormat("Y", "$y"); + if($asDate) { + return $date; + } else { + return $date->format(\DateTime::W3C); + } + }, range($start, $end)); } - private function processDate($dateStr) { + private function processDate($dateStr, $asDate=false) { $date = $this->extractDate($dateStr); if(is_null($date)) { return null; } else { - return $date->format(\DateTime::W3C); + if($asDate) { + return $date; + } else { + return $date->format(\DateTime::W3C); + } + + } + } + + private function getCreationYears($doc) { + $created = $doc->getCreated(); + if(is_null($created)) { + return []; } + $dateType = $created->getDatatypeUri(); + $dates = null; + + if($dateType === "http://purl.org/dc/terms/Period") { + $dates = $this->processPeriod($created->getValue(), true); + } + elseif($dateType === "http://purl.org/dc/terms/W3CDTF") { + $dates = $this->processDate($created->getValue(), true); + if(!is_null($dates)) { + $dates = [ $dates, ]; + } + } + if(is_null($dates)) { + return []; + } + $count = count($dates); + return array_map(function($d) use ($count) { + return [ + 'year' => intval($d->format("Y")), + 'weight' => 1/$count + ]; + + }, $dates); } private function getDiscourseTypes($doc) { @@ -472,6 +520,7 @@ 'date' => (string)$doc->getModified(), 'location' => $this->getLocation($doc), 'creation_date' => $this->getCreationDate($doc), + 'creation_years' => $this->getCreationYears($doc), 'language' => $doc->getLanguagesValue(), 'discourse_types' => $this->getDiscourseTypes($doc), 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), diff -r c622fa18eb32 -r 145561ff51ff server/src/app/Http/Controllers/Api/DateStatsController.php --- a/server/src/app/Http/Controllers/Api/DateStatsController.php Thu Oct 20 12:56:24 2016 +0530 +++ b/server/src/app/Http/Controllers/Api/DateStatsController.php Thu Oct 20 11:24:45 2016 +0200 @@ -5,6 +5,7 @@ // use CorpusParole\Http\Requests; use Illuminate\Http\Request; use Log; +use Es; use CorpusParole\Libraries\Sparql\SparqlClient; @@ -26,101 +27,50 @@ */ public function index(Request $request) { - $query = preg_replace('/\s+/', ' ', "SELECT (?d as ?date) (COUNT(?d) AS ?count) - WHERE { - ?_ a . - ?_ ?d - } - GROUP BY ?d - ORDER BY ?d"); - $res = $this->sparqlClient->query($query); - - $dates = []; - - foreach ($res as $row) { + $query = [ "match_all" => []]; - $count = intval($row->count->getValue()); - $date = $row->date; - $dateType = $date->getDatatypeUri(); + $esQuery = [ + 'index' => env('ELASTICSEARCH_INDEX'), + 'body' => [ + "size" => 0, + "query" => $query, + "aggs" => [ + "datestats" => [ + "nested"=> [ + "path" => "creation_years" + ], + "aggs" => [ + "years" => [ + "terms"=> [ + "field" => "creation_years.year", + "size" => 0, + "order" => [ + "_term" => "asc" + ] + ], + "aggs" => [ + "year_count" => [ + "sum" => [ + "field" => "creation_years.weight" + ] + ] + ] + ] + ] + ] + ] + ] + ]; + $esRes = Es::search($esQuery); - $processedDates = []; - if($dateType === "http://purl.org/dc/terms/Period") { - $processedDates = $this->processPeriod($date->getValue(), $count); - } - elseif($dateType === "http://purl.org/dc/terms/W3CDTF") { - $processedDates = $this->processDate($date->getValue(), $count); - } + $datestats = []; - $dates = array_reduce(array_keys($processedDates), function($datesArray, $item) use ($processedDates) { - if(!isset($datesArray[$item])) { - $datesArray[$item] = 0; - } - $datesArray[$item] += $processedDates[$item]; - return $datesArray; - }, $dates); + foreach($esRes['aggregations']['datestats']['years']['buckets'] as $bucket) { + $datestats[(string)($bucket['key'])] = round($bucket['year_count']['value']); } - ksort($dates); - - return response()->json(['datestats' => $dates ]); + return response()->json(['datestats' => $datestats ]); } - private function extractYear($dateStr) { - if(preg_match("/^\\d{4}$/", $dateStr) === 1) { - $dateStr = "$dateStr-1-1"; - } - $date = date_create($dateStr); - if($date === false ) { - Log::warning("DateStatsController:extractYear bad format for date $dateStr"); - } - return $date?$date->format("Y"):false; - } - - private function processPeriod($periodStr, $count) { - $start = null; - $end = null; - foreach(explode(";", $periodStr) as $elem) { - $elem = trim($elem); - if(strpos($elem, 'start=') === 0) { - $start = intval($this->extractYear(trim(substr($elem, 6)))); - if($start === false) { - return []; - } - } elseif(strpos($elem, 'end=') === 0) { - $end = intval($this->extractYear(trim(substr($elem, 4)))); - if($end === false) { - return []; - } - } - } - - if(is_null($start) || is_null($end) || $start>$end ) { - // TODO: log problem - return []; - } - - $res = []; - $mean = (int)($count/($end+1-$start)); - $remains = $count%($end+1-$start); - for($d=$start; $d<=$end; $d++) { - $nb = $mean + ((($remains--)>0)?1:0); - if($nb !== 0) { - $res[strval($d)] = $nb; - } - } - - return $res; - } - - private function processDate($dateStr, $count) { - $date = $this->extractYear($dateStr); - if($date === false) { - return []; - } else { - return [ $this->extractYear($dateStr) => $count ]; - } - } - - } diff -r c622fa18eb32 -r 145561ff51ff server/src/tests/Controllers/DateStatsControllerTest.php --- a/server/src/tests/Controllers/DateStatsControllerTest.php Thu Oct 20 12:56:24 2016 +0530 +++ b/server/src/tests/Controllers/DateStatsControllerTest.php Thu Oct 20 11:24:45 2016 +0200 @@ -2,20 +2,50 @@ use Mockery as m; +use Es; + use EasyRdf\Literal; class DateStatsControllerTest extends TestCase { private $sparqlClient; + const ES_QUERY = [ + 'index' => 'corpus', + 'body' => [ + "size" => 0, + "query" => [ "match_all" => []], + "aggs" => [ + "datestats" => [ + "nested"=> [ + "path" => "creation_years" + ], + "aggs" => [ + "years" => [ + "terms"=> [ + "field" => "creation_years.year", + "size" => 0, + "order" => [ + "_term" => "asc" + ] + ], + "aggs" => [ + "year_count" => [ + "sum" => [ + "field" => "creation_years.weight" + ] + ] + ] + ] + ] + ] + ] + ] + ]; + public function setUp() { parent::setup(); - - // create a mock of the post repository interface and inject it into the - // IoC container - $this->sparqlClient = m::mock('CorpusParole\Libraries\Sparql\SparqlClient'); - $this->app->instance('CorpusParole\Libraries\Sparql\SparqlClient', $this->sparqlClient); } public function tearDown() { @@ -26,243 +56,99 @@ public function testIndexQuery() { - $query = preg_replace('/\s+/', ' ', "SELECT (?d as ?date) (COUNT(?d) AS ?count) - WHERE { - ?_ a . - ?_ ?d - } - GROUP BY ?d - ORDER BY ?d"); - - - $this->sparqlClient - ->shouldReceive('query') - ->with($query) - ->once() - ->andReturn(new \ArrayIterator([])); - $this->get('/api/v1/stats/datestats/'); - $this->seeJsonEquals(["datestats" => []]); - } - - public function testIndexMultiple() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('1965', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)], - (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(3)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 3, - "1965" => 2, - "1975" => 1, - ]]); - } - - public function testIndexSimple() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('1965', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1965" => 1, - "1975" => 1, - ]]); - } - - public function testIndexPeriod() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1956" => 1, - "1957" => 1, - "1958" => 1, - "1959" => 1, - "1960" => 1, - "1961" => 1, - "1962" => 1, - "1963" => 1, - "1964" => 1, - "1965" => 1, - ]]); - } - - public function testIndexPeriodRemainMore() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(15)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 2, - "1956" => 2, - "1957" => 2, - "1958" => 2, - "1959" => 1, - "1960" => 1, - "1961" => 1, - "1962" => 1, - "1963" => 1, - "1964" => 1, - "1965" => 1, - ]]); + Es::shouldReceive('search') + ->once() + ->with(self::ES_QUERY) + ->andReturn(json_decode('{ + "took" : 132, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "hits" : { + "total" : 3373, + "max_score" : 0.0, + "hits" : [ ] + }, + "aggregations" : { + "datestats" : { + "doc_count" : 3725, + "years" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [] + } + } + } + }', true)); + $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); + $this->seeJsonEquals(["datestats" => []]); } - public function testIndexPeriodRemainLess() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(10)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1956" => 1, - "1957" => 1, - "1958" => 1, - "1959" => 1, - "1960" => 1, - "1961" => 1, - "1962" => 1, - "1963" => 1, - "1964" => 1, - ]]); - } - - public function testIndexMix() { + public function testIndexResult() { - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=1965', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)], - (object)['date'=>new Literal('1960', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1956" => 1, - "1957" => 1, - "1958" => 1, - "1959" => 1, - "1960" => 3, - "1961" => 1, - "1962" => 1, - "1963" => 1, - "1964" => 1, - "1965" => 1, - ]]); - } - - public function testIndexBadDate() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('1955', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('HELLO', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('1975', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1975" => 1, - ]]); + Es::shouldReceive('search') + ->once() + ->with(self::ES_QUERY) + ->andReturn(json_decode('{ + "took" : 132, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "failed" : 0 + }, + "hits" : { + "total" : 3373, + "max_score" : 0.0, + "hits" : [ ] + }, + "aggregations" : { + "datestats" : { + "doc_count" : 3725, + "years" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ { + "key" : 1948, + "doc_count" : 3, + "year_count" : { "value" : 3.0 } + }, { + "key" : 1957, + "doc_count" : 29, + "year_count" : { "value" : 29.0 } + }, { + "key" : 1963, + "doc_count" : 22, + "year_count" : { "value" : 21.5 } + }, { + "key" : 1970, + "doc_count" : 411, + "year_count" : { "value" : 403.68333334475756 } + }, { + "key" : 1986, + "doc_count" : 68, + "year_count" : { "value" : 14.133333388715982 } + }, { + "key" : 1996, + "doc_count" : 40, + "year_count" : { "value" : 36.05000001564622 } + } ] + } + } + } + }', true)); + $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); + $this->seeJsonEquals(["datestats" => [ + "1948" => 3, + "1957" => 29, + "1963" => 22, + "1970" => 404, + "1986" => 14, + "1996" => 36 + ]]); } - public function testIndexBadPeriod() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=FOO', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - ]]); - } - - public function testIndexBadPeriodMissing() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - ]]); - } - - public function testIndexFullPeriod() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('start=1955; end=1965; scheme=v3; name=v4;', null, "http://purl.org/dc/terms/Period"), 'count' => Literal::create(11)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1955" => 1, - "1956" => 1, - "1957" => 1, - "1958" => 1, - "1959" => 1, - "1960" => 1, - "1961" => 1, - "1962" => 1, - "1963" => 1, - "1964" => 1, - "1965" => 1, - ]]); - } - - public function testIndexMultipleFormat() { - - $this->sparqlClient - ->shouldReceive('query') - ->once() - ->andReturn(new \ArrayIterator([ - (object)['date'=>new Literal('1975-02-05', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(1)], - (object)['date'=>new Literal('1965-03', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(2)], - (object)['date'=>new Literal('1955-02-12T08:30:00+00:00', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(3)], - (object)['date'=>new Literal('1950-08-18T08:30:00Z', null, "http://purl.org/dc/terms/W3CDTF"), 'count' => Literal::create(4)], - ])); - $this->get('/api/v1/stats/datestats/')->assertTrue($this->response->isOk(), $this->response->content()); - $this->seeJsonEquals(["datestats" => [ - "1950" => 4, - "1955" => 3, - "1965" => 2, - "1975" => 1, - ]]); - } - - - }