reimplement ThemeController using ES requests to be able to sort by label
author ymh <ymh.work@gmail.com>
Wed, 05 Oct 2016 02:31:25 +0200
changeset 321 aefaad270b9b
parent 320 0fce13da58af
child 322 084aae09edf4
reimplement ThemeController using ES requests to be able to sort by label
cms/app-client/app/controllers/application.js
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Exceptions/Handler.php
server/src/app/Http/Controllers/Api/ThemeController.php
server/src/config/corpusparole.php
--- a/cms/app-client/app/controllers/application.js	Tue Oct 04 13:53:56 2016 +0200
+++ b/cms/app-client/app/controllers/application.js	Wed Oct 05 02:31:25 2016 +0200
@@ -69,7 +69,7 @@
             }
         });
         if(!this.arraysEqual(this.get('date').toArray(), intervals)) {
-            this.set('date', intervals);    
+            this.set('date', intervals);
         }
     }),
 
@@ -91,14 +91,14 @@
         return true;
     },
 
-    
+
 
     itemObserver: Ember.observer('player.item', function() {
         var self = this;
         this.store.findRecord('document', this.get('player').get('item'), { reload: true }).then(function(model){
             self.get('player').set('model', model);
             if (self.get('player').get('model').get('transcript')) {
-                self.store.findRecord('transcript', encodeURIComponent(self.get('player').get('item'))).then(function(model) {
+                self.store.findRecord('transcript', self.get('player').get('item')).then(function(model) {
                     self.get('player').set('transcript', model);
                 });
             } else {
@@ -120,7 +120,7 @@
     }),
     noticeModelObserver: Ember.observer('noticeModel', function() {
         if (!this.get('noticeModel')) {
-            this.set('notice', null); 
+            this.set('notice', null);
         }
     }),
 
--- a/server/src/app/Console/Commands/IndexDocuments.php	Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Wed Oct 05 02:31:25 2016 +0200
@@ -3,10 +3,14 @@
 namespace CorpusParole\Console\Commands;
 
 use Illuminate\Console\Command;
+use EasyRdf\Resource;
+
 use GuzzleHttp\Client;
+use CorpusParole\Libraries\Utils;
 use CorpusParole\Repositories\DocumentRepository;
 use CorpusParole\Libraries\CocoonUtils;
 use CorpusParole\Models\GeonamesHierarchy;
+use CorpusParole\Services\BnfResolverInterface;
 use Es;
 
 class IndexDocuments extends Command
@@ -34,9 +38,10 @@
      *
      * @return void
      */
-    public function __construct(DocumentRepository $documentRepository, Client $httpClient)
+    public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver)
     {
         $this->documentRepository = $documentRepository;
+        $this->bnfResolver = $bnfResolver;
         $this->httpClient = $httpClient;
         parent::__construct();
     }
@@ -82,7 +87,15 @@
                         ],
                         'date' => [ 'type' => 'date' ],
                         'geonames_hyerarchy' => [ 'type' => 'string' ],
-                        'location' => [ 'type' => 'geo_point' ]
+                        'location' => [ 'type' => 'geo_point' ],
+                        'subject' => [
+                            'type' => 'nested',
+                            'properties' => [
+                                'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+                                'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+                                'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
+                            ]
+                        ]
                         // TODO: add location information
                     ]
                 ]
@@ -154,6 +167,29 @@
     }
 
     /**
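+     * Get the document's subjects as { 'label': label, 'code': code, 'label_code': "label|code" } objects.
+     * Only BnF subjects (resources whose URI matches config 'corpusparole.bnf_ark_url_regexp') are taken into account.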
+     */
+    private function getSubjects($doc) {
+
+        $sres = array_reduce($doc->getSubjects(), function($res, $s) {
+            $m = [];
+            if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) {
+                array_push($res, [
+                    'uri' => $m[0],
+                    'code' => $m[1]
+                ]);
+            }
+            return $res;
+        }, []);
+
+        $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri'];}, $sres)));
+
+        return array_map(function($so) use ($labels) { return [ 'label' => $labels[$so['uri']], 'code' => $so['code'], 'label_code' =>  $labels[$so['uri']]."|".$so['code'] ]; }, $sres);
+
+    }
+
+    /**
      * Index one document into Elasticsearch
      *
      * @return int (1 if sucess, 0 if error)
@@ -168,7 +204,8 @@
             'body' => [
                 'title' => (string)$doc->getTitle(),
                 'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
+                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+                'subject' => $this->getSubjects($doc)
             ]
         ];
         Es::index($query_data);
@@ -182,7 +219,8 @@
      private function indexBulk($docs)
      {
           $query_data = ['body' => []];
-          foreach($docs as $doc){
+          foreach($docs as $resultDoc){
+              $doc = $this->documentRepository->get($resultDoc->getId());
               $query_data['body'][] = [
                   'index' => [
                       '_index' => config('elasticsearch.index'),
@@ -192,7 +230,9 @@
               ];
               $query_data['body'][] = [
                   'title' => (string)$doc->getTitle(),
-                  'date' => (string)$doc->getModified()
+                  'date' => (string)$doc->getModified(),
+                  'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+                  'subject' => $this->getSubjects($doc)
               ];
           }
           Es::bulk($query_data);
@@ -263,18 +303,18 @@
                         break;
                     }
                     $this->indexOne($doc);
+                    $progressBar->setMessage($doc->getId());
                     $progressBar->advance();
-                    $progressBar->setMessage($doc->getId());
                 }
             }
             else
             {
                 $this->indexBulk($docs);
+                $progressBar->setMessage('Page '.$page);
                 $progressBar->advance();
-                $progressBar->setMessage('Page '.$page);
             }
         }
         $progressBar->finish();
-        $this->info('Indexing completed');
+        $this->info("\nIndexing completed");
     }
 }
--- a/server/src/app/Exceptions/Handler.php	Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/app/Exceptions/Handler.php	Wed Oct 05 02:31:25 2016 +0200
@@ -39,10 +39,17 @@
     public function render($request, Exception $e)
     {
         if ( $request->isXmlHttpRequest() || $request->wantsJson() ) {
+
+            $statusCode = 500;
+            if(is_callable([$e, 'getStatusCode'])) {
+                $statusCode = $e->getStatusCode();
+            }
+
             return response()->json([
+                'code' => $statusCode,
                 'message' => class_basename( $e ) . ' in ' . basename( $e->getFile() ) . ' line ' . $e->getLine() . ( ( $message = $e->getMessage() ) ? ': ' . $e->getMessage() : '.' ),
                 'trace' => $e->getTrace()
-            ], 500);
+            ], $statusCode);
         }
         return parent::render($request, $e);
     }
--- a/server/src/app/Http/Controllers/Api/ThemeController.php	Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/app/Http/Controllers/Api/ThemeController.php	Wed Oct 05 02:31:25 2016 +0200
@@ -10,6 +10,9 @@
 use EasyRdf\Resource;
 use EasyRdf\Literal;
 
+use Es;
+use Log;
+
 use CorpusParole\Libraries\Sparql\SparqlClient;
 use CorpusParole\Services\LexvoResolverInterface;
 use CorpusParole\Services\BnfResolverInterface;
@@ -32,80 +35,65 @@
      */
     public function index(Request $request)
     {
-        $filter = $request->input('filter', 'bnf');
-        $resolve = filter_var($request->input('resolve', true), FILTER_VALIDATE_BOOLEAN);
+        $index = $request->input('index', 0);
+        $limit = $request->input('limit', 0);
+        $sort = $request->input('sort', 'count');
 
-        $filterClause = "";
 
-        if($filter === 'bnf') {
-            $filterClause = "FILTER (isIRI(?o) && regex(str(?o), '^".config('corpusparole.bnf_ark_base_url')."')) .";
-        }
-        elseif($filter === 'uri') {
-            $filterClause = "FILTER isIRI(?o) .";
-        }
-        elseif($filter === 'all' || $filter === 'none' || $filter === '') {
-            $filterClause = "";
-        }
-        else {
-            abort(401,"Value for filter parameter must be in 'bnf', 'uri', 'all' or 'none'");
+        if($sort == "count" || $sort == "descending") {
+            $order_field = "_count";
+            $order_dir = "desc";
+        } elseif($sort == "-count") {
+            $order_field = "_count";
+            $order_dir = "asc";
+        } elseif ($sort == "label" || $sort == "alphabetical") {
+            $order_field = "_term";
+            $order_dir = "asc";
+        } elseif ($sort == "-label") {
+            $order_field = "_term";
+            $order_dir = "desc";
+        } else {
+            $order_field = "_count";
+            $order_dir = "desc";
         }
 
-        $query =  preg_replace('/\s+/', ' ',
-         "select (?o as ?theme) (COUNT(?s) as ?count) where {
-            ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> .
-            ?s <http://purl.org/dc/elements/1.1/subject> ?o .
-            $filterClause
-          }
-          GROUP BY ?o
-          ORDER BY DESC(?count)");
+        $query = [
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'body' => [
+                'size' => 0,
+                'aggs' => [
+                    "subjects" => [
+                        "nested" => [ "path" => "subject" ],
+                        "aggs" => [
+                            "subjects" => [
+                                "terms" => [
+                                    "field" => "subject.label_code",
+                                    "size" => $limit * ($index+1),
+                                    "order" => [ $order_field => $order_dir ]
+                                ]
+                            ]
+                        ]
+                    ]
+                ]
+            ]
+        ];
 
-        $docs = $this->sparqlClient->query($query);
+        $esRes = Es::search($query);
 
         $themes = [];
-        $labels = [];
 
-        $bnfUris = [];
-        $lexvoUris = [];
+        $bucketList = array_slice($esRes['aggregations']['subjects']['subjects']['buckets'], $index*$limit, $limit);
 
-        foreach ($docs as $row) {
-            $key = "";
-            $label = null;
-            if($row->theme instanceof Resource) {
-                $key = $row->theme->getUri();
-                if($resolve && strpos($key, config('corpusparole.bnf_ark_base_url')) === 0) {
-                    array_push($bnfUris, $key);
-                }
-                elseif($resolve && strpos($key, config('corpusparole.lexvo_base_url')) === 0) {
-                    array_push($lexvoUris, $key);
-                }
-                $label = null;
-            }
-            elseif($row->theme instanceof Literal) {
-                $key = $row->theme->getValue();
-                $label = $row->theme->getValue();
-            }
-
-            $themes[$key] = [
+        foreach($bucketList as $bucket) {
+            $parts = explode("|", $bucket['key']);
+            $label = $parts[0];
+            $url = config('corpusparole.bnf_ark_base_url').$parts[1];
+            $themes[$url] = [
                 "label" => $label,
-                "count" => $row->count->getValue()
+                "count" => $bucket['doc_count']
             ];
         }
 
-        if($resolve) {
-            if(count($lexvoUris) > 0) {
-                $labels = $this->lexvoResolver->getNames($lexvoUris);
-            }
-            if(count($bnfUris) > 0) {
-                $labels = array_merge($labels, $this->bnfResolver->getLabels($bnfUris));
-            }
-            foreach ($themes as $themeKey => $themeDef) {
-                if(array_key_exists($themeKey, $labels)) {
-                    $themeDef['label'] = $labels[$themeKey];
-                }
-                $themes[$themeKey] = $themeDef;
-            }
-        }
-
         return response()->json(['themes' => $themes ]);
 
     }
--- a/server/src/config/corpusparole.php	Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/config/corpusparole.php	Wed Oct 05 02:31:25 2016 +0200
@@ -119,6 +119,7 @@
 
     'bnf_base_url' => 'http://data.bnf.fr/',
     'bnf_ark_base_url' => 'http://ark.bnf.fr/',
+    'bnf_ark_url_regexp' => '/http[s]?\:\/\/(?:data|ark)\.bnf\.fr\/(ark\:\/12148\/[[:alnum:]]+)\/?/',
     'bnf_cache_expiration' => 60*24*30,
     'bnf_max_ids' => 5,
     'bnf_query_url' => 'http://data.bnf.fr/sparql',