server/src/app/Repositories/RdfDocumentRepository.php
changeset 339 766af1228b05
parent 329 0a2c2ad49d75
child 369 796725d33b67
--- a/server/src/app/Repositories/RdfDocumentRepository.php	Sun Oct 16 22:23:31 2016 +0530
+++ b/server/src/app/Repositories/RdfDocumentRepository.php	Sun Oct 16 23:19:57 2016 +0530
@@ -9,6 +9,7 @@
 use CorpusParole\Libraries\CorpusParoleException;
 use CorpusParole\Libraries\Utils;
 use CorpusParole\Libraries\Sparql\SparqlClient;
+use CorpusParole\Libraries\Filters\CorpusFilterManager;
 
 
 use CorpusParole\Services\LexvoResolverInterface;
@@ -18,6 +19,8 @@
 use Illuminate\Pagination\LengthAwarePaginator;
 use Illuminate\Pagination\Paginator;
 
+use Es;
+
 /**
  * Implement the DocumentRepository using EasyRdf
  * TODO: certainly split the transaction management (+add, +delete +transaction ) to an external class -> for this extend the sparql client.
@@ -31,18 +34,19 @@
         "    ?title".
         "    ?issued".
         "    ?modified".
-        "    ?lang".
+        "    (group_concat(distinct ?language;separator=\", \") as ?lang) ".
         "    (group_concat(distinct ?publisher;separator=\", \") as ?publishers) ".
         "  WHERE {".
-        "  GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
-        "    ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
-        "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?lang.} ".
-        "    OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
-        "    OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
-        "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.} }".
+        "    GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
+        "      ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
+        "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?language.} ".
+        "      OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
+        "      OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
+        "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.}".
+        "    }. ".
+        "    %s".
         "  } ".
-        "  GROUP BY ?uri ?doc ?title ?issued ?modified ?lang ".
-        "  ORDER BY ?uri";
+        "  GROUP BY ?uri ?doc ?title ?issued ?modified ";
 
     const ADDITIONAL_DOC_QUERIES = [
         "SELECT".
@@ -102,7 +106,14 @@
             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/title", $doc->title);
         }
         if(isset($doc->lang)) {
-            $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/language", $doc->lang);
+            foreach(explode(", ", $doc->lang) as $langStr) {
+                $langStr = trim($langStr);
+                if(filter_var($langStr, FILTER_VALIDATE_URL)) {
+                    $newGraph->addResource($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
+                } else {
+                    $newGraph->addLiteral($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
+                }
+            }
         }
         if(isset($doc->issued)) {
             $newGraph->add($doc->doc, "http://purl.org/dc/terms/issued", $doc->issued);
@@ -122,26 +133,105 @@
         return $newGraph;
     }
 
-    private function queryDocs($offset=null, $limit=null) {
+    private function queryES($filters=null, $offset=null, $limit=null, $sort=null) {
+        // Run an Elasticsearch search; returns ['total' => hit total, 'documents' => list of hit _id values].
+        if(empty($sort)) {
+            $sort = ["_doc"]; // no sort requested: sort on "_doc"
+        } elseif (is_string($sort)) {
+            $sort = [$sort]; // a single sort key is wrapped in a list
+        }
+        if(is_null($filters)) {
+            // null means "no filtering": fall back to an empty filter set
+            $filters = [];
+        }
+
+        $qFilterParts = [];
+        // only the 'language' filter is handled here; a single string is treated as a one-element list
+        if(array_key_exists('language', $filters) && !empty($filters['language'])) {
+            $languages = $filters['language'];
+            if(is_string($languages)) {
+                $languages = [ $languages, ];
+            }
+            $qFilterParts[] = CorpusFilterManager::getLanguagesFilterPart($languages);
+        }
+
+        $query = [
+            'index' => config('corpusparole.elasticsearch_index'),
+            'body' => [
+                "size" => empty($limit)?0:$limit, // an empty limit (null or 0) requests zero hits
+                "from" => $offset, // NOTE(review): passed through as-is, so may be null - confirm the client accepts that
+                "sort" => $sort
+            ]
+        ];
+        // when filter parts exist they are all required ("must") inside a constant_score query
+        if(count($qFilterParts)>0) {
+            $query['body']['query'] = ['constant_score' => [
+                'filter' => [
+                    'bool' => [
+                        'must' => $qFilterParts
+                    ]
+                ]
+            ] ];
+        }
+
+        $esRes = Es::search($query);
+        // keep only the total and the _id of each hit
+        return ['total' => $esRes['hits']['total'], 'documents' => array_map(function($r) {
+            return $r['_id'];
+        }, $esRes['hits']['hits'])];
+
+    }
+
+    /**
+     * Query docs.
+     * If $filters is empty or null and $sort is '_graph', the document list is fetched from the triple store; otherwise, it is fetched from Elasticsearch.
+     */
+    private function queryDocs($filters=null, $offset=null, $limit=null, $sort=null) {
 
         $resDocs = [];
+
         $limitsClauses = [];
+        $sortClauseStr = "";
         $limitsClausesStr = "";
+        $filterUris = "";
 
-        if(!is_null($offset)) {
-            array_push($limitsClauses, "OFFSET $offset");
-        }
-        if(!is_null($limit)) {
-            array_push($limitsClauses, "LIMIT $limit");
-        }
-        if(!empty($limitsClauses)) {
-            $limitsClausesStr = "\n" . join(" ", $limitsClauses);
+        if(empty($filters) && $sort === "_graph") {
+            if(!is_null($offset)) {
+                array_push($limitsClauses, "OFFSET $offset");
+            }
+            if(!is_null($limit)) {
+                array_push($limitsClauses, "LIMIT $limit");
+            }
+            if(!empty($limitsClauses)) {
+                $limitsClausesStr = "\n" . join(" ", $limitsClauses);
+            }
+            $sortClauseStr = "\n ORDER BY ?uri";
+            $total = $this->getCount();
+        } else {
+            $esRes = $this->queryES($filters, $offset, $limit);
+            // WARNING: we rely on the fact that PHP arrays preserve key insertion order
+            $total = intval($esRes['total']);
+            foreach($esRes['documents'] as $esDocId) {
+                $uri = config('corpusparole.corpus_doc_id_base_uri_prefix').$esDocId;
+                $resDocs[$uri] = null;
+            }
+            if(count($resDocs) > 0) {
+                $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
+            } else {
+                return ['meta' => [ 'total'=> $total ], 'documents' => []];
+            }
+
         }
 
-        $docs = $this->sparqlClient->query(self::BASE_DOC_QUERY.$limitsClausesStr);
+
+        $sparqlQuery = sprintf(self::BASE_DOC_QUERY.$sortClauseStr.$limitsClausesStr, $filterUris);
+
+        $docs = $this->sparqlClient->query($sparqlQuery);
+
         foreach($docs as $doc) {
             $graph = $this->getResGraph($doc);
             if(is_null($graph)) {
+                Log::debug("NULL GRAPH - odd");
                 continue;
             }
             $uri = $doc->uri->getUri();
@@ -149,10 +239,12 @@
         }
 
         if(count($resDocs) == 0) {
-            return [];
+            return ['meta' => [ 'total'=> $total ], 'documents' => []];
         }
 
-        $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
+        if(empty($filterUris)) {
+            $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
+        }
 
         foreach(self::ADDITIONAL_DOC_QUERIES as $query) {
             $docs = $this->sparqlClient->query(sprintf($query, $filterUris));
@@ -171,11 +263,13 @@
             }
         }
 
-        return array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
+        $documentsResults = array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
+
+        return ['meta' => [ 'total'=> $total ], 'documents' => $documentsResults];
     }
 
     public function all() {
-        return $this->queryDocs();
+        return $this->queryDocs(null, null, null, "_graph")['documents']; // no filters + "_graph" sort, no offset/limit; only the 'documents' entry is returned
     }
 
     public function get($id, $short=false) {
@@ -233,10 +327,19 @@
         }
     }
 
-    public function getCount() {
-        $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
-        assert(!is_null($res) && $res->count()==1);
-        return $res[0]->count->getValue();
+    public function getCount($filters=null) { // returns the number of documents as an int, optionally restricted by $filters
+        $count = 0;
+        if(empty($filters)) { // no filters: count the ProvidedCHO graphs with a SPARQL query
+            $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
+            assert(!is_null($res) && count($res)==1);
+            $count = intval($res[0]->count->getValue());
+        } else { // filters given: ask Elasticsearch (offset 0, limit 0 => total only, no hits)
+            $esRes = $this->queryES($filters, 0, 0);
+            $count = intval($esRes['total']); // queryES returns ['total' => ..., 'documents' => ...], not the raw ES response
+        }
+
+        return $count;
+
     }
 
     //SELECT ?g WHERE { GRAPH ?g { ?s ?p ?o } }
@@ -250,6 +353,19 @@
      */
     public function paginateAll($perPage = 15, $pageName = 'page', $page = null)
     {
+        return $this->paginate(null, $perPage, $pageName, $page); // unfiltered pagination; forward $page so an explicitly requested page is honoured
+    }
+
+    /**
+     * Paginate filtered document as a paginator.
+     *
+     * @param  array $filters
+     * @param  int  $perPage
+     * @param  string  $pageName
+     * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
+     */
+    public function paginate($filters = null, $perPage = 15, $pageName = 'page', $page = null, $sort=null) {
+
         assert(is_numeric($perPage));
 
         if(is_null($page)) {
@@ -258,18 +374,18 @@
 
         assert(is_null($page) || is_numeric($page));
 
-        $total = $this->getCount();
-
         $offset = max(0,($page - 1) * $perPage);
 
-        $results = $this->queryDocs($offset, $perPage);
+        $results = $this->queryDocs($filters, $offset, $perPage, $sort);
 
-        return new LengthAwarePaginator($results, $total, $perPage, $page, [
+        return new LengthAwarePaginator($results['documents'], $results['meta']['total'], $perPage, $page, [
             'path' => Paginator::resolveCurrentPath(),
             'pageName' => $pageName,
         ]);
+
     }
 
+
     /**
      * Resolve lexvo id for all documents in the list
      * this allow to optimise the call of lexvo repository
@@ -280,8 +396,10 @@
         $languageIds = [];
         #get the list pf language needing resolving
         foreach ($docList as $doc) {
-            if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
-                $languageIds[$doc->getLanguageValue()] = true;
+            if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
+                foreach($doc->getLanguagesValue() as $lang) {
+                    $languageIds[$lang]=true;
+                }
             }
         }
 
@@ -289,8 +407,12 @@
         $langNames = $this->lexvoResolver->getNames(array_keys($languageIds));
 
         foreach ($docList as $doc) {
-            if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
-                $doc->setLanguageResolved($langNames[$doc->getLanguageValue()]);
+            if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
+                $langResolved = [];
+                foreach($doc->getLanguagesValue() as $lang) {
+                    $langResolved[] = $langNames[$lang];
+                }
+                $doc->setLanguageResolved($langResolved);
             }
         }