server/src/app/Repositories/RdfDocumentRepository.php
changeset 326 226d5b17a119
parent 306 3fccf43160a7
child 329 0a2c2ad49d75
equal deleted inserted replaced
325:31a4987f6017 326:226d5b17a119
    16 use EasyRdf\Graph;
    16 use EasyRdf\Graph;
    17 
    17 
    18 use Illuminate\Pagination\LengthAwarePaginator;
    18 use Illuminate\Pagination\LengthAwarePaginator;
    19 use Illuminate\Pagination\Paginator;
    19 use Illuminate\Pagination\Paginator;
    20 
    20 
       
    21 use Es;
       
    22 
    21 /**
    23 /**
    22  * Implement the DocumentRepository using EasyRdf
    24  * Implement the DocumentRepository using EasyRdf
    23  * TODO: certainly split the transaction management (+add, +delete +transaction ) to an external class -> for this extend the sparql client.
    25  * TODO: certainly split the transaction management (+add, +delete +transaction ) to an external class -> for this extend the sparql client.
    24  */
    26  */
    25 class RdfDocumentRepository implements DocumentRepository {
    27 class RdfDocumentRepository implements DocumentRepository {
    29         "    ?uri".
    31         "    ?uri".
    30         "    ?doc".
    32         "    ?doc".
    31         "    ?title".
    33         "    ?title".
    32         "    ?issued".
    34         "    ?issued".
    33         "    ?modified".
    35         "    ?modified".
    34         "    ?lang".
    36         "    (group_concat(distinct ?language;separator=\", \") as ?lang) ".
    35         "    (group_concat(distinct ?publisher;separator=\", \") as ?publishers) ".
    37         "    (group_concat(distinct ?publisher;separator=\", \") as ?publishers) ".
    36         "  WHERE {".
    38         "  WHERE {".
    37         "  GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
    39         "    GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
    38         "    ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
    40         "      ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
    39         "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?lang.} ".
    41         "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?language.} ".
    40         "    OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
    42         "      OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
    41         "    OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
    43         "      OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
    42         "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.} }".
    44         "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.}".
       
    45         "    }. ".
       
    46         "    %s".
    43         "  } ".
    47         "  } ".
    44         "  GROUP BY ?uri ?doc ?title ?issued ?modified ?lang ".
    48         "  GROUP BY ?uri ?doc ?title ?issued ?modified ";
    45         "  ORDER BY ?uri";
       
    46 
    49 
    47     const ADDITIONAL_DOC_QUERIES = [
    50     const ADDITIONAL_DOC_QUERIES = [
    48         "SELECT".
    51         "SELECT".
    49         "    ?uri".
    52         "    ?uri".
    50         "    ?doc".
    53         "    ?doc".
   100         $newGraph->add($doc->doc, "rdf:type", $newGraph->resource("http://www.europeana.eu/schemas/edm/ProvidedCHO"));
   103         $newGraph->add($doc->doc, "rdf:type", $newGraph->resource("http://www.europeana.eu/schemas/edm/ProvidedCHO"));
   101         if(isset($doc->title)) {
   104         if(isset($doc->title)) {
   102             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/title", $doc->title);
   105             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/title", $doc->title);
   103         }
   106         }
   104         if(isset($doc->lang)) {
   107         if(isset($doc->lang)) {
   105             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/language", $doc->lang);
   108             foreach(explode(", ", $doc->lang) as $langStr) {
       
   109                 $langStr = trim($langStr);
       
   110                 if(filter_var($langStr, FILTER_VALIDATE_URL)) {
       
   111                     $newGraph->addResource($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
       
   112                 } else {
       
   113                     $newGraph->addLiteral($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
       
   114                 }
       
   115             }
   106         }
   116         }
   107         if(isset($doc->issued)) {
   117         if(isset($doc->issued)) {
   108             $newGraph->add($doc->doc, "http://purl.org/dc/terms/issued", $doc->issued);
   118             $newGraph->add($doc->doc, "http://purl.org/dc/terms/issued", $doc->issued);
   109         }
   119         }
   110         if(isset($doc->modified)) {
   120         if(isset($doc->modified)) {
   120             $newGraph->add($doc->doc, config('corpusparole.corpus_ontology_url').'transcript', $doc->transcript_url);
   130             $newGraph->add($doc->doc, config('corpusparole.corpus_ontology_url').'transcript', $doc->transcript_url);
   121         }
   131         }
   122         return $newGraph;
   132         return $newGraph;
   123     }
   133     }
   124 
   134 
   125     private function queryDocs($offset=null, $limit=null) {
   135     private function queryES($filters=null, $offset=null, $limit=null, $sort=null) {
       
   136 
       
   137         if(empty($sort)) {
       
   138             $sort = ["_doc"];
       
   139         } elseif (is_string($sort)) {
       
   140             $sort = [$sort];
       
   141         }
       
   142         if(is_null($filters)) {
       
   143             //$filters = ['language' => ["http://lexvo.org/id/iso639-3/oci", "http://lexvo.org/id/iso639-3/bre"]];
       
   144             $filters = [];
       
   145         }
       
   146 
       
   147         $qFilterParts = [];
       
   148 
       
   149         if(array_key_exists('language', $filters) && !empty($filters['language'])) {
       
   150             $languages = $filters['language'];
       
   151             if(is_string($languages)) {
       
   152                 $languages = [ $languages, ];
       
   153             }
       
   154             $qFilterParts[] = [
       
   155                 'bool' => [
       
   156                     'should' => [
       
   157                         [ 'terms' => [ 'language' => $languages ]]
       
   158                     ]
       
   159                 ]
       
   160             ];
       
   161         }
       
   162 
       
   163         $query = [
       
   164             'index' => config('corpusparole.elasticsearch_index'),
       
   165             'body' => [
       
   166                 "size" => empty($limit)?0:$limit,
       
   167                 "from" => $offset,
       
   168                 "sort" => $sort
       
   169             ]
       
   170         ];
       
   171 
       
   172         if(count($qFilterParts)>0) {
       
   173             $query['body']['query'] = ['constant_score' => [
       
   174                 'filter' => [
       
   175                     'bool' => [
       
   176                         'must' => $qFilterParts
       
   177                     ]
       
   178                 ]
       
   179             ] ];
       
   180         }
       
   181 
       
   182         $esRes = Es::search($query);
       
   183 
       
   184         return ['total' => $esRes['hits']['total'], 'documents' => array_map(function($r) {
       
   185             return $r['_id'];
       
   186         }, $esRes['hits']['hits'])];
       
   187 
       
   188     }
       
   189 
       
   190     /**
       
   191      * Query docs.
       
   192      * If $filters is empty or null and $sort is '_graph', the document list is fetched from the triple store; otherwise it is fetched from ElasticSearch.
       
   193      */
       
   194     private function queryDocs($filters=null, $offset=null, $limit=null, $sort=null) {
   126 
   195 
   127         $resDocs = [];
   196         $resDocs = [];
       
   197 
   128         $limitsClauses = [];
   198         $limitsClauses = [];
       
   199         $sortClauseStr = "";
   129         $limitsClausesStr = "";
   200         $limitsClausesStr = "";
   130 
   201         $filterUris = "";
   131         if(!is_null($offset)) {
   202 
   132             array_push($limitsClauses, "OFFSET $offset");
   203         if(empty($filters) && $sort === "_graph") {
   133         }
   204             if(!is_null($offset)) {
   134         if(!is_null($limit)) {
   205                 array_push($limitsClauses, "OFFSET $offset");
   135             array_push($limitsClauses, "LIMIT $limit");
   206             }
   136         }
   207             if(!is_null($limit)) {
   137         if(!empty($limitsClauses)) {
   208                 array_push($limitsClauses, "LIMIT $limit");
   138             $limitsClausesStr = "\n" . join(" ", $limitsClauses);
   209             }
   139         }
   210             if(!empty($limitsClauses)) {
   140 
   211                 $limitsClausesStr = "\n" . join(" ", $limitsClauses);
   141         $docs = $this->sparqlClient->query(self::BASE_DOC_QUERY.$limitsClausesStr);
   212             }
       
   213             $sortClauseStr = "\n ORDER BY ?uri";
       
   214             $total = $this->getCount();
       
   215         } else {
       
   216             $esRes = $this->queryES($filters, $offset, $limit);
       
   217             // WARNING: we count on the fact that php keep keys order
       
   218             $total = intval($esRes['total']);
       
   219             foreach($esRes['documents'] as $esDocId) {
       
   220                 $uri = config('corpusparole.corpus_doc_id_base_uri_prefix').$esDocId;
       
   221                 $resDocs[$uri] = null;
       
   222             }
       
   223             if(count($resDocs) > 0) {
       
   224                 $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
       
   225             } else {
       
   226                 return ['meta' => [ 'total'=> $total ], 'documents' => []];
       
   227             }
       
   228 
       
   229         }
       
   230 
       
   231 
       
   232         $sparqlQuery = sprintf(self::BASE_DOC_QUERY.$sortClauseStr.$limitsClausesStr, $filterUris);
       
   233 
       
   234         $docs = $this->sparqlClient->query($sparqlQuery);
       
   235 
   142         foreach($docs as $doc) {
   236         foreach($docs as $doc) {
   143             $graph = $this->getResGraph($doc);
   237             $graph = $this->getResGraph($doc);
   144             if(is_null($graph)) {
   238             if(is_null($graph)) {
       
   239                 Log::debug("NULL GRAPH - odd");
   145                 continue;
   240                 continue;
   146             }
   241             }
   147             $uri = $doc->uri->getUri();
   242             $uri = $doc->uri->getUri();
   148             $resDocs[$uri] = $graph;
   243             $resDocs[$uri] = $graph;
   149         }
   244         }
   150 
   245 
   151         if(count($resDocs) == 0) {
   246         if(count($resDocs) == 0) {
   152             return [];
   247             return ['meta' => [ 'total'=> $total ], 'documents' => []];
   153         }
   248         }
   154 
   249 
   155         $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
   250         if(empty($filterUris)) {
       
   251             $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
       
   252         }
   156 
   253 
   157         foreach(self::ADDITIONAL_DOC_QUERIES as $query) {
   254         foreach(self::ADDITIONAL_DOC_QUERIES as $query) {
   158             $docs = $this->sparqlClient->query(sprintf($query, $filterUris));
   255             $docs = $this->sparqlClient->query(sprintf($query, $filterUris));
   159             foreach($docs as $doc) {
   256             foreach($docs as $doc) {
   160                 $graph = $this->getResGraph($doc);
   257                 $graph = $this->getResGraph($doc);
   169                     $resDocs[$uri] = $graph;
   266                     $resDocs[$uri] = $graph;
   170                 }
   267                 }
   171             }
   268             }
   172         }
   269         }
   173 
   270 
   174         return array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
   271         $documentsResults = array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
       
   272 
       
   273         return ['meta' => [ 'total'=> $total ], 'documents' => $documentsResults];
   175     }
   274     }
   176 
   275 
   177     public function all() {
   276     public function all() {
   178         return $this->queryDocs();
   277         return $this->queryDocs(null, null, null, "_graph")['documents'];
   179     }
   278     }
   180 
   279 
   181     public function get($id, $short=false) {
   280     public function get($id, $short=false) {
   182 
   281 
   183         if(strpos($id, config('corpusparole.corpus_id_scheme')) === 0) {
   282         if(strpos($id, config('corpusparole.corpus_id_scheme')) === 0) {
   231             }
   330             }
   232             throw $e;
   331             throw $e;
   233         }
   332         }
   234     }
   333     }
   235 
   334 
   236     public function getCount() {
   335     public function getCount($filters=null) {
   237         $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
   336         $count = 0;
   238         assert(!is_null($res) && $res->count()==1);
   337         if(empty($filters)) {
   239         return $res[0]->count->getValue();
   338             $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
       
   339             assert(!is_null($res) && count($res)==1);
       
   340             $count = intval($res[0]->count->getValue());
       
   341         } else {
       
   342             $esRes = $this->queryES($filters, 0, 0);
       
   343             $count = intval($esRes['hits']['total']);
       
   344         }
       
   345 
       
   346         return $count;
       
   347 
   240     }
   348     }
   241 
   349 
   242     //SELECT ?g WHERE { GRAPH ?g { ?s ?p ?o } }
   350     //SELECT ?g WHERE { GRAPH ?g { ?s ?p ?o } }
   243 
   351 
   244     /**
   352     /**
   248      * @param  string  $pageName
   356      * @param  string  $pageName
   249      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
   357      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
   250      */
   358      */
   251     public function paginateAll($perPage = 15, $pageName = 'page', $page = null)
   359     public function paginateAll($perPage = 15, $pageName = 'page', $page = null)
   252     {
   360     {
       
   361         return $this->paginate(null, $perPage, $pageName, null);
       
   362     }
       
   363 
       
   364     /**
       
   365      * Paginate filtered document as a paginator.
       
   366      *
       
   367      * @param  array $filters
       
   368      * @param  int  $perPage
       
   369      * @param  string  $pageName
       
   370      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
       
   371      */
       
   372     public function paginate($filters = null, $perPage = 15, $pageName = 'page', $page = null, $sort=null) {
       
   373 
   253         assert(is_numeric($perPage));
   374         assert(is_numeric($perPage));
   254 
   375 
   255         if(is_null($page)) {
   376         if(is_null($page)) {
   256             $page = Paginator::resolveCurrentPage($pageName);
   377             $page = Paginator::resolveCurrentPage($pageName);
   257         }
   378         }
   258 
   379 
   259         assert(is_null($page) || is_numeric($page));
   380         assert(is_null($page) || is_numeric($page));
   260 
   381 
   261         $total = $this->getCount();
       
   262 
       
   263         $offset = max(0,($page - 1) * $perPage);
   382         $offset = max(0,($page - 1) * $perPage);
   264 
   383 
   265         $results = $this->queryDocs($offset, $perPage);
   384         $results = $this->queryDocs($filters, $offset, $perPage, $sort);
   266 
   385 
   267         return new LengthAwarePaginator($results, $total, $perPage, $page, [
   386         return new LengthAwarePaginator($results['documents'], $results['meta']['total'], $perPage, $page, [
   268             'path' => Paginator::resolveCurrentPath(),
   387             'path' => Paginator::resolveCurrentPath(),
   269             'pageName' => $pageName,
   388             'pageName' => $pageName,
   270         ]);
   389         ]);
   271     }
   390 
       
   391     }
       
   392 
   272 
   393 
   273     /**
   394     /**
   274      * Resolve lexvo id for all documents in the list
   395      * Resolve lexvo id for all documents in the list
   275      * this allow to optimise the call of lexvo repository
   396      * this allow to optimise the call of lexvo repository
   276      * @param $docList Array: a list (Array) of document to resolve
   397      * @param $docList Array: a list (Array) of document to resolve
   278     public function resolveLexvo(Array $docList) {
   399     public function resolveLexvo(Array $docList) {
   279 
   400 
   280         $languageIds = [];
   401         $languageIds = [];
   281         #get the list pf language needing resolving
   402         #get the list pf language needing resolving
   282         foreach ($docList as $doc) {
   403         foreach ($docList as $doc) {
   283             if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
   404             if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
   284                 $languageIds[$doc->getLanguageValue()] = true;
   405                 foreach($doc->getLanguagesValue() as $lang) {
       
   406                     $languageIds[$lang]=true;
       
   407                 }
   285             }
   408             }
   286         }
   409         }
   287 
   410 
   288         # call LexvoResolver
   411         # call LexvoResolver
   289         $langNames = $this->lexvoResolver->getNames(array_keys($languageIds));
   412         $langNames = $this->lexvoResolver->getNames(array_keys($languageIds));
   290 
   413 
   291         foreach ($docList as $doc) {
   414         foreach ($docList as $doc) {
   292             if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
   415             if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
   293                 $doc->setLanguageResolved($langNames[$doc->getLanguageValue()]);
   416                 $langResolved = [];
       
   417                 foreach($doc->getLanguagesValue() as $lang) {
       
   418                     $langResolved[] = $langNames[$lang];
       
   419                 }
       
   420                 $doc->setLanguageResolved($langResolved);
   294             }
   421             }
   295         }
   422         }
   296 
   423 
   297         return $docList;
   424         return $docList;
   298     }
   425     }