server/src/app/Repositories/RdfDocumentRepository.php
changeset 339 766af1228b05
parent 329 0a2c2ad49d75
child 369 796725d33b67
equal deleted inserted replaced
338:4a3899b6a7ed 339:766af1228b05
     7 use CorpusParole\Models\DocumentResult;
     7 use CorpusParole\Models\DocumentResult;
     8 use CorpusParole\Models\Document;
     8 use CorpusParole\Models\Document;
     9 use CorpusParole\Libraries\CorpusParoleException;
     9 use CorpusParole\Libraries\CorpusParoleException;
    10 use CorpusParole\Libraries\Utils;
    10 use CorpusParole\Libraries\Utils;
    11 use CorpusParole\Libraries\Sparql\SparqlClient;
    11 use CorpusParole\Libraries\Sparql\SparqlClient;
       
    12 use CorpusParole\Libraries\Filters\CorpusFilterManager;
    12 
    13 
    13 
    14 
    14 use CorpusParole\Services\LexvoResolverInterface;
    15 use CorpusParole\Services\LexvoResolverInterface;
    15 
    16 
    16 use EasyRdf\Graph;
    17 use EasyRdf\Graph;
    17 
    18 
    18 use Illuminate\Pagination\LengthAwarePaginator;
    19 use Illuminate\Pagination\LengthAwarePaginator;
    19 use Illuminate\Pagination\Paginator;
    20 use Illuminate\Pagination\Paginator;
       
    21 
       
    22 use Es;
    20 
    23 
    21 /**
    24 /**
    22  * Implement the DocumentRepository using EasyRdf
    25  * Implement the DocumentRepository using EasyRdf
    23  * TODO: certainly split the transaction management (+add, +delete +transaction ) to an external class -> for this extend the sparql client.
    26  * TODO: certainly split the transaction management (+add, +delete +transaction ) to an external class -> for this extend the sparql client.
    24  */
    27  */
    29         "    ?uri".
    32         "    ?uri".
    30         "    ?doc".
    33         "    ?doc".
    31         "    ?title".
    34         "    ?title".
    32         "    ?issued".
    35         "    ?issued".
    33         "    ?modified".
    36         "    ?modified".
    34         "    ?lang".
    37         "    (group_concat(distinct ?language;separator=\", \") as ?lang) ".
    35         "    (group_concat(distinct ?publisher;separator=\", \") as ?publishers) ".
    38         "    (group_concat(distinct ?publisher;separator=\", \") as ?publishers) ".
    36         "  WHERE {".
    39         "  WHERE {".
    37         "  GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
    40         "    GRAPH ?uri { ?doc a <http://www.europeana.eu/schemas/edm/ProvidedCHO>.".
    38         "    ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
    41         "      ?doc <http://purl.org/dc/elements/1.1/title> ?title.".
    39         "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?lang.} ".
    42         "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/language> ?language.} ".
    40         "    OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
    43         "      OPTIONAL {?doc <http://purl.org/dc/terms/issued> ?issued.} ".
    41         "    OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
    44         "      OPTIONAL {?doc <http://purl.org/dc/terms/modified> ?modified.} ".
    42         "    OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.} }".
    45         "      OPTIONAL {?doc <http://purl.org/dc/elements/1.1/publisher> ?publisher.}".
       
    46         "    }. ".
       
    47         "    %s".
    43         "  } ".
    48         "  } ".
    44         "  GROUP BY ?uri ?doc ?title ?issued ?modified ?lang ".
    49         "  GROUP BY ?uri ?doc ?title ?issued ?modified ";
    45         "  ORDER BY ?uri";
       
    46 
    50 
    47     const ADDITIONAL_DOC_QUERIES = [
    51     const ADDITIONAL_DOC_QUERIES = [
    48         "SELECT".
    52         "SELECT".
    49         "    ?uri".
    53         "    ?uri".
    50         "    ?doc".
    54         "    ?doc".
   100         $newGraph->add($doc->doc, "rdf:type", $newGraph->resource("http://www.europeana.eu/schemas/edm/ProvidedCHO"));
   104         $newGraph->add($doc->doc, "rdf:type", $newGraph->resource("http://www.europeana.eu/schemas/edm/ProvidedCHO"));
   101         if(isset($doc->title)) {
   105         if(isset($doc->title)) {
   102             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/title", $doc->title);
   106             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/title", $doc->title);
   103         }
   107         }
   104         if(isset($doc->lang)) {
   108         if(isset($doc->lang)) {
   105             $newGraph->add($doc->doc, "http://purl.org/dc/elements/1.1/language", $doc->lang);
   109             foreach(explode(", ", $doc->lang) as $langStr) {
       
   110                 $langStr = trim($langStr);
       
   111                 if(filter_var($langStr, FILTER_VALIDATE_URL)) {
       
   112                     $newGraph->addResource($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
       
   113                 } else {
       
   114                     $newGraph->addLiteral($doc->doc, "http://purl.org/dc/elements/1.1/language", $langStr);
       
   115                 }
       
   116             }
   106         }
   117         }
   107         if(isset($doc->issued)) {
   118         if(isset($doc->issued)) {
   108             $newGraph->add($doc->doc, "http://purl.org/dc/terms/issued", $doc->issued);
   119             $newGraph->add($doc->doc, "http://purl.org/dc/terms/issued", $doc->issued);
   109         }
   120         }
   110         if(isset($doc->modified)) {
   121         if(isset($doc->modified)) {
   120             $newGraph->add($doc->doc, config('corpusparole.corpus_ontology_url').'transcript', $doc->transcript_url);
   131             $newGraph->add($doc->doc, config('corpusparole.corpus_ontology_url').'transcript', $doc->transcript_url);
   121         }
   132         }
   122         return $newGraph;
   133         return $newGraph;
   123     }
   134     }
   124 
   135 
   125     private function queryDocs($offset=null, $limit=null) {
   136     private function queryES($filters=null, $offset=null, $limit=null, $sort=null) {
       
   137 
       
   138         if(empty($sort)) {
       
   139             $sort = ["_doc"];
       
   140         } elseif (is_string($sort)) {
       
   141             $sort = [$sort];
       
   142         }
       
   143         if(is_null($filters)) {
       
   144             //$filters = ['language' => ["http://lexvo.org/id/iso639-3/oci", "http://lexvo.org/id/iso639-3/bre"]];
       
   145             $filters = [];
       
   146         }
       
   147 
       
   148         $qFilterParts = [];
       
   149 
       
   150         if(array_key_exists('language', $filters) && !empty($filters['language'])) {
       
   151             $languages = $filters['language'];
       
   152             if(is_string($languages)) {
       
   153                 $languages = [ $languages, ];
       
   154             }
       
   155             $qFilterParts[] = CorpusFilterManager::getLanguagesFilterPart($languages);
       
   156         }
       
   157 
       
   158         $query = [
       
   159             'index' => config('corpusparole.elasticsearch_index'),
       
   160             'body' => [
       
   161                 "size" => empty($limit)?0:$limit,
       
   162                 "from" => $offset,
       
   163                 "sort" => $sort
       
   164             ]
       
   165         ];
       
   166 
       
   167         if(count($qFilterParts)>0) {
       
   168             $query['body']['query'] = ['constant_score' => [
       
   169                 'filter' => [
       
   170                     'bool' => [
       
   171                         'must' => $qFilterParts
       
   172                     ]
       
   173                 ]
       
   174             ] ];
       
   175         }
       
   176 
       
   177         $esRes = Es::search($query);
       
   178 
       
   179         return ['total' => $esRes['hits']['total'], 'documents' => array_map(function($r) {
       
   180             return $r['_id'];
       
   181         }, $esRes['hits']['hits'])];
       
   182 
       
   183     }
       
   184 
       
   185     /**
       
   186      * Query docs.
       
    187      * If $filters is empty or null and $sort is '_graph', the document list is fetched from the triple store; otherwise it is fetched from ElasticSearch.
       
   188      */
       
   189     private function queryDocs($filters=null, $offset=null, $limit=null, $sort=null) {
   126 
   190 
   127         $resDocs = [];
   191         $resDocs = [];
       
   192 
   128         $limitsClauses = [];
   193         $limitsClauses = [];
       
   194         $sortClauseStr = "";
   129         $limitsClausesStr = "";
   195         $limitsClausesStr = "";
   130 
   196         $filterUris = "";
   131         if(!is_null($offset)) {
   197 
   132             array_push($limitsClauses, "OFFSET $offset");
   198         if(empty($filters) && $sort === "_graph") {
   133         }
   199             if(!is_null($offset)) {
   134         if(!is_null($limit)) {
   200                 array_push($limitsClauses, "OFFSET $offset");
   135             array_push($limitsClauses, "LIMIT $limit");
   201             }
   136         }
   202             if(!is_null($limit)) {
   137         if(!empty($limitsClauses)) {
   203                 array_push($limitsClauses, "LIMIT $limit");
   138             $limitsClausesStr = "\n" . join(" ", $limitsClauses);
   204             }
   139         }
   205             if(!empty($limitsClauses)) {
   140 
   206                 $limitsClausesStr = "\n" . join(" ", $limitsClauses);
   141         $docs = $this->sparqlClient->query(self::BASE_DOC_QUERY.$limitsClausesStr);
   207             }
       
   208             $sortClauseStr = "\n ORDER BY ?uri";
       
   209             $total = $this->getCount();
       
   210         } else {
       
   211             $esRes = $this->queryES($filters, $offset, $limit);
       
   212             // WARNING: we count on the fact that php keep keys order
       
   213             $total = intval($esRes['total']);
       
   214             foreach($esRes['documents'] as $esDocId) {
       
   215                 $uri = config('corpusparole.corpus_doc_id_base_uri_prefix').$esDocId;
       
   216                 $resDocs[$uri] = null;
       
   217             }
       
   218             if(count($resDocs) > 0) {
       
   219                 $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
       
   220             } else {
       
   221                 return ['meta' => [ 'total'=> $total ], 'documents' => []];
       
   222             }
       
   223 
       
   224         }
       
   225 
       
   226 
       
   227         $sparqlQuery = sprintf(self::BASE_DOC_QUERY.$sortClauseStr.$limitsClausesStr, $filterUris);
       
   228 
       
   229         $docs = $this->sparqlClient->query($sparqlQuery);
       
   230 
   142         foreach($docs as $doc) {
   231         foreach($docs as $doc) {
   143             $graph = $this->getResGraph($doc);
   232             $graph = $this->getResGraph($doc);
   144             if(is_null($graph)) {
   233             if(is_null($graph)) {
       
   234                 Log::debug("NULL GRAPH - odd");
   145                 continue;
   235                 continue;
   146             }
   236             }
   147             $uri = $doc->uri->getUri();
   237             $uri = $doc->uri->getUri();
   148             $resDocs[$uri] = $graph;
   238             $resDocs[$uri] = $graph;
   149         }
   239         }
   150 
   240 
   151         if(count($resDocs) == 0) {
   241         if(count($resDocs) == 0) {
   152             return [];
   242             return ['meta' => [ 'total'=> $total ], 'documents' => []];
   153         }
   243         }
   154 
   244 
   155         $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
   245         if(empty($filterUris)) {
       
   246             $filterUris = "FILTER(?uri in (<".join(">, <" , array_keys($resDocs)).">)) ";
       
   247         }
   156 
   248 
   157         foreach(self::ADDITIONAL_DOC_QUERIES as $query) {
   249         foreach(self::ADDITIONAL_DOC_QUERIES as $query) {
   158             $docs = $this->sparqlClient->query(sprintf($query, $filterUris));
   250             $docs = $this->sparqlClient->query(sprintf($query, $filterUris));
   159             foreach($docs as $doc) {
   251             foreach($docs as $doc) {
   160                 $graph = $this->getResGraph($doc);
   252                 $graph = $this->getResGraph($doc);
   169                     $resDocs[$uri] = $graph;
   261                     $resDocs[$uri] = $graph;
   170                 }
   262                 }
   171             }
   263             }
   172         }
   264         }
   173 
   265 
   174         return array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
   266         $documentsResults = array_map(function($g) { return new DocumentResult($g->getUri(), $g); }, array_values($resDocs));
       
   267 
       
   268         return ['meta' => [ 'total'=> $total ], 'documents' => $documentsResults];
   175     }
   269     }
   176 
   270 
   177     public function all() {
   271     public function all() {
   178         return $this->queryDocs();
   272         return $this->queryDocs(null, null, null, "_graph")['documents'];
   179     }
   273     }
   180 
   274 
   181     public function get($id, $short=false) {
   275     public function get($id, $short=false) {
   182 
   276 
   183         if(strpos($id, config('corpusparole.corpus_id_scheme')) === 0) {
   277         if(strpos($id, config('corpusparole.corpus_id_scheme')) === 0) {
   231             }
   325             }
   232             throw $e;
   326             throw $e;
   233         }
   327         }
   234     }
   328     }
   235 
   329 
   236     public function getCount() {
   330     public function getCount($filters=null) {
   237         $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
   331         $count = 0;
   238         assert(!is_null($res) && $res->count()==1);
   332         if(empty($filters)) {
   239         return $res[0]->count->getValue();
   333             $res = $this->sparqlClient->query("SELECT (COUNT (DISTINCT ?g) as ?count) WHERE { GRAPH ?g { ?s a <http://www.europeana.eu/schemas/edm/ProvidedCHO> } }");
       
   334             assert(!is_null($res) && count($res)==1);
       
   335             $count = intval($res[0]->count->getValue());
       
   336         } else {
       
   337             $esRes = $this->queryES($filters, 0, 0);
       
   338             $count = intval($esRes['hits']['total']);
       
   339         }
       
   340 
       
   341         return $count;
       
   342 
   240     }
   343     }
   241 
   344 
   242     //SELECT ?g WHERE { GRAPH ?g { ?s ?p ?o } }
   345     //SELECT ?g WHERE { GRAPH ?g { ?s ?p ?o } }
   243 
   346 
   244     /**
   347     /**
   248      * @param  string  $pageName
   351      * @param  string  $pageName
   249      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
   352      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
   250      */
   353      */
   251     public function paginateAll($perPage = 15, $pageName = 'page', $page = null)
   354     public function paginateAll($perPage = 15, $pageName = 'page', $page = null)
   252     {
   355     {
       
   356         return $this->paginate(null, $perPage, $pageName, null);
       
   357     }
       
   358 
       
   359     /**
       
   360      * Paginate filtered document as a paginator.
       
   361      *
       
   362      * @param  array $filters
       
   363      * @param  int  $perPage
       
   364      * @param  string  $pageName
       
   365      * @return \Illuminate\Contracts\Pagination\LengthAwarePaginator
       
   366      */
       
   367     public function paginate($filters = null, $perPage = 15, $pageName = 'page', $page = null, $sort=null) {
       
   368 
   253         assert(is_numeric($perPage));
   369         assert(is_numeric($perPage));
   254 
   370 
   255         if(is_null($page)) {
   371         if(is_null($page)) {
   256             $page = Paginator::resolveCurrentPage($pageName);
   372             $page = Paginator::resolveCurrentPage($pageName);
   257         }
   373         }
   258 
   374 
   259         assert(is_null($page) || is_numeric($page));
   375         assert(is_null($page) || is_numeric($page));
   260 
   376 
   261         $total = $this->getCount();
       
   262 
       
   263         $offset = max(0,($page - 1) * $perPage);
   377         $offset = max(0,($page - 1) * $perPage);
   264 
   378 
   265         $results = $this->queryDocs($offset, $perPage);
   379         $results = $this->queryDocs($filters, $offset, $perPage, $sort);
   266 
   380 
   267         return new LengthAwarePaginator($results, $total, $perPage, $page, [
   381         return new LengthAwarePaginator($results['documents'], $results['meta']['total'], $perPage, $page, [
   268             'path' => Paginator::resolveCurrentPath(),
   382             'path' => Paginator::resolveCurrentPath(),
   269             'pageName' => $pageName,
   383             'pageName' => $pageName,
   270         ]);
   384         ]);
   271     }
   385 
       
   386     }
       
   387 
   272 
   388 
   273     /**
   389     /**
   274      * Resolve lexvo id for all documents in the list
   390      * Resolve lexvo id for all documents in the list
   275      * this allow to optimise the call of lexvo repository
   391      * this allow to optimise the call of lexvo repository
   276      * @param $docList Array: a list (Array) of document to resolve
   392      * @param $docList Array: a list (Array) of document to resolve
   278     public function resolveLexvo(Array $docList) {
   394     public function resolveLexvo(Array $docList) {
   279 
   395 
   280         $languageIds = [];
   396         $languageIds = [];
   281         #get the list pf language needing resolving
   397         #get the list pf language needing resolving
   282         foreach ($docList as $doc) {
   398         foreach ($docList as $doc) {
   283             if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
   399             if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
   284                 $languageIds[$doc->getLanguageValue()] = true;
   400                 foreach($doc->getLanguagesValue() as $lang) {
       
   401                     $languageIds[$lang]=true;
       
   402                 }
   285             }
   403             }
   286         }
   404         }
   287 
   405 
   288         # call LexvoResolver
   406         # call LexvoResolver
   289         $langNames = $this->lexvoResolver->getNames(array_keys($languageIds));
   407         $langNames = $this->lexvoResolver->getNames(array_keys($languageIds));
   290 
   408 
   291         foreach ($docList as $doc) {
   409         foreach ($docList as $doc) {
   292             if($doc->getLanguageValue() && is_null($doc->getLanguageResolved())) {
   410             if(!empty($doc->getLanguagesValue()) && is_null($doc->getLanguagesResolved())) {
   293                 $doc->setLanguageResolved($langNames[$doc->getLanguageValue()]);
   411                 $langResolved = [];
       
   412                 foreach($doc->getLanguagesValue() as $lang) {
       
   413                     $langResolved[] = $langNames[$lang];
       
   414                 }
       
   415                 $doc->setLanguageResolved($langResolved);
   294             }
   416             }
   295         }
   417         }
   296 
   418 
   297         return $docList;
   419         return $docList;
   298     }
   420     }