server/src/app/Console/Commands/ImportCocoonRDF.php
changeset 19 eadaf0b8f02e
parent 18 f2a40bbc27f6
child 114 8af5ed0521a2
equal deleted inserted replaced
18:f2a40bbc27f6 19:eadaf0b8f02e
    12 
    12 
    13 class ImportCocoonRDF extends Command {
    13 class ImportCocoonRDF extends Command {
    14 
    14 
    15     const INSERT_TIMEOUT_RETRY = 5;
    15     const INSERT_TIMEOUT_RETRY = 5;
    16 
    16 
       
    /**
     * Mapper class to instantiate for each supported DCMI type URI.
     *
     * Keys are DCMI type URIs, values are fully qualified mapper class names.
     * Sound and MovingImage documents are handled by the same mapper class.
     */
    const MAPPER_CLASS_MAP = [
        "http://purl.org/dc/dcmitype/Sound"       => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/Text"        => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper',
        "http://purl.org/dc/dcmitype/Collection"  => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper',
    ];
       
    23 
    17     /**
    24     /**
    18      * The console command description.
    25      * The console command description.
    19      *
    26      *
    20      * @var string
    27      * @var string
    21      */
    28      */
    24     /**
    31     /**
    25     * The name and signature of the console command.
    32     * The name and signature of the console command.
    26     *
    33     *
    27     * @var string
    34     * @var string
    28     */
    35     */
    29     protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
    36     protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';
    30 
    37 
    /**
     * Build the command instance.
     *
     * No state of its own is initialised here; construction is delegated
     * entirely to the parent Command constructor.
     */
    public function __construct() {
        parent::__construct();
    }
    37 
    44 
    38     /**
    /**
     * Collect the DCMI type URIs declared for a document in an RDF graph.
     *
     * Every resource-valued dc11:type of $docUri whose URI starts with
     * 'http://purl.org/dc/dcmitype/' is kept, in the order the graph yields
     * them. When none is found and the graph contains at least one
     * edm:Collection resource, the DCMI Collection type is reported instead.
     *
     * @param mixed  $doc    graph object exposing resource() and allOfType()
     *                       (presumably an \EasyRdf\Graph - confirm against callers)
     * @param string $docUri URI of the resource whose types are inspected
     *
     * @return array list of DCMI type URIs, possibly empty
     */
    private function getDocTypes($doc, $docUri) {
        $dcmiPrefix = 'http://purl.org/dc/dcmitype/';
        $prefixLength = strlen($dcmiPrefix);

        $found = [];
        $typeResources = $doc->resource($docUri)->all("dc11:type", "resource");
        foreach ($typeResources as $typeResource) {
            $uri = $typeResource->getUri();
            // Only types from the DCMI type vocabulary are of interest.
            if (strncmp($uri, $dcmiPrefix, $prefixLength) !== 0) {
                continue;
            }
            $found[] = $uri;
        }

        // No explicit DCMI type: fall back on the presence of an edm:Collection.
        if (empty($found) && !empty($doc->allOfType('edm:Collection'))) {
            $found[] = "http://purl.org/dc/dcmitype/Collection";
        }

        return $found;
    }
       
    68 
       
    69 
       
    70     /**
    39      * Execute the console command.
    71      * Execute the console command.
    40      *
    72      *
    41      * @return mixed
    73      * @return mixed
    42      */
    74      */
    43     public function fire() {
    75     public function fire() {
    44 
    76 
    45         libxml_use_internal_errors(true);
    77         libxml_use_internal_errors(true);
    46 
    78 
    47         $skip = (int)$this->option('skip');
    79         $skip = (int)$this->option('skip');
       
    80         $raw = $this->option('raw');
    48 
    81 
    49         $this->comment("Skipping $skip records");
    82         $this->comment("Skipping $skip records");
       
    83         $this->comment("Recording raw queries: $raw");
    50 
    84 
    51         $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
    85         $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
       
    86         $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url_raw'), Config::get('corpusparole.sesame_update_url_raw'));
    52 
    87 
    53 
    88 
    54         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
    89         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
    55         $endpoint = new Endpoint($client);
    90         $endpoint = new Endpoint($client);
    56 
    91 
    59         //TODO : treat timeout exceptions
    94         //TODO : treat timeout exceptions
    60         $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
    95         $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
    61         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
    96         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
    62 
    97 
    63         $insertTimeouts = 0;
    98         $insertTimeouts = 0;
       
    99 
       
   100         $documentCounts = ['all' => 0, 'unknown' => 0];
    64 
   101 
    65         foreach ($recs as $item) {
   102         foreach ($recs as $item) {
    66 
   103 
    67             $identifier = (string) $item->xpath('/record/header/identifier')[0];
   104             $identifier = (string) $item->xpath('/record/header/identifier')[0];
    68             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
   105             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
    75             $progressBar->setMessage($message);
   112             $progressBar->setMessage($message);
    76             $progressBar->advance();
   113             $progressBar->advance();
    77 
   114 
    78             $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
   115             $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
    79 
   116 
    80             $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
   117             $docLoaded = false;
    81             if(!$resDocs->getBoolean()) {
   118             $loadRetry = 0;
    82                 $docLoaded = false;
   119             $doc = null;
    83                 $loadRetry = 0;
   120             while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
    84                 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
   121                 $loadRetry++;
    85                     $loadRetry++;
       
    86                     try {
       
    87                         $doc = new \EasyRdf\Graph($docRdfUrl);
       
    88                         $doc->load();
       
    89                         $docLoaded = true;
       
    90                     }
       
    91                     //TODO: catch network exception - add error to database
       
    92                     catch(\Exception $e) {
       
    93                         $code = $e->getCode();
       
    94                         $message = $e->getMessage();
       
    95                         $this->debug("\nError processing $identifier. code : $code, message: $message");
       
    96                         Log::debug("Error processing $identifier. code : $code, message: $message");
       
    97                         if($code == 1 && stripos($message, 'timed out')>=0 ) {
       
    98                             $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
    99                             Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
   100                             continue;
       
   101                         }
       
   102                         else {
       
   103                             $this->error("\nError processing $identifier ($docRdfUrl) : $e");
       
   104                             Log::error("Error processing $identifier ($docRdfUrl) : $e");
       
   105                             break;
       
   106                         }
       
   107                         //$this->error(print_r($e->getTraceAsString(),true));
       
   108                     }
       
   109                 }
       
   110                 if(!$docLoaded) {
       
   111                     continue;
       
   112                 }
       
   113                 //TODO: treat errors
       
   114                 $subjects = $doc->resources();
       
   115                 $subject = reset($subjects)->getUri();
       
   116                 try {
   122                 try {
   117                     $gs->insert($doc, $subject);
   123                     $doc = new \EasyRdf\Graph($docRdfUrl);
       
   124                     $doc->load();
       
   125                     $docLoaded = true;
       
   126                 }
       
   127                 //TODO: catch network exception - add error to database
       
   128                 catch(\Exception $e) {
       
   129                     $code = $e->getCode();
       
   130                     $message = $e->getMessage();
       
   131                     $this->info("\nError processing $identifier. code : $code, message: $message");
       
   132                     Log::debug("Error processing $identifier. code : $code, message: $message");
       
   133                     if($code == 0 && stripos($message, 'timed out')>=0 ) {
       
   134                         $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
   135                         Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
   136                         continue;
       
   137                     }
       
   138                     else {
       
   139                         $this->error("\nError processing $identifier ($docRdfUrl) : $e");
       
   140                         Log::error("Error processing $identifier ($docRdfUrl) : $e");
       
   141                         break;
       
   142                     }
       
   143                     //$this->error(print_r($e->getTraceAsString(),true));
       
   144                 }
       
   145             }
       
   146             if(!$docLoaded) {
       
   147                 continue;
       
   148             }
       
   149 
       
   150             //insert raw
       
   151             if($raw) {
       
   152                 $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
       
   153                 if($resDocsRaw->getBoolean()) {
       
   154                     $gs_raw->clear($docUri);
       
   155                 }
       
   156                 $gs_raw->insert($doc, $docUri);
       
   157             }
       
   158 
       
   159             //map doc
       
   160             $inputDocTypes = $this->getDocTypes($doc, $docUri);
       
   161 
       
   162             $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;
       
   163 
       
   164             if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) {
       
   165                 $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
       
   166                 Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
       
   167                 $documentCounts['unknown'] += 1;
       
   168                 continue;
       
   169             }
       
   170             $documentCounts['all'] += 1;
       
   171             $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;
       
   172 
       
   173             $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType];
       
   174             $mapper = new $mapperClass($doc, $docUri);
       
   175 
       
   176             $mapper->mapGraph();
       
   177             $mappedGraphes = $mapper->getOutputGraphes();
       
   178 
       
   179             foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) {
       
   180 
       
   181                 $mappedGraphUri = $mappedGraph->getUri();
       
   182                 try {
       
   183                     $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
       
   184                 } catch (\Exception $e) {
       
   185                     $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n");
       
   186                     Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody());
       
   187                     exit;
       
   188                 }
       
   189 
       
   190                 $mergedGraph = null;
       
   191                 $doDelete = true;
       
   192 
       
   193                 if($resDocs->isEmpty()) {
       
   194                     $mergedGraph = $mappedGraph;
       
   195                     $doDelete = false;
       
   196                 }
       
   197                 else {
       
   198                     $doDelete = true;
       
   199                     $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);
       
   200                     $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri);
       
   201 
       
   202                     if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
       
   203                         $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
       
   204                         $baseGraph = $resDocs;
       
   205                         $sourceGraph = $mappedGraph;
       
   206                     }
       
   207                     elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
       
   208                         $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
       
   209                         $baseGraph = $resDocs;
       
   210                         $sourceGraph = $mappedGraph;
       
   211                     }
       
   212                     else {
       
   213                         $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
       
   214                         $baseGraph = $mappedGraph;
       
   215                         $sourceGraph = $resDocs;
       
   216                     }
       
   217                     $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
       
   218                     if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
       
   219                         //graph are isomorphic no need to go farther for this graph
       
   220                         Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
       
   221                         continue;
       
   222                     }
       
   223                 }
       
   224 
       
   225                 try {
       
   226                     if($doDelete) {
       
   227                         $gs->clear($mappedGraphUri);
       
   228                     }
       
   229                     $gs->insert($mergedGraph, $mappedGraphUri);
   118                 }
   230                 }
   119                 catch(\Exception $e) {
   231                 catch(\Exception $e) {
   120                     // just log not much we can do here...
   232                     // just log not much we can do here...
   121                     $this->error("\nError on insert $identifier ($docRdfUrl) : $e");
   233                     $this->error("\nError on insert $identifier ($docRdfUrl) : $e");
   122                     Log::error("Error on insert $identifier ($docRdfUrl) : $e");
   234                     Log::error("Error on insert $identifier ($docRdfUrl) : $e");
   132                 }
   244                 }
   133             }
   245             }
   134         }
   246         }
   135         $progressBar->setMessage("finished");
   247         $progressBar->setMessage("finished");
   136         $progressBar->finish();
   248         $progressBar->finish();
       
   249 
       
   250         $this->info("\nDocument count info: ");
       
   251         foreach ($documentCounts as $docType => $docCount) {
       
   252             $this->info("$docType => $docCount");
       
   253         }
   137     }
   254     }
   138 }
   255 }