server/src/app/Console/Commands/ImportCocoonRDF.php
changeset 4 f55970e41793
parent 3 2b3247d02769
child 18 f2a40bbc27f6
equal deleted inserted replaced
3:2b3247d02769 4:f55970e41793
     1 <?php
     1 <?php
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
     4 
     4 
     5 use Config;
     5 use Config;
       
     6 use Log;
     6 use Illuminate\Console\Command;
     7 use Illuminate\Console\Command;
     7 use Symfony\Component\Console\Input\InputOption;
     8 use Symfony\Component\Console\Input\InputOption;
     8 use Symfony\Component\Console\Input\InputArgument;
     9 use Symfony\Component\Console\Input\InputArgument;
     9 use Phpoaipmh\Client;
    10 use Phpoaipmh\Client;
    10 use Phpoaipmh\Endpoint;
    11 use Phpoaipmh\Endpoint;
    11 
    12 
    12 class ImportCocoonRDF extends Command
    13 class ImportCocoonRDF extends Command {
    13 {
       
    14     /**
       
    15      * The console command name.
       
    16      *
       
    17      * @var string
       
    18      */
       
    19     protected $name = 'corpus-parole:importRDF';
       
    20 
    14 
    21     /**
    15     /**
    22      * The console command description.
    16      * The console command description.
    23      *
    17      *
    24      * @var string
    18      * @var string
    25      */
    19      */
    26     protected $description = 'Command description.';
    20     protected $description = 'Import Rdf from Cocoon.';
       
    21 
       
    22     /**
       
    23     * The name and signature of the console command.
       
    24     *
       
    25     * @var string
       
    26     */
       
    27     protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
    27 
    28 
    28     /**
    29     /**
    29      * Create a new command instance.
    30      * Create a new command instance.
    30      */
    31      */
    31     public function __construct()
    32     public function __construct() {
    32     {
       
    33         parent::__construct();
    33         parent::__construct();
    34     }
    34     }
    35 
    35 
    36     /**
    36     /**
    37      * Execute the console command.
    37      * Execute the console command.
    38      *
    38      *
    39      * @return mixed
    39      * @return mixed
    40      */
    40      */
    41     public function fire()
    41     public function fire() {
    42     {
    42 
    43         echo("hello\n");
       
    44         libxml_use_internal_errors(true);
    43         libxml_use_internal_errors(true);
    45 
    44 
    46         //$gs = new \EasyRdf_GraphStore(Config::get('corpusparole.sesame_update_url'));
    45         $skip = (int)$this->option('skip');
    47         $gs = new \EasyRdf_Sparql_Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
       
    48 
    46 
    49         //$doc = new \EasyRdf_Graph("http://cocoon.huma-num.fr/exist/crdo/rdf/crdo-ESLO1_ENTCONT_203");
    47         $this->comment("Skipping $skip records");
    50         //$doc->load();
    48 
       
    49         $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
       
    50 
    51 
    51 
    52         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
    52         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
    53         //$client = new Client('http://memory.loc.gov/cgi-bin/oai2_0');
       
    54         $endpoint = new Endpoint($client);
    53         $endpoint = new Endpoint($client);
    55 
    54 
    56         $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
    55         $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
    57         //$recs = $endpoint->listRecords('oai_dc', null, null, 'mussm');
    56 
       
    57         //TODO : treat timeout exceptions
       
    58         $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
       
    59         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
    58 
    60 
    59         foreach ($recs as $item) {
    61         foreach ($recs as $item) {
    60             if ($recs->getNumRequests() > 1) {
    62 
    61                 break;
       
    62             }
       
    63             $identifier = (string) $item->xpath('/record/header/identifier')[0];
    63             $identifier = (string) $item->xpath('/record/header/identifier')[0];
    64             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
    64             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
    65             print("Processing $identifier : $docRdfUrl\n");
    65             $message = "$identifier : $docRdfUrl";
    66             $doc = new \EasyRdf_Graph($docRdfUrl);
    66             if($recs->getNumRetrieved() <= $skip) {
    67             $doc->load();
    67                 $progressBar->setMessage("$message - Skipping");
    68             $subjects = $doc->resourcesMatching('foaf:primaryTopic');
    68                 $progressBar->advance();
    69             $subject = reset($subjects)->getUri();
    69                 continue;
    70             $gs->insert($doc, $subject);
    70             }
       
    71             $progressBar->setMessage($message);
       
    72             $progressBar->advance();
       
    73 
       
    74             $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
       
    75             $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
       
    76             if(!$resDocs->getBoolean()) {
       
    77                 $docLoaded = false;
       
    78                 $loadRetry = 0;
       
    79                 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
       
    80                     $loadRetry++;
       
    81                     try {
       
    82                         $doc = new \EasyRdf\Graph($docRdfUrl);
       
    83                         $doc->load();
       
    84                         $docLoaded = true;
       
    85                     }
       
    86                     //TODO: catch network exception - add error to database
       
    87                     catch(\Exception $e) {
       
    88                         $code = $e->getCode();
       
    89                         $message = $e->getMessage();
       
    90                         $this->debug("\nError processing $identifier. code : $code, message: $message");
       
    91                         Log::debug("Error processing $identifier. code : $code, message: $message");
       
    92                         if($code == 1 && stripos($message, 'timed out')>=0 ) {
       
    93                             $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
    94                             Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
       
    95                             continue;
       
    96                         }
       
    97                         else {
       
    98                             $this->error("\nError processing $identifier ($docRdfUrl) : $e");
       
    99                             Log::error("Error processing $identifier ($docRdfUrl) : $e");
       
   100                             break;
       
   101                         }
       
   102                         //$this->error(print_r($e->getTraceAsString(),true));
       
   103                     }
       
   104                 }
       
   105                 if(!$docLoaded) {
       
   106                     continue;
       
   107                 }
       
   108                 //TODO: treat errors
       
   109                 $subjects = $doc->resources();
       
   110                 $subject = reset($subjects)->getUri();
       
   111                 //TODO: exceptions ? but if pb on insert probably we have to fail anyway
       
   112                 $gs->insert($doc, $subject);
       
   113             }
    71         }
   114         }
    72     }
   115         $progressBar->setMessage("finished");
    73 
   116         $progressBar->finish();
    74     /**
       
    75      * Get the console command arguments.
       
    76      *
       
    77      * @return array
       
    78      */
       
    79     protected function getArguments()
       
    80     {
       
    81         return [
       
    82             ['example', InputArgument::REQUIRED, 'An example argument.'],
       
    83         ];
       
    84     }
       
    85 
       
    86     /**
       
    87      * Get the console command options.
       
    88      *
       
    89      * @return array
       
    90      */
       
    91     protected function getOptions()
       
    92     {
       
    93         return [
       
    94             //['example', null, InputOption::VALUE_OPTIONAL, 'An example option.', null],
       
    95         ];
       
    96     }
   117     }
    97 }
   118 }