diff -r 2b3247d02769 -r f55970e41793 server/src/app/Console/Commands/ImportCocoonRDF.php
--- a/server/src/app/Console/Commands/ImportCocoonRDF.php	Wed Jun 24 01:36:46 2015 +0200
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php	Mon Oct 05 17:02:10 2015 +0200
@@ -3,33 +3,33 @@
 namespace CorpusParole\Console\Commands;
 
 use Config;
+use Log;
 use Illuminate\Console\Command;
 use Symfony\Component\Console\Input\InputOption;
 use Symfony\Component\Console\Input\InputArgument;
 use Phpoaipmh\Client;
 use Phpoaipmh\Endpoint;
 
-class ImportCocoonRDF extends Command
-{
-    /**
-     * The console command name.
-     *
-     * @var string
-     */
-    protected $name = 'corpus-parole:importRDF';
+class ImportCocoonRDF extends Command {
 
     /**
      * The console command description.
      *
      * @var string
      */
-    protected $description = 'Command description.';
+    protected $description = 'Import Rdf from Cocoon.';
+
+    /**
+     * The name and signature of the console command.
+     *
+     * @var string
+     */
+    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
 
     /**
      * Create a new command instance.
      */
-    public function __construct()
-    {
+    public function __construct() {
         parent::__construct();
     }
 
@@ -38,60 +38,89 @@
      *
      * @return mixed
      */
-    public function fire()
-    {
-        echo("hello\n");
+    public function fire() {
+        // Keep libxml parse errors internal instead of emitting them as PHP warnings.
+        libxml_use_internal_errors(true);
 
-        //$gs = new \EasyRdf_GraphStore(Config::get('corpusparole.sesame_update_url'));
-        $gs = new \EasyRdf_Sparql_Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+        $skip = (int)$this->option('skip');
 
-        //$doc = new \EasyRdf_Graph("http://cocoon.huma-num.fr/exist/crdo/rdf/crdo-ESLO1_ENTCONT_203");
-        //$doc->load();
+        $this->comment("Skipping $skip records");
+
+        $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+
         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
-        //$client = new Client('http://memory.loc.gov/cgi-bin/oai2_0');
 
         $endpoint = new Endpoint($client);
         $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
-        //$recs = $endpoint->listRecords('oai_dc', null, null, 'mussm');
+
+        //TODO : treat timeout exceptions
+        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
+        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
 
         foreach ($recs as $item) {
-            if ($recs->getNumRequests() > 1) {
-                break;
-            }
+
             $identifier = (string) $item->xpath('/record/header/identifier')[0];
             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
 
-            print("Processing $identifier : $docRdfUrl\n");
-            $doc = new \EasyRdf_Graph($docRdfUrl);
-            $doc->load();
-            $subjects = $doc->resourcesMatching('foaf:primaryTopic');
-            $subject = reset($subjects)->getUri();
-            $gs->insert($doc, $subject);
-        }
-    }
+            $message = "$identifier : $docRdfUrl";
+            if($recs->getNumRetrieved() <= $skip) {
+                $progressBar->setMessage("$message - Skipping");
+                $progressBar->advance();
+                continue;
+            }
+            $progressBar->setMessage($message);
+            $progressBar->advance();
 
-    /**
-     * Get the console command arguments.
-     *
-     * @return array
-     */
-    protected function getArguments()
-    {
-        return [
-            ['example', InputArgument::REQUIRED, 'An example argument.'],
-        ];
-    }
-
-    /**
-     * Get the console command options.
-     *
-     * @return array
-     */
-    protected function getOptions()
-    {
-        return [
-            //['example', null, InputOption::VALUE_OPTIONAL, 'An example option.', null],
-        ];
+            $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
+            $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
+            if(!$resDocs->getBoolean()) {
+                $docLoaded = false;
+                $loadRetry = 0;
+                while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
+                    $loadRetry++;
+                    try {
+                        $doc = new \EasyRdf\Graph($docRdfUrl);
+                        $doc->load();
+                        $docLoaded = true;
+                    }
+                    //TODO: catch network exception - add error to database
+                    catch(\Exception $e) {
+                        $code = $e->getCode();
+                        $errorMessage = $e->getMessage();
+                        $this->line("\nError processing $identifier. code : $code, message: $errorMessage");
+                        Log::debug("Error processing $identifier. code : $code, message: $errorMessage");
+                        // stripos() returns false when the needle is absent, so the result must be compared strictly
+                        if($code == 1 && stripos($errorMessage, 'timed out') !== false) {
+                            $this->comment("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                            Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                            continue;
+                        }
+                        else {
+                            $this->error("\nError processing $identifier ($docRdfUrl) : $e");
+                            Log::error("Error processing $identifier ($docRdfUrl) : $e");
+                            break;
+                        }
+                        //$this->error(print_r($e->getTraceAsString(),true));
+                    }
+                }
+                if(!$docLoaded) {
+                    continue;
+                }
+                //TODO: treat errors
+                //NOTE(review): the ASK above tests graph <$docUri> but insert() below names the graph after the first resource URI - confirm they match
+                $subjects = $doc->resources();
+                if(empty($subjects)) {
+                    $this->error("\nNo resource found in $docRdfUrl, skipping $identifier");
+                    Log::error("No resource found in $docRdfUrl, skipping $identifier");
+                    continue;
+                }
+                $subject = reset($subjects)->getUri();
+                //TODO: exceptions ? but if pb on insert probably we have to fail anyway
+                $gs->insert($doc, $subject);
+            }
+        }
+        $progressBar->setMessage("finished");
+        $progressBar->finish();
     }
 
 }