<?php
namespace CorpusParole\Console\Commands;
use Config;
use Log;
use Illuminate\Console\Command;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Input\InputArgument;
use Phpoaipmh\Client;
use Phpoaipmh\Endpoint;
class ImportCocoonRDF extends Command {

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Import Rdf from Cocoon.';

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';

    /**
     * Create a new command instance.
     */
    public function __construct() {
        parent::__construct();
    }

    /**
     * Execute the console command.
     *
     * Lists the OLAC records of the 'LanguesDeFrance' set from the configured
     * OAI-PMH endpoint. For every record past the first `--skip` ones, asks the
     * SPARQL store whether the graph named after the document URI already
     * contains triples; if it does not, the document RDF is downloaded (with
     * retries on timeout) and inserted into the store.
     *
     * @return void
     */
    public function fire() {
        libxml_use_internal_errors(true);

        $skip = (int)$this->option('skip');
        $this->comment("Skipping $skip records");

        // Loop-invariant configuration values, read once.
        $rdfBaseUri = Config::get('corpusparole.cocoon_rdf_base_uri');
        $docIdBase = Config::get('corpusparole.cocoon_doc_id_base');
        $docIdBaseUri = Config::get('corpusparole.cocoon_doc_id_base_uri');
        $maxLoadRetry = Config::get('corpusparole.max_load_retry', 3);

        $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));

        $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
        $endpoint = new Endpoint($client);

        $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');

        //TODO : treat timeout exceptions
        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        foreach ($recs as $item) {

            // A record without a header identifier cannot be mapped to a
            // document URL: report it and move on instead of indexing into an
            // empty xpath result.
            $identifiers = $item->xpath('/record/header/identifier');
            if(empty($identifiers)) {
                $progressBar->setMessage("record without identifier - Skipping");
                $progressBar->advance();
                Log::warning("Record without identifier encountered, skipping");
                continue;
            }

            $identifier = (string) $identifiers[0];
            // Part of the identifier that follows the configured id prefix.
            $localId = substr($identifier, strlen($docIdBase));
            $docRdfUrl = $rdfBaseUri.$localId;
            $message = "$identifier : $docRdfUrl";

            if($recs->getNumRetrieved() <= $skip) {
                $progressBar->setMessage("$message - Skipping");
                $progressBar->advance();
                continue;
            }

            $progressBar->setMessage($message);
            $progressBar->advance();

            // Only import documents whose graph holds no triple yet.
            $docUri = $docIdBaseUri.$localId;
            $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
            if($resDocs->getBoolean()) {
                continue;
            }

            $doc = $this->loadGraph($identifier, $docRdfUrl, $maxLoadRetry);
            if($doc === null) {
                continue;
            }

            //TODO: treat errors
            $subjects = $doc->resources();
            if(empty($subjects)) {
                // reset() on an empty array returns false; calling getUri() on
                // it would abort the whole import.
                $this->error("\nError processing $identifier ($docRdfUrl) : no resource found in document");
                Log::error("Error processing $identifier ($docRdfUrl) : no resource found in document");
                continue;
            }
            // NOTE(review): the graph is stored under the URI of the first
            // resource while the existence check above queries <$docUri>;
            // presumably both are the same URI - confirm.
            $subject = reset($subjects)->getUri();

            //TODO: exceptions ? but if pb on insert probably we have to fail anyway
            $gs->insert($doc, $subject);
        }

        $progressBar->setMessage("finished");
        $progressBar->finish();
    }

    /**
     * Download and parse the RDF document at $docRdfUrl.
     *
     * A failure whose exception code is 1 and whose message contains
     * "timed out" is retried, up to $maxRetry attempts in total. Any other
     * failure aborts immediately. All failures are reported on the console
     * and in the application log.
     *
     * @param string $identifier record identifier, used in messages only
     * @param string $docRdfUrl URL the RDF document is loaded from
     * @param int $maxRetry maximum number of load attempts
     * @return \EasyRdf\Graph|null the loaded graph, or null if it could not be loaded
     */
    private function loadGraph($identifier, $docRdfUrl, $maxRetry) {
        for($attempt = 1; $attempt <= $maxRetry; $attempt++) {
            try {
                $doc = new \EasyRdf\Graph($docRdfUrl);
                $doc->load();
                return $doc;
            }
            //TODO: catch network exception - add error to database
            catch(\Exception $e) {
                $code = $e->getCode();
                $errorMessage = $e->getMessage();
                $this->line("\nError processing $identifier. code : $code, message: $errorMessage");
                Log::debug("Error processing $identifier. code : $code, message: $errorMessage");

                // stripos() returns false when the needle is absent, and
                // `false >= 0` is true in PHP, so the result must be compared
                // strictly against false.
                if($code == 1 && stripos($errorMessage, 'timed out') !== false) {
                    $this->comment("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                    Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                    continue;
                }

                $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                Log::error("Error processing $identifier ($docRdfUrl) : $e");
                return null;
            }
        }

        // Every attempt timed out.
        $this->error("\nError processing $identifier ($docRdfUrl) : giving up after $maxRetry attempts");
        Log::error("Error processing $identifier ($docRdfUrl) : giving up after $maxRetry attempts");
        return null;
    }
}