--- a/server/src/app/Console/Commands/ImportCocoonRDF.php Wed Jun 24 01:36:46 2015 +0200
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php Mon Oct 05 17:02:10 2015 +0200
@@ -3,33 +3,33 @@
namespace CorpusParole\Console\Commands;
use Config;
+use Log;
use Illuminate\Console\Command;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Input\InputArgument;
use Phpoaipmh\Client;
use Phpoaipmh\Endpoint;
-class ImportCocoonRDF extends Command
-{
- /**
- * The console command name.
- *
- * @var string
- */
- protected $name = 'corpus-parole:importRDF';
+class ImportCocoonRDF extends Command {
/**
* The console command description.
*
* @var string
*/
- protected $description = 'Command description.';
+ protected $description = 'Import Rdf from Cocoon.';
+
+ /**
+ * The name and signature of the console command.
+ *
+ * Declares the command name `corpus-parole:importRDF` and one option,
+ * `--skip` (default 0). fire() casts the option to an integer and passes
+ * over records while the iterator's retrieved count is <= that value.
+ *
+ * @var string
+ */
+ protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
/**
* Create a new command instance.
*/
- public function __construct()
- {
+ public function __construct() {
parent::__construct();
}
@@ -38,60 +38,81 @@
*
* @return mixed
*/
- public function fire()
- {
- echo("hello\n");
+ public function fire() {
+
+ // Import flow: iterate over the records returned by
+ // listRecords('olac', null, null, 'LanguesDeFrance'); for each record whose
+ // named graph is not yet present in the SPARQL store, load its RDF document
+ // and insert it. The first --skip records are only counted on the progress bar.
libxml_use_internal_errors(true);
- //$gs = new \EasyRdf_GraphStore(Config::get('corpusparole.sesame_update_url'));
- $gs = new \EasyRdf_Sparql_Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+ $skip = (int)$this->option('skip');
- //$doc = new \EasyRdf_Graph("http://cocoon.huma-num.fr/exist/crdo/rdf/crdo-ESLO1_ENTCONT_203");
- //$doc->load();
+ $this->comment("Skipping $skip records");
+
+ $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+
$client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
- //$client = new Client('http://memory.loc.gov/cgi-bin/oai2_0');
$endpoint = new Endpoint($client);
$recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
- //$recs = $endpoint->listRecords('oai_dc', null, null, 'mussm');
+
+ //TODO : treat timeout exceptions
+ $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
+ $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
foreach ($recs as $item) {
- if ($recs->getNumRequests() > 1) {
- break;
- }
+
$identifier = (string) $item->xpath('/record/header/identifier')[0];
$docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
- print("Processing $identifier : $docRdfUrl\n");
- $doc = new \EasyRdf_Graph($docRdfUrl);
- $doc->load();
- $subjects = $doc->resourcesMatching('foaf:primaryTopic');
- $subject = reset($subjects)->getUri();
- $gs->insert($doc, $subject);
- }
- }
+ $message = "$identifier : $docRdfUrl";
+ if($recs->getNumRetrieved() <= $skip) {
+ $progressBar->setMessage("$message - Skipping");
+ $progressBar->advance();
+ continue;
+ }
+ $progressBar->setMessage($message);
+ $progressBar->advance();
- /**
- * Get the console command arguments.
- *
- * @return array
- */
- protected function getArguments()
- {
- return [
- ['example', InputArgument::REQUIRED, 'An example argument.'],
- ];
- }
-
- /**
- * Get the console command options.
- *
- * @return array
- */
- protected function getOptions()
- {
- return [
- //['example', null, InputOption::VALUE_OPTIONAL, 'An example option.', null],
- ];
+ $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
+ // Only import documents whose named graph holds no triple yet.
+ $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
+ if(!$resDocs->getBoolean()) {
+ $docLoaded = false;
+ $loadRetry = 0;
+ // Retry the load on timeouts, up to corpusparole.max_load_retry attempts (default 3).
+ while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
+ $loadRetry++;
+ try {
+ $doc = new \EasyRdf\Graph($docRdfUrl);
+ $doc->load();
+ $docLoaded = true;
+ }
+ //TODO: catch network exception - add error to database
+ catch(\Exception $e) {
+ // Dedicated names: $message holds the progress-bar text and must not be clobbered.
+ $errCode = $e->getCode();
+ $errMessage = $e->getMessage();
+ $this->line("\nError processing $identifier. code : $errCode, message: $errMessage");
+ Log::debug("Error processing $identifier. code : $errCode, message: $errMessage");
+ // stripos() returns false when the needle is absent, and (false >= 0) is true
+ // in PHP, so the result has to be compared strictly against false.
+ if($errCode == 1 && stripos($errMessage, 'timed out') !== false) {
+ $this->comment("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
+ Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
+ continue;
+ }
+ else {
+ $this->error("\nError processing $identifier ($docRdfUrl) : $e");
+ Log::error("Error processing $identifier ($docRdfUrl) : $e");
+ break;
+ }
+ //$this->error(print_r($e->getTraceAsString(),true));
+ }
+ }
+ if(!$docLoaded) {
+ continue;
+ }
+ //TODO: treat errors
+ $subjects = $doc->resources();
+ // reset() returns false on an empty array; skip documents that yield no resource
+ // instead of failing on ->getUri().
+ if(empty($subjects)) {
+ $this->error("\nNo resource found for $identifier ($docRdfUrl), skipping");
+ Log::error("No resource found for $identifier ($docRdfUrl), skipping");
+ continue;
+ }
+ $subject = reset($subjects)->getUri();
+ //TODO: exceptions ? but if pb on insert probably we have to fail anyway
+ $gs->insert($doc, $subject);
+ }
+ }
+ $progressBar->setMessage("finished");
+ $progressBar->finish();
}
}