server/src/app/Console/Commands/ImportCocoonRDF.php
changeset 4 f55970e41793
parent 3 2b3247d02769
child 18 f2a40bbc27f6
--- a/server/src/app/Console/Commands/ImportCocoonRDF.php	Wed Jun 24 01:36:46 2015 +0200
+++ b/server/src/app/Console/Commands/ImportCocoonRDF.php	Mon Oct 05 17:02:10 2015 +0200
@@ -3,33 +3,33 @@
 namespace CorpusParole\Console\Commands;
 
 use Config;
+use Log;
 use Illuminate\Console\Command;
 use Symfony\Component\Console\Input\InputOption;
 use Symfony\Component\Console\Input\InputArgument;
 use Phpoaipmh\Client;
 use Phpoaipmh\Endpoint;
 
-class ImportCocoonRDF extends Command
-{
-    /**
-     * The console command name.
-     *
-     * @var string
-     */
-    protected $name = 'corpus-parole:importRDF';
+class ImportCocoonRDF extends Command {
 
     /**
      * The console command description.
      *
      * @var string
      */
-    protected $description = 'Command description.';
+    protected $description = 'Import Rdf from Cocoon.';
+
+    /**
+     * Name and signature of the console command; the --skip option (default 0) is read by fire() as a record count.
+     *
+     * @var string
+     */
+    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}';
 
     /**
      * Create a new command instance.
      */
-    public function __construct()
-    {
+    public function __construct() {
         parent::__construct();
     }
 
@@ -38,60 +38,94 @@
      *
      * @return mixed
      */
-    public function fire()
-    {
-        echo("hello\n");
+    public function fire() {
+        // Iterate over the OAI-PMH records of the 'LanguesDeFrance' set and insert each document's RDF
+        // graph into the SPARQL store, unless the record is skipped or its graph already holds triples.
+
         libxml_use_internal_errors(true);
 
-        //$gs = new \EasyRdf_GraphStore(Config::get('corpusparole.sesame_update_url'));
-        $gs = new \EasyRdf_Sparql_Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+        $skip = (int)$this->option('skip');
 
-        //$doc = new \EasyRdf_Graph("http://cocoon.huma-num.fr/exist/crdo/rdf/crdo-ESLO1_ENTCONT_203");
-        //$doc->load();
+        $this->comment("Skipping $skip records");
+
+        $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url'));
+
 
         $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
-        //$client = new Client('http://memory.loc.gov/cgi-bin/oai2_0');
         $endpoint = new Endpoint($client);
 
         $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');
-        //$recs = $endpoint->listRecords('oai_dc', null, null, 'mussm');
+
+        //TODO : treat timeout exceptions
+        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
+        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
 
         foreach ($recs as $item) {
-            if ($recs->getNumRequests() > 1) {
-                break;
-            }
+
             $identifier = (string) $item->xpath('/record/header/identifier')[0];
             $docRdfUrl = Config::get('corpusparole.cocoon_rdf_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
-            print("Processing $identifier : $docRdfUrl\n");
-            $doc = new \EasyRdf_Graph($docRdfUrl);
-            $doc->load();
-            $subjects = $doc->resourcesMatching('foaf:primaryTopic');
-            $subject = reset($subjects)->getUri();
-            $gs->insert($doc, $subject);
-        }
-    }
+            $message = "$identifier : $docRdfUrl";
+            // Records with getNumRetrieved() <= $skip only advance the progress bar; they are not processed.
+            if($recs->getNumRetrieved() <= $skip) {
+                $progressBar->setMessage("$message - Skipping");
+                $progressBar->advance();
+                continue;
+            }
+            $progressBar->setMessage($message);
+            $progressBar->advance();
 
-    /**
-     * Get the console command arguments.
-     *
-     * @return array
-     */
-    protected function getArguments()
-    {
-        return [
-            ['example', InputArgument::REQUIRED, 'An example argument.'],
-        ];
-    }
-
-    /**
-     * Get the console command options.
-     *
-     * @return array
-     */
-    protected function getOptions()
-    {
-        return [
-            //['example', null, InputOption::VALUE_OPTIONAL, 'An example option.', null],
-        ];
+            $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base')));
+            // Ask the store whether graph <$docUri> already holds a triple; load the document only if not.
+            $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
+            if(!$resDocs->getBoolean()) {
+                $docLoaded = false;
+                $loadRetry = 0;
+                // Try loading up to corpusparole.max_load_retry times (default 3); only timeouts are retried.
+                while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) {
+                    $loadRetry++;
+                    try {
+                        $doc = new \EasyRdf\Graph($docRdfUrl);
+                        $doc->load();
+                        $docLoaded = true;
+                    }
+                    //TODO: catch network exception - add error to database
+                    catch(\Exception $e) {
+                        $code = $e->getCode();
+                        $errMessage = $e->getMessage();
+                        // This class defines no debug()/warning() methods: use line()/comment() for console output.
+                        $this->line("\nError processing $identifier. code : $code, message: $errMessage");
+                        Log::debug("Error processing $identifier. code : $code, message: $errMessage");
+                        // stripos() returns false when the needle is absent and (false >= 0) is true: compare strictly.
+                        if($code == 1 && stripos($errMessage, 'timed out') !== false) {
+                            $this->comment("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                            Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
+                            continue;
+                        }
+                        else {
+                            $this->error("\nError processing $identifier ($docRdfUrl) : $e");
+                            Log::error("Error processing $identifier ($docRdfUrl) : $e");
+                            break;
+                        }
+                        //$this->error(print_r($e->getTraceAsString(),true));
+                    }
+                }
+                if(!$docLoaded) {
+                    continue;
+                }
+                //TODO: treat errors
+                $subjects = $doc->resources();
+                if(empty($subjects)) {
+                    // reset() returns false on an empty array; skip the record instead of calling getUri() on false.
+                    $this->error("\nError processing $identifier ($docRdfUrl) : no resource found in document");
+                    Log::error("Error processing $identifier ($docRdfUrl) : no resource found in document");
+                    continue;
+                }
+                $subject = reset($subjects)->getUri();
+                //TODO: exceptions ? but if pb on insert probably we have to fail anyway
+                $gs->insert($doc, $subject);
+            }
+        }
+        $progressBar->setMessage("finished");
+        $progressBar->finish();
     }
 }