<?php
namespace CorpusParole\Console\Commands;
use Config;
use Log;
use Illuminate\Console\Command;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Input\InputArgument;
use Phpoaipmh\Client;
use Phpoaipmh\Endpoint;
/**
 * Artisan command importing the Cocoon RDF documents into the triple store.
 *
 * The records are enumerated through the Cocoon OAI-PMH endpoint; for each
 * record the RDF graph is downloaded, optionally stored as-is in the "raw"
 * repository, mapped with the mapper matching its DCMI type, merged with the
 * graph already present in the repository (if any) and finally inserted.
 */
class ImportCocoonRDF extends Command {

    /**
     * Insert timeouts are tolerated (logged and skipped) while the running
     * count of timeouts is at most this value; past it the exception is rethrown.
     */
    const INSERT_TIMEOUT_RETRY = 5;

    /**
     * Mapper class to use for each supported DCMI type.
     */
    const MAPPER_CLASS_MAP = [
        "http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper',
        "http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper'
    ];

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Import Rdf from Cocoon.';

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';

    /**
     * Create a new command instance.
     */
    public function __construct() {
        parent::__construct();
    }

    /**
     * Get the list of dcmi types for the graph.
     *
     * Collects the "dc11:type" resources of $docUri whose URI starts with the
     * DCMI type namespace. When none is found and the graph contains at least
     * one "edm:Collection" resource, the DCMI Collection type is returned.
     *
     * @param mixed $doc the graph to inspect (must provide resource() and allOfType())
     * @param string $docUri the uri of the document resource in the graph
     * @return array the list of DCMI type uris, possibly empty
     */
    private function getDocTypes($doc, $docUri) {

        $res = $doc->resource($docUri);
        $docTypes = [];
        foreach ($res->all("dc11:type","resource") as $resType) {
            $type = $resType->getUri();
            if(0 === strpos($type, 'http://purl.org/dc/dcmitype/')) {
                $docTypes[] = $type;
            }
        }

        // if the doc type list is empty, check that we have a collection
        if(empty($docTypes) && !empty($doc->allOfType('edm:Collection'))) {
            $docTypes[] = "http://purl.org/dc/dcmitype/Collection";
        }
        return $docTypes;
    }

    /**
     * Tell if an exception message denotes a timeout.
     *
     * stripos() returns false (not -1) when the needle is absent, so the
     * result must be compared strictly against false.
     *
     * @param string $message the exception message
     * @return bool true if the message contains "timed out" (case insensitive)
     */
    private function isTimeoutMessage($message) {
        return stripos($message, 'timed out') !== false;
    }

    /**
     * Execute the console command.
     *
     * @return mixed
     */
    public function fire() {

        libxml_use_internal_errors(true);

        $skip = (int)$this->option('skip');
        $raw = $this->option('raw');

        $this->comment("Skipping $skip records");
        $this->comment("Recording raw queries: ".($raw?'TRUE':'FALSE'));

        $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
        $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));

        $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
        $endpoint = new Endpoint($client);

        $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');

        //TODO : treat timeout exceptions
        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        // configuration values are loop invariant: read them once
        $docIdBaseLength = strlen(Config::get('corpusparole.cocoon_doc_id_base'));
        $rdfBaseUri = Config::get('corpusparole.cocoon_rdf_base_uri');
        $docIdBaseUri = Config::get('corpusparole.cocoon_doc_id_base_uri');
        $maxLoadRetry = Config::get('corpusparole.max_load_retry', 3);

        $insertTimeouts = 0;
        $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0];

        foreach ($recs as $item) {

            $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
            $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];

            // part of the OAI identifier following the configured identifier base
            $localId = substr($identifier, $docIdBaseLength);
            $docRdfUrl = $rdfBaseUri.$localId;
            $message = "$identifier : $docRdfUrl";

            if($recs->getNumRetrieved() <= $skip) {
                $progressBar->setMessage("$message - Skipping");
                $progressBar->advance();
                continue;
            }

            $progressBar->setMessage($message);
            $progressBar->advance();

            $docUri = $docIdBaseUri.$localId;

            // load the source graph, retrying on timeouts only
            $docLoaded = false;
            $loadRetry = 0;
            $doc = null;
            while(!$docLoaded && $loadRetry < $maxLoadRetry) {
                $loadRetry++;
                try {
                    $doc = new \EasyRdf\Graph($docRdfUrl);
                    $doc->load();
                    $docLoaded = true;
                }
                //TODO: catch network exception - add error to database
                catch(\Exception $e) {
                    $code = $e->getCode();
                    $errorMessage = $e->getMessage();
                    $this->info("\nError processing $identifier. code : $code, message: $errorMessage");
                    Log::debug("Error processing $identifier. code : $code, message: $errorMessage");
                    if($code == 400 || ($code == 0 && $this->isTimeoutMessage($errorMessage))) {
                        $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        continue;
                    }
                    // not a timeout: retrying will not help, give up on this document
                    $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                    Log::error("Error processing $identifier ($docRdfUrl) : $e");
                    break;
                }
            }
            if(!$docLoaded) {
                $documentCounts['error'] += 1;
                continue;
            }

            //insert raw
            if($raw) {
                $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
                if($resDocsRaw->getBoolean()) {
                    $gs_raw->clear($docUri);
                }
                $gs_raw->insert($doc, $docUri);
            }

            //map doc
            $inputDocTypes = $this->getDocTypes($doc, $docUri);
            $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;

            if(is_null($docType) || !array_key_exists($docType, self::MAPPER_CLASS_MAP)) {
                $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
                Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
                $documentCounts['unknown'] += 1;
                continue;
            }

            $mapperClass = self::MAPPER_CLASS_MAP[$docType];
            $mapper = new $mapperClass($doc, $docUri);

            try {
                $mapper->mapGraph();
            } catch (\Exception $e) {
                // NOTE(review): processing goes on with whatever the mapper produced
                // before failing - confirm that storing a partial mapping is intended.
                Log::error("Error processing $identifier ($docRdfUrl) : error mapping graph : $e");
                $documentCounts['error'] += 1;
            }
            $documentCounts['all'] += 1;
            $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;

            $mappedGraphes = $mapper->getOutputGraphes();

            foreach ($mappedGraphes as $mappedGraphKey => $mappedGraph) {

                $mappedGraphUri = $mappedGraph->getUri();
                try {
                    $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
                } catch (\Exception $e) {
                    // only some exceptions (http ones) carry a response body
                    $body = method_exists($e, 'getBody') ? $e->getBody() : '';
                    $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $body . "\n");
                    Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $body);
                    // abort the import and report the failure through the exit status
                    exit(1);
                }

                $mergedGraph = null;
                $doDelete = true;

                if($resDocs->isEmpty()) {
                    // nothing in the repository yet: insert the mapped graph as-is
                    $mergedGraph = $mappedGraph;
                    $doDelete = false;
                }
                else {
                    $doDelete = true;
                    $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);

                    // the merger class and the merge direction depend on the document type
                    if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
                        $baseGraph = $resDocs;
                        $sourceGraph = $mappedGraph;
                    }
                    elseif ($docType == "http://purl.org/dc/dcmitype/Text") {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
                        $baseGraph = $resDocs;
                        $sourceGraph = $mappedGraph;
                    }
                    else {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
                        $baseGraph = $mappedGraph;
                        $sourceGraph = $resDocs;
                    }
                    $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
                    if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
                        //graph are isomorphic no need to go farther for this graph
                        Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
                        continue;
                    }
                }

                try {
                    if($doDelete) {
                        $gs->clear($mappedGraphUri);
                    }
                    $gs->insert($mergedGraph, $mappedGraphUri);
                }
                catch(\Exception $e) {
                    // just log not much we can do here...
                    $this->error("\nError on insert $identifier ($docRdfUrl) : $e");
                    Log::error("Error on insert $identifier ($docRdfUrl) : $e");
                    // the class name must be fully qualified: a relative name would be
                    // resolved against the current namespace and never match
                    if($e instanceof \EasyRdf\Exception && $this->isTimeoutMessage($e->getMessage()) && $insertTimeouts <= self::INSERT_TIMEOUT_RETRY) {
                        $this->info("\nThis is a timeout, we continue.");
                        Log::info("This is a timeout, we continue.");
                        $insertTimeouts++;
                        continue;
                    }
                    throw $e;
                }
            }
        }
        $progressBar->setMessage("finished");
        $progressBar->finish();

        $this->info("\nDocument count info: ");
        foreach ($documentCounts as $docType => $docCount) {
            if($docType == 'error' && $docCount > 0) {
                $this->error("$docType => $docCount");
            } else {
                $this->info("$docType => $docCount");
            }
        }
    }
}