server/src/app/Console/Commands/ImportCocoonRDF.php
author ymh <ymh.work@gmail.com>
Thu, 10 Nov 2016 15:35:18 +0100
changeset 412 c88746153ee0
parent 410 240ca282331d
child 506 8a5bb4b48b85
permissions -rw-r--r--
correct import

<?php

namespace CorpusParole\Console\Commands;

use Config;
use Log;
use Illuminate\Console\Command;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Input\InputArgument;
use Phpoaipmh\Client;
use Phpoaipmh\Endpoint;

class ImportCocoonRDF extends Command {

    /**
     * Threshold compared against the running count of insert timeouts:
     * an insert timeout is tolerated (logged, then skipped) as long as the
     * count has not exceeded this value; otherwise the exception is rethrown.
     */
    const INSERT_TIMEOUT_RETRY = 5;

    /**
     * Maps a DCMI type URI to the mapper class used to convert a document
     * of that type. Documents whose first DCMI type is not a key of this
     * map are counted as "unknown" and skipped.
     */
    const MAPPER_CLASS_MAP = [
        "http://purl.org/dc/dcmitype/Sound" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/MovingImage" => '\CorpusParole\Libraries\Mappers\CocoonSoundRdfMapper',
        "http://purl.org/dc/dcmitype/Text" => '\CorpusParole\Libraries\Mappers\CocoonTextRdfMapper',
        "http://purl.org/dc/dcmitype/Collection" => '\CorpusParole\Libraries\Mappers\CocoonCollectionRdfMapper'
    ];

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = 'Import Rdf from Cocoon.';

    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}';

    /**
     * Create a new command instance.
     */
    public function __construct() {
        parent::__construct();
    }

    /**
     * Get the list of DCMI types declared for a document in a graph.
     *
     * Collects every "dc11:type" resource value of $docUri whose URI starts
     * with "http://purl.org/dc/dcmitype/". When none is found and the graph
     * contains at least one resource of type "edm:Collection", the DCMI
     * Collection type is returned instead.
     *
     * @param mixed  $doc    graph object exposing resource() and allOfType()
     *                       (an EasyRdf graph in the calls made by this class)
     * @param string $docUri URI of the document resource inside the graph
     * @return string[] list of DCMI type URIs, possibly empty
     */
    private function getDocTypes($doc, $docUri) {

        $res = $doc->resource($docUri);
        $docTypes = [];
        foreach ($res->all("dc11:type","resource") as $resType) {
            $type = $resType->getUri();
            if(0 === strpos($type, 'http://purl.org/dc/dcmitype/')) {
                $docTypes[] = $type;
            }
        }

        // if the doc type list is empty, check that we have a collection
        if(empty($docTypes) && !empty($doc->allOfType('edm:Collection'))) {
            $docTypes[] = "http://purl.org/dc/dcmitype/Collection";
        }
        return $docTypes;
    }

    /**
     * Tell whether an exception message describes a timeout.
     *
     * stripos() returns false (not a negative number) when the needle is
     * absent, so the result must be compared strictly against false.
     *
     * @param string $message exception message
     * @return bool true when the message contains "timed out" (case-insensitive)
     */
    private function isTimeoutMessage($message) {
        return stripos($message, 'timed out') !== false;
    }


    /**
     * Execute the console command.
     *
     * Lists the records of the 'LanguesDeFrance' set (metadata prefix 'olac')
     * from the configured OAI-PMH endpoint and, for each record past the
     * first --skip ones:
     *  - loads the document RDF graph (retrying on timeout),
     *  - with --raw, replaces the corresponding graph in the raw store,
     *  - maps the graph with the mapper selected by its first DCMI type,
     *  - merges each mapped graph with the one already present in the store
     *    and replaces it, unless both are isomorphic.
     * A summary of the document counts is printed at the end.
     *
     * The process is terminated with exit status 1 when the query for an
     * existing graph fails. An insert failure is rethrown unless it is a
     * tolerated timeout (see INSERT_TIMEOUT_RETRY).
     *
     * @return void
     * @throws \Exception when inserting a graph fails for a non tolerated reason
     */
    public function fire() {

        libxml_use_internal_errors(true);

        $skip = (int)$this->option('skip');
        $raw = $this->option('raw');

        $this->comment("Skipping $skip records");
        $this->comment("Recording raw queries: ".($raw?'TRUE':'FALSE'));

        $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url'), Config::get('corpusparole.rdf4j_update_url'));
        $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.rdf4j_query_url_raw'), Config::get('corpusparole.rdf4j_update_url_raw'));


        $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url'));
        $endpoint = new Endpoint($client);

        $recs = $endpoint->listRecords('olac', null, null, 'LanguesDeFrance');

        //TODO : treat timeout exceptions
        $progressBar = $this->output->createProgressBar($recs->getTotalRecordsInCollection());
        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');

        // configuration values do not change during the run: read them once
        $rdfBaseUri = Config::get('corpusparole.cocoon_rdf_base_uri');
        $docIdBaseUri = config('corpusparole.cocoon_doc_id_base_uri');
        $docIdBaseLength = strlen(Config::get('corpusparole.cocoon_doc_id_base'));
        $maxLoadRetry = config('corpusparole.max_load_retry', 3);

        $insertTimeouts = 0;

        $documentCounts = ['all' => 0, 'unknown' => 0, 'error' => 0];

        foreach ($recs as $item) {
            $item->registerXPathNamespace('oai', "http://www.openarchives.org/OAI/2.0/");
            $identifier = (string) $item->xpath('oai:header/oai:identifier')[0];

            // part of the identifier that follows the configured id prefix
            $docId = substr($identifier, $docIdBaseLength);

            $docRdfUrl = $rdfBaseUri.$docId;
            $message = "$identifier : $docRdfUrl";
            if($recs->getNumRetrieved() <= $skip) {
                $progressBar->setMessage("$message - Skipping");
                $progressBar->advance();
                continue;
            }
            $progressBar->setMessage($message);
            $progressBar->advance();

            $docUri = $docIdBaseUri.$docId;

            $docLoaded = false;
            $loadRetry = 0;
            $doc = null;
            while(!$docLoaded && $loadRetry < $maxLoadRetry) {
                $loadRetry++;
                try {
                    $doc = new \EasyRdf\Graph($docRdfUrl);
                    $doc->load();
                    $docLoaded = true;
                }
                //TODO: catch network exception - add error to database
                catch(\Exception $e) {
                    $code = $e->getCode();
                    $errorMessage = $e->getMessage();
                    $this->info("\nError processing $identifier. code : $code, message: $errorMessage");
                    Log::debug("Error processing $identifier. code : $code, message: $errorMessage");
                    // only a 400 code or a genuine timeout is worth a retry
                    if($code == 400 || ($code == 0 && $this->isTimeoutMessage($errorMessage))) {
                        $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying");
                        continue;
                    }
                    $this->error("\nError processing $identifier ($docRdfUrl) : $e");
                    Log::error("Error processing $identifier ($docRdfUrl) : $e");
                    break;
                }
            }
            if(!$docLoaded) {
                $documentCounts['error'] += 1;
                continue;
            }

            //insert raw
            if($raw) {
                $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}");
                if($resDocsRaw->getBoolean()) {
                    $gs_raw->clear($docUri);
                }
                $gs_raw->insert($doc, $docUri);
            }

            //map doc
            $inputDocTypes = $this->getDocTypes($doc, $docUri);

            // only the first DCMI type found drives the choice of the mapper
            $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null;

            if(is_null($docType) || !array_key_exists($docType, self::MAPPER_CLASS_MAP)) {
                $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper");
                Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper");
                $documentCounts['unknown'] += 1;
                continue;
            }

            $mapperClass = self::MAPPER_CLASS_MAP[$docType];
            $mapper = new $mapperClass($doc, $docUri);

            try {
                $mapper->mapGraph();
            } catch (\Exception $e) {
                // NOTE(review): processing goes on with whatever output graphs
                // the mapper exposes after the failure - confirm this
                // best-effort behaviour is intended.
                Log::error("Error processing $identifier ($docRdfUrl) : error mapping graph : $e");
                $documentCounts['error'] += 1;
            }
            $documentCounts['all'] += 1;
            $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1;

            $mappedGraphes = $mapper->getOutputGraphes();

            foreach ($mappedGraphes as $mappedGraph) {

                $mappedGraphUri = $mappedGraph->getUri();
                try {
                    $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}");
                } catch (\Exception $e) {
                    // not every exception carries a response body
                    $body = method_exists($e, 'getBody') ? $e->getBody() : '';
                    $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $body . "\n");
                    Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $body);
                    // abort the whole import and report the failure to the caller
                    exit(1);
                }

                if($resDocs->isEmpty()) {
                    // nothing in the store yet: insert the mapped graph as is
                    $mergedGraph = $mappedGraph;
                    $doDelete = false;
                }
                else {
                    $doDelete = true;
                    $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri);

                    // for collections and texts the stored graph is the merge
                    // base; for everything else the freshly mapped graph is
                    if($docType === "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes, true)) {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger();
                        $baseGraph = $resDocs;
                        $sourceGraph = $mappedGraph;
                    }
                    elseif ($docType === "http://purl.org/dc/dcmitype/Text") {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonTextRdfMerger();
                        $baseGraph = $resDocs;
                        $sourceGraph = $mappedGraph;
                    }
                    else {
                        $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger();
                        $baseGraph = $mappedGraph;
                        $sourceGraph = $resDocs;
                    }
                    $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri);
                    if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) {
                        //graph are isomorphic no need to go farther for this graph
                        Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping");
                        continue;
                    }
                }

                try {
                    if($doDelete) {
                        $gs->clear($mappedGraphUri);
                    }
                    $gs->insert($mergedGraph, $mappedGraphUri);
                }
                catch(\Exception $e) {
                    // just log not much we can do here...
                    $this->error("\nError on insert $identifier ($docRdfUrl) : $e");
                    Log::error("Error on insert $identifier ($docRdfUrl) : $e");
                    // the class name must be fully qualified: a relative name
                    // would be resolved inside the current namespace
                    if($e instanceof \EasyRdf\Exception
                        && $this->isTimeoutMessage($e->getMessage())
                        && $insertTimeouts <= self::INSERT_TIMEOUT_RETRY) {
                        $this->info("\nThis is a timeout, we continue.");
                        Log::info("This is a timeout, we continue.");
                        $insertTimeouts++;
                        continue;
                    }
                    throw $e;
                }
            }
        }
        $progressBar->setMessage("finished");
        $progressBar->finish();

        $this->info("\nDocument count info: ");
        foreach ($documentCounts as $countKey => $docCount) {
            if($countKey === 'error' && $docCount > 0) {
                $this->error("$countKey => $docCount");
            } else {
                $this->info("$countKey => $docCount");
            }
        }
    }
}