24 /** |
31 /** |
25 * The name and signature of the console command. |
32 * The name and signature of the console command. |
26 * |
33 * |
27 * @var string |
34 * @var string |
28 */ |
35 */ |
29 protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip}'; |
36 protected $signature = 'corpus-parole:importRDF {--skip=0 : Number of record to skip} {--raw : Register raw}'; |
30 |
37 |
/**
 * Build the command instance.
 *
 * No command-specific state is set up here; construction is delegated
 * entirely to the parent class constructor.
 */
public function __construct()
{
    parent::__construct();
}
37 |
44 |
38 /** |
/**
 * Collect the DCMI type URIs attached to a document inside a graph.
 *
 * Every "dc11:type" resource value of the $docUri resource is inspected and
 * kept when its URI starts with "http://purl.org/dc/dcmitype/". When none is
 * kept, the graph is checked for any resource of type "edm:Collection"; if
 * one exists, the DCMI Collection type URI is returned as the only entry.
 *
 * @param mixed  $doc    graph object exposing resource() and allOfType()
 *                       (presumably an EasyRdf graph - confirm against callers)
 * @param string $docUri URI of the resource whose types are read
 *
 * @return array list of DCMI type URIs, possibly empty
 */
private function getDocTypes($doc, $docUri)
{
    $matchedTypes = [];

    $typeNodes = $doc->resource($docUri)->all("dc11:type", "resource");
    foreach ($typeNodes as $typeNode) {
        $typeUri = $typeNode->getUri();
        // keep only the types living in the DCMI type namespace
        if (strpos($typeUri, 'http://purl.org/dc/dcmitype/') !== 0) {
            continue;
        }
        $matchedTypes[] = $typeUri;
    }

    if (!empty($matchedTypes)) {
        return $matchedTypes;
    }

    // no DCMI type was declared: fall back to detecting a collection
    $collections = $doc->allOfType('edm:Collection');
    if (!empty($collections)) {
        $matchedTypes[] = "http://purl.org/dc/dcmitype/Collection";
    }

    return $matchedTypes;
}
|
68 |
|
69 |
|
70 /** |
39 * Execute the console command. |
71 * Execute the console command. |
40 * |
72 * |
41 * @return mixed |
73 * @return mixed |
42 */ |
74 */ |
43 public function fire() { |
75 public function fire() { |
44 |
76 |
45 libxml_use_internal_errors(true); |
77 libxml_use_internal_errors(true); |
46 |
78 |
47 $skip = (int)$this->option('skip'); |
79 $skip = (int)$this->option('skip'); |
|
80 $raw = $this->option('raw'); |
48 |
81 |
49 $this->comment("Skipping $skip records"); |
82 $this->comment("Skipping $skip records"); |
|
83 $this->comment("Recording raw queries: $raw"); |
50 |
84 |
51 $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url')); |
85 $gs = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url'), Config::get('corpusparole.sesame_update_url')); |
|
86 $gs_raw = new \EasyRdf\Sparql\Client(Config::get('corpusparole.sesame_query_url_raw'), Config::get('corpusparole.sesame_update_url_raw')); |
52 |
87 |
53 |
88 |
54 $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url')); |
89 $client = new Client(Config::get('corpusparole.cocoon_oaipmh_url')); |
55 $endpoint = new Endpoint($client); |
90 $endpoint = new Endpoint($client); |
56 |
91 |
75 $progressBar->setMessage($message); |
112 $progressBar->setMessage($message); |
76 $progressBar->advance(); |
113 $progressBar->advance(); |
77 |
114 |
78 $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
115 $docUri = config('corpusparole.cocoon_doc_id_base_uri').substr($identifier, strlen(Config::get('corpusparole.cocoon_doc_id_base'))); |
79 |
116 |
80 $resDocs = $gs->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); |
117 $docLoaded = false; |
81 if(!$resDocs->getBoolean()) { |
118 $loadRetry = 0; |
82 $docLoaded = false; |
119 $doc = null; |
83 $loadRetry = 0; |
120 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { |
84 while(!$docLoaded && $loadRetry < config('corpusparole.max_load_retry', 3)) { |
121 $loadRetry++; |
85 $loadRetry++; |
|
86 try { |
|
87 $doc = new \EasyRdf\Graph($docRdfUrl); |
|
88 $doc->load(); |
|
89 $docLoaded = true; |
|
90 } |
|
91 //TODO: catch network exception - add error to database |
|
92 catch(\Exception $e) { |
|
93 $code = $e->getCode(); |
|
94 $message = $e->getMessage(); |
|
95 $this->debug("\nError processing $identifier. code : $code, message: $message"); |
|
96 Log::debug("Error processing $identifier. code : $code, message: $message"); |
|
97 if($code == 1 && stripos($message, 'timed out')>=0 ) { |
|
98 $this->warning("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
99 Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
100 continue; |
|
101 } |
|
102 else { |
|
103 $this->error("\nError processing $identifier ($docRdfUrl) : $e"); |
|
104 Log::error("Error processing $identifier ($docRdfUrl) : $e"); |
|
105 break; |
|
106 } |
|
107 //$this->error(print_r($e->getTraceAsString(),true)); |
|
108 } |
|
109 } |
|
110 if(!$docLoaded) { |
|
111 continue; |
|
112 } |
|
113 //TODO: treat errors |
|
114 $subjects = $doc->resources(); |
|
115 $subject = reset($subjects)->getUri(); |
|
116 try { |
122 try { |
117 $gs->insert($doc, $subject); |
123 $doc = new \EasyRdf\Graph($docRdfUrl); |
|
124 $doc->load(); |
|
125 $docLoaded = true; |
|
126 } |
|
127 //TODO: catch network exception - add error to database |
|
128 catch(\Exception $e) { |
|
129 $code = $e->getCode(); |
|
130 $message = $e->getMessage(); |
|
131 $this->info("\nError processing $identifier. code : $code, message: $message"); |
|
132 Log::debug("Error processing $identifier. code : $code, message: $message"); |
|
133 if($code == 0 && stripos($message, 'timed out')>=0 ) { |
|
134 $this->info("\nTimeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
135 Log::warning("Timeout error processing $identifier ($docRdfUrl) : $e, retrying"); |
|
136 continue; |
|
137 } |
|
138 else { |
|
139 $this->error("\nError processing $identifier ($docRdfUrl) : $e"); |
|
140 Log::error("Error processing $identifier ($docRdfUrl) : $e"); |
|
141 break; |
|
142 } |
|
143 //$this->error(print_r($e->getTraceAsString(),true)); |
|
144 } |
|
145 } |
|
146 if(!$docLoaded) { |
|
147 continue; |
|
148 } |
|
149 |
|
150 //insert raw |
|
151 if($raw) { |
|
152 $resDocsRaw = $gs_raw->query("ASK WHERE { GRAPH <$docUri> { ?s ?p ?o }}"); |
|
153 if($resDocsRaw->getBoolean()) { |
|
154 $gs_raw->clear($docUri); |
|
155 } |
|
156 $gs_raw->insert($doc, $docUri); |
|
157 } |
|
158 |
|
159 //map doc |
|
160 $inputDocTypes = $this->getDocTypes($doc, $docUri); |
|
161 |
|
162 $docType = count($inputDocTypes)>0? $inputDocTypes[0]:null; |
|
163 |
|
164 if(is_null($docType) || !array_key_exists($docType,ImportCocoonRDF::MAPPER_CLASS_MAP)) { |
|
165 $this->error("\nError processing $identifier ($docRdfUrl) : $docType unknown mapper"); |
|
166 Log::error("Error processing $identifier ($docRdfUrl) : $docType unknown mapper"); |
|
167 $documentCounts['unknown'] += 1; |
|
168 continue; |
|
169 } |
|
170 $documentCounts['all'] += 1; |
|
171 $documentCounts[$docType] = isset($documentCounts[$docType])?$documentCounts[$docType]+1:1; |
|
172 |
|
173 $mapperClass = ImportCocoonRDF::MAPPER_CLASS_MAP[$docType]; |
|
174 $mapper = new $mapperClass($doc, $docUri); |
|
175 |
|
176 $mapper->mapGraph(); |
|
177 $mappedGraphes = $mapper->getOutputGraphes(); |
|
178 |
|
179 foreach ($mapper->getOutputGraphes() as $mappedGraphKey => $mappedGraph) { |
|
180 |
|
181 $mappedGraphUri = $mappedGraph->getUri(); |
|
182 try { |
|
183 $resDocs = $gs->query("CONSTRUCT { ?s ?p ?o } WHERE { GRAPH <$mappedGraphUri> { ?s ?p ?o }}"); |
|
184 } catch (\Exception $e) { |
|
185 $this->error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody() . "\n"); |
|
186 Log::error("\nError on graph query $identifier ($mappedGraphUri) : $e \n" . $e->getBody()); |
|
187 exit; |
|
188 } |
|
189 |
|
190 $mergedGraph = null; |
|
191 $doDelete = true; |
|
192 |
|
193 if($resDocs->isEmpty()) { |
|
194 $mergedGraph = $mappedGraph; |
|
195 $doDelete = false; |
|
196 } |
|
197 else { |
|
198 $doDelete = true; |
|
199 $mappedTypes = $this->getDocTypes($mappedGraph, $mappedGraphUri); |
|
200 $presentTypes = $this->getDocTypes($resDocs, $mappedGraphUri); |
|
201 |
|
202 if($docType == "http://purl.org/dc/dcmitype/Collection" || in_array("http://purl.org/dc/dcmitype/Collection", $mappedTypes)) { |
|
203 $merger = new \CorpusParole\Libraries\Mergers\CocoonCollectionRdfMerger(); |
|
204 $baseGraph = $resDocs; |
|
205 $sourceGraph = $mappedGraph; |
|
206 } |
|
207 elseif ($docType == "http://purl.org/dc/dcmitype/Text") { |
|
208 $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger(); |
|
209 $baseGraph = $resDocs; |
|
210 $sourceGraph = $mappedGraph; |
|
211 } |
|
212 else { |
|
213 $merger = new \CorpusParole\Libraries\Mergers\CocoonSoundRdfMerger(); |
|
214 $baseGraph = $mappedGraph; |
|
215 $sourceGraph = $resDocs; |
|
216 } |
|
217 $mergedGraph = $merger->mergeGraph($baseGraph, $sourceGraph, $mappedGraphUri); |
|
218 if(\EasyRdf\Isomorphic::isomorphic($resDocs, $mergedGraph)) { |
|
219 // graphs are isomorphic: no need to go any further for this graph |
|
220 Log::info("Graph are isomorphic for $mappedGraphUri (from $identifier : $docRdfUrl), skipping"); |
|
221 continue; |
|
222 } |
|
223 } |
|
224 |
|
225 try { |
|
226 if($doDelete) { |
|
227 $gs->clear($mappedGraphUri); |
|
228 } |
|
229 $gs->insert($mergedGraph, $mappedGraphUri); |
118 } |
230 } |
119 catch(\Exception $e) { |
231 catch(\Exception $e) { |
120 // just log the error; there is not much else we can do here... |
232 // just log the error; there is not much else we can do here... |
121 $this->error("\nError on insert $identifier ($docRdfUrl) : $e"); |
233 $this->error("\nError on insert $identifier ($docRdfUrl) : $e"); |
122 Log::error("Error on insert $identifier ($docRdfUrl) : $e"); |
234 Log::error("Error on insert $identifier ($docRdfUrl) : $e"); |