--- a/server/src/app/Console/Commands/IndexDocuments.php Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Wed Oct 05 02:31:25 2016 +0200
@@ -3,10 +3,14 @@
namespace CorpusParole\Console\Commands;
use Illuminate\Console\Command;
+use EasyRdf\Resource;
+
use GuzzleHttp\Client;
+use CorpusParole\Libraries\Utils;
use CorpusParole\Repositories\DocumentRepository;
use CorpusParole\Libraries\CocoonUtils;
use CorpusParole\Models\GeonamesHierarchy;
+use CorpusParole\Services\BnfResolverInterface;
use Es;
class IndexDocuments extends Command
@@ -34,9 +38,10 @@
*
* @return void
*/
- public function __construct(DocumentRepository $documentRepository, Client $httpClient)
+ public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver)
{
$this->documentRepository = $documentRepository;
+ $this->bnfResolver = $bnfResolver; // used by getSubjects() to fetch the labels of the BnF subject URIs
$this->httpClient = $httpClient;
parent::__construct();
}
@@ -82,7 +87,15 @@
],
'date' => [ 'type' => 'date' ],
'geonames_hyerarchy' => [ 'type' => 'string' ],
- 'location' => [ 'type' => 'geo_point' ]
+ 'location' => [ 'type' => 'geo_point' ],
+ 'subject' => [
+ 'type' => 'nested',
+ 'properties' => [
+ 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+ 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+ 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
+ ]
+ ]
// TODO: add location information
]
]
@@ -154,6 +167,29 @@
}
/**
+     * Build the 'subject' entries of a document: one [ 'label', 'code', 'label_code' ] array per BnF subject.
+     * Only EasyRdf resources whose URI matches config 'corpusparole.bnf_ark_url_regexp' are taken into account.
+     */
+    private function getSubjects($doc) {
+        // collect [ 'uri' => whole regexp match, 'code' => first capture group ] for each matching subject
+        $sres = array_reduce($doc->getSubjects(), function($res, $s) {
+            $m = [];
+            if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) {
+                $res[] = [ 'uri' => $m[0], 'code' => $m[1] ];
+            }
+            return $res;
+        }, []);
+        // resolve the labels of all distinct uris with a single resolver call
+        $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri'];}, $sres)));
+
+        return array_map(function($so) use ($labels) {
+            // a uri absent from the resolver answer gives a null label instead of an undefined index error
+            $label = isset($labels[$so['uri']]) ? $labels[$so['uri']] : null;
+            return [ 'label' => $label, 'code' => $so['code'], 'label_code' => $label."|".$so['code'] ];
+        }, $sres);
+    }
+
+ /**
* Index one document into Elasticsearch
*
* @return int (1 if sucess, 0 if error)
@@ -168,7 +204,8 @@
'body' => [
'title' => (string)$doc->getTitle(),
'date' => (string)$doc->getModified(),
- 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
+ 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+ 'subject' => $this->getSubjects($doc)
]
];
Es::index($query_data);
@@ -182,7 +219,8 @@
private function indexBulk($docs)
{
$query_data = ['body' => []];
- foreach($docs as $doc){
+ foreach($docs as $resultDoc){
+ $doc = $this->documentRepository->get($resultDoc->getId());
$query_data['body'][] = [
'index' => [
'_index' => config('elasticsearch.index'),
@@ -192,7 +230,9 @@
];
$query_data['body'][] = [
'title' => (string)$doc->getTitle(),
- 'date' => (string)$doc->getModified()
+ 'date' => (string)$doc->getModified(),
+ 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+ 'subject' => $this->getSubjects($doc)
];
}
Es::bulk($query_data);
@@ -263,18 +303,18 @@
break;
}
$this->indexOne($doc);
+ $progressBar->setMessage($doc->getId());
$progressBar->advance();
- $progressBar->setMessage($doc->getId());
}
}
else
{
$this->indexBulk($docs);
+ $progressBar->setMessage('Page '.$page);
$progressBar->advance();
- $progressBar->setMessage('Page '.$page);
}
}
$progressBar->finish();
- $this->info('Indexing completed');
+ $this->info("\nIndexing completed");
}
}