diff -r 0fce13da58af -r aefaad270b9b server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Tue Oct 04 13:53:56 2016 +0200 +++ b/server/src/app/Console/Commands/IndexDocuments.php Wed Oct 05 02:31:25 2016 +0200 @@ -3,10 +3,14 @@ namespace CorpusParole\Console\Commands; use Illuminate\Console\Command; +use EasyRdf\Resource; + use GuzzleHttp\Client; +use CorpusParole\Libraries\Utils; use CorpusParole\Repositories\DocumentRepository; use CorpusParole\Libraries\CocoonUtils; use CorpusParole\Models\GeonamesHierarchy; +use CorpusParole\Services\BnfResolverInterface; use Es; class IndexDocuments extends Command @@ -34,9 +38,10 @@ * * @return void */ - public function __construct(DocumentRepository $documentRepository, Client $httpClient) + public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver) { $this->documentRepository = $documentRepository; + $this->bnfResolver = $bnfResolver; $this->httpClient = $httpClient; parent::__construct(); } @@ -82,7 +87,15 @@ ], 'date' => [ 'type' => 'date' ], 'geonames_hyerarchy' => [ 'type' => 'string' ], - 'location' => [ 'type' => 'geo_point' ] + 'location' => [ 'type' => 'geo_point' ], + 'subject' => [ + 'type' => 'nested', + 'properties' => [ + 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'], + 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'], + 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] + ] + ] // TODO: add location information ] ] @@ -154,6 +167,29 @@ } /** + * get subjects as { 'label': label, 'code': code } objects + * Takes only into account the bnf subjects + */ + private function getSubjects($doc) { + + $sres = array_reduce($doc->getSubjects(), function($res, $s) { + $m = []; + if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $m) === 1) { + array_push($res, [ + 'uri' => $m[0], + 'code' => $m[1] + ]); + } + return $res; + }, []); + + $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri'];}, $sres))); + + return array_map(function($so) use ($labels) { return [ 'label' => $labels[$so['uri']], 'code' => $so['code'], 'label_code' => $labels[$so['uri']]."|".$so['code'] ]; }, $sres); + + } + + /** * Index one document into Elasticsearch * * @return int (1 if sucess, 0 if error) @@ -168,7 +204,8 @@ 'body' => [ 'title' => (string)$doc->getTitle(), 'date' => (string)$doc->getModified(), - 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc) + 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), + 'subject' => $this->getSubjects($doc) ] ]; Es::index($query_data); @@ -182,7 +219,8 @@ private function indexBulk($docs) { $query_data = ['body' => []]; - foreach($docs as $doc){ + foreach($docs as $resultDoc){ + $doc = $this->documentRepository->get($resultDoc->getId()); $query_data['body'][] = [ 'index' => [ '_index' => config('elasticsearch.index'), @@ -192,7 +230,9 @@ ]; $query_data['body'][] = [ 'title' => (string)$doc->getTitle(), - 'date' => (string)$doc->getModified() + 'date' => (string)$doc->getModified(), + 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), + 'subject' => $this->getSubjects($doc) ]; } Es::bulk($query_data); @@ -263,18 +303,18 @@ break; } $this->indexOne($doc); + $progressBar->setMessage($doc->getId()); $progressBar->advance(); - $progressBar->setMessage($doc->getId()); } } else { $this->indexBulk($docs); + $progressBar->setMessage('Page '.$page); $progressBar->advance(); - $progressBar->setMessage('Page '.$page); } } $progressBar->finish(); - $this->info('Indexing completed'); + $this->info("\nIndexing completed"); } }