server/src/app/Console/Commands/IndexDocuments.php
changeset 321 aefaad270b9b
parent 320 0fce13da58af
child 322 084aae09edf4
--- a/server/src/app/Console/Commands/IndexDocuments.php	Tue Oct 04 13:53:56 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Wed Oct 05 02:31:25 2016 +0200
@@ -3,10 +3,14 @@
 namespace CorpusParole\Console\Commands;
 
 use Illuminate\Console\Command;
+use EasyRdf\Resource;
+
 use GuzzleHttp\Client;
+use CorpusParole\Libraries\Utils;
 use CorpusParole\Repositories\DocumentRepository;
 use CorpusParole\Libraries\CocoonUtils;
 use CorpusParole\Models\GeonamesHierarchy;
+use CorpusParole\Services\BnfResolverInterface;
 use Es;
 
 class IndexDocuments extends Command
@@ -34,9 +38,10 @@
      *
      * @return void
      */
-    public function __construct(DocumentRepository $documentRepository, Client $httpClient)
+    public function __construct(DocumentRepository $documentRepository, Client $httpClient, BnfResolverInterface $bnfResolver)
     {
         $this->documentRepository = $documentRepository;
+        $this->bnfResolver = $bnfResolver; // used by getSubjects() to look up subject labels
         $this->httpClient = $httpClient;
         parent::__construct();
     }
@@ -82,7 +87,15 @@
                         ],
                         'date' => [ 'type' => 'date' ],
                         'geonames_hyerarchy' => [ 'type' => 'string' ],
-                        'location' => [ 'type' => 'geo_point' ]
+                        'location' => [ 'type' => 'geo_point' ],
+                        'subject' => [
+                            'type' => 'nested',
+                            'properties' => [
+                                'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+                                'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+                                'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
+                            ]
+                        ]
                         // TODO: add location information
                     ]
                 ]
@@ -154,6 +167,29 @@
     }
 
     /**
+     * Get subjects as { 'label': label, 'code': code, 'label_code': 'label|code' } objects.
+     * Only the BnF subjects are taken into account.
+     */
+    private function getSubjects($doc) {
+        // Builds one [label, code, label_code] entry per subject of $doc whose URI matches the BnF ark pattern.
+        $pattern = config('corpusparole.bnf_ark_url_regexp');
+        $sres = [];
+        foreach($doc->getSubjects() as $s) {
+            // $m[0] is the whole match (used as lookup URI), $m[1] the first capture group (used as code).
+            if($s instanceof Resource && preg_match($pattern, $s->getUri(), $m) === 1) {
+                $sres[] = [ 'uri' => $m[0], 'code' => $m[1] ];
+            }
+        }
+        if(empty($sres)) { return []; } // no BnF subject: skip the resolver call
+        $labels = $this->bnfResolver->getLabels(array_unique(array_map(function($so) { return $so['uri']; }, $sres)));
+        return array_map(function($so) use ($labels) {
+            // A URI absent from the resolver result gets a null label instead of raising an undefined-index error.
+            $label = isset($labels[$so['uri']]) ? $labels[$so['uri']] : null;
+            return [ 'label' => $label, 'code' => $so['code'], 'label_code' => $label."|".$so['code'] ];
+        }, $sres);
+    }
+
+    /**
      * Index one document into Elasticsearch
      *
      * @return int (1 if sucess, 0 if error)
@@ -168,7 +204,8 @@
             'body' => [
                 'title' => (string)$doc->getTitle(),
                 'date' => (string)$doc->getModified(),
-                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
+                'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+                'subject' => $this->getSubjects($doc)
             ]
         ];
         Es::index($query_data);
@@ -182,7 +219,8 @@
      private function indexBulk($docs)
      {
           $query_data = ['body' => []];
-          foreach($docs as $doc){
+          foreach($docs as $resultDoc){
+              $doc = $this->documentRepository->get($resultDoc->getId());
               $query_data['body'][] = [
                   'index' => [
                       '_index' => config('elasticsearch.index'),
@@ -192,7 +230,9 @@
               ];
               $query_data['body'][] = [
                   'title' => (string)$doc->getTitle(),
-                  'date' => (string)$doc->getModified()
+                  'date' => (string)$doc->getModified(),
+                  'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
+                  'subject' => $this->getSubjects($doc)
               ];
           }
           Es::bulk($query_data);
@@ -263,18 +303,18 @@
                         break;
                     }
                     $this->indexOne($doc);
+                    $progressBar->setMessage($doc->getId());
                     $progressBar->advance();
-                    $progressBar->setMessage($doc->getId());
                 }
             }
             else
             {
                 $this->indexBulk($docs);
+                $progressBar->setMessage('Page '.$page);
                 $progressBar->advance();
-                $progressBar->setMessage('Page '.$page);
             }
         }
         $progressBar->finish();
-        $this->info('Indexing completed');
+        $this->info("\nIndexing completed");
     }
 }