server/src/app/Console/Commands/IndexDocuments.php
changeset 326 226d5b17a119
parent 325 31a4987f6017
child 369 796725d33b67
--- a/server/src/app/Console/Commands/IndexDocuments.php	Sun Oct 09 11:44:18 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Tue Oct 11 02:49:59 2016 +0200
@@ -103,12 +103,12 @@
                                 ]
                             ]
                         ],
-                        'date' => [ 'type' => 'date' ],
-                        'geonames_hyerarchy' => [ 'type' => 'string' ],
-                        'location' => [ 'type' => 'geo_point' ],
-                        'creation_date' => ['type' => 'date'],
-                        'language' => ['type' => 'string'],
-                        'discourse_types' => ['type' => 'string'],
+                        'date' => [ 'type' => 'date', 'index' => 'not_analyzed'],
+                        'geonames_hierarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'], // key must match the 'geonames_hierarchy' field of the indexed document body
+                        'location' => [ 'type' => 'geo_point'],
+                        'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'],
+                        'language' => ['type' => 'string', 'index' => 'not_analyzed'],
+                        'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'],
                         'subject' => [
                             'type' => 'nested',
                             'properties' => [
@@ -117,7 +117,6 @@
                                 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
                             ]
                         ]
-                        // TODO: add location information
                     ]
                 ]
             ]
@@ -473,7 +472,7 @@
             'date' => (string)$doc->getModified(),
             'location' => $this->getLocation($doc),
             'creation_date' => $this->getCreationDate($doc),
-            'language' => $doc->getLanguageValue(),
+            'language' => $doc->getLanguagesValue(),
             'discourse_types' => $this->getDiscourseTypes($doc),
             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
             'subject' => $this->getSubjects($doc),
@@ -485,14 +484,13 @@
      *
-     * @return int (1 if sucess, 0 if error)
+     * @return void (the response of Es::index is not returned to the caller)
      */
-    private function indexOne($resultDoc)
+    private function indexOne($docId, $docBody)
     {
-        $doc = $this->documentRepository->get($resultDoc->getId());
         $query_data = [
             'index' => config('elasticsearch.index'),
             'type' => 'document',
-            'id' => (string)$doc->getId(),
-            'body' => $this->getDocBody($doc)
+            'id' => $docId,
+            'body' => $docBody
         ];
         Es::index($query_data);
     }
@@ -502,19 +500,18 @@
      *
-     * @return int (1 if sucess, 0 if error)
+     * @return void (the response of Es::bulk is not returned to the caller)
      */
-     private function indexBulk($docs)
+     private function indexBulk($docBodies)
      {
           $query_data = ['body' => []];
-          foreach($docs as $resultDoc){
-              $doc = $this->documentRepository->get($resultDoc->getId());
+          foreach($docBodies as $docId => $docBody){
               $query_data['body'][] = [
                   'index' => [
                       '_index' => config('elasticsearch.index'),
                       '_type' => 'document',
-                      '_id' => (string)$doc->getId()
+                      '_id' => (string)$docId // PHP casts integer-like string keys to int; keep the id a string
                   ]
               ];
-              $query_data['body'][] = $this->getDocBody($doc);
+              $query_data['body'][] = $docBody;
           }
           Es::bulk($query_data);
      }
@@ -559,49 +556,50 @@
 
         $this->info('Indexing documents...');
 
-        if ($limit<=0) {
-            $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
-            $total = $this->documentRepository->getCount();
-            $lastPageEntryCount = $stepSize+1;
-        }
-        else {
-            $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage());
-            $total = $limit;
-            $lastPageEntryCount = $limit % $stepSize;
+        $limit = (int)$limit;
+        $total = $this->documentRepository->getCount();
+
+        if($limit>0) {
+            $total = min($limit, $total);
         }
 
-        if ($noBulk)
-        {
-            $progressBar = $this->output->createProgressBar($total);
-        }
-        else
-        {
-            $progressBar = $this->output->createProgressBar($lastPage);
-        }
+        $progressBar = $this->output->createProgressBar($total);
         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
 
-        for ($page=1;$page<=$lastPage;$page++)
-        {
-            $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);
-            if ($noBulk)
-            {
-                foreach ($docs as $i=>$doc){
-                    if ($page==$lastPage && $i>=$lastPageEntryCount){
-                        break;
-                    }
-                    $progressBar->setMessage($doc->getId());
-                    $progressBar->advance();
-                    $this->indexOne($doc);
+        $page = 0;
+        $lastPage = PHP_INT_MAX;
+        $docIds = [];
+
+        // A positive $limit caps the number of documents indexed; without this
+        // check the limit would only size the progress bar.
+        while($page++<$lastPage && ($limit<=0 || count($docIds)<$limit)) {
+            $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph");
+            $lastPage = $docsPaginator->lastPage();
+            $docsBodies = [];
+            foreach($docsPaginator as $docResult) {
+                if($limit>0 && count($docIds)>=$limit) {
+                    // limit reached in the middle of a page: ignore the remaining results
+                    break;
+                }
+                $docId = (string)$docResult->getId();
+                $progressBar->setMessage($docId);
+                $progressBar->advance();
+                $doc = $this->documentRepository->get($docId);
+                $docBody = $this->getDocBody($doc);
+                if($noBulk) {
+                    $this->indexOne($docId, $docBody);
+                } else {
+                    $docsBodies[$docId] = $docBody;
                 }
+                $docIds[] = $docId;
             }
-            else
-            {
-                $progressBar->setMessage('Page '.$page);
-                $progressBar->advance();
-                $this->indexBulk($docs);
+            // skip the bulk call when this page produced no document to index
+            if(!$noBulk && !empty($docsBodies)) {
+                $this->indexBulk($docsBodies);
             }
         }
         $progressBar->finish();
-        $this->info("\nIndexing completed");
+        $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).").");
+
     }
 }