diff -r 31a4987f6017 -r 226d5b17a119 server/src/app/Console/Commands/IndexDocuments.php --- a/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 09 11:44:18 2016 +0200 +++ b/server/src/app/Console/Commands/IndexDocuments.php Tue Oct 11 02:49:59 2016 +0200 @@ -103,12 +103,12 @@ ] ] ], - 'date' => [ 'type' => 'date' ], - 'geonames_hyerarchy' => [ 'type' => 'string' ], - 'location' => [ 'type' => 'geo_point' ], - 'creation_date' => ['type' => 'date'], - 'language' => ['type' => 'string'], - 'discourse_types' => ['type' => 'string'], + 'date' => [ 'type' => 'date', 'index' => 'not_analyzed'], + 'geonames_hyerarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'], + 'location' => [ 'type' => 'geo_point'], + 'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'], + 'language' => ['type' => 'string', 'index' => 'not_analyzed'], + 'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'], 'subject' => [ 'type' => 'nested', 'properties' => [ @@ -117,7 +117,6 @@ 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] ] ] - // TODO: add location information ] ] ] @@ -473,7 +472,7 @@ 'date' => (string)$doc->getModified(), 'location' => $this->getLocation($doc), 'creation_date' => $this->getCreationDate($doc), - 'language' => $doc->getLanguageValue(), + 'language' => $doc->getLanguagesValue(), 'discourse_types' => $this->getDiscourseTypes($doc), 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), 'subject' => $this->getSubjects($doc), @@ -485,14 +484,13 @@ * * @return int (1 if sucess, 0 if error) */ - private function indexOne($resultDoc) + private function indexOne($docId, $docBody) { - $doc = $this->documentRepository->get($resultDoc->getId()); $query_data = [ 'index' => config('elasticsearch.index'), 'type' => 'document', - 'id' => (string)$doc->getId(), - 'body' => $this->getDocBody($doc) + 'id' => $docId, + 'body' => $docBody ]; Es::index($query_data); } @@ -502,19 +500,18 @@ * * @return int (1 if sucess, 0 if error) */ - private function indexBulk($docs) + private function indexBulk($docBodies) { $query_data = ['body' => []]; - foreach($docs as $resultDoc){ - $doc = $this->documentRepository->get($resultDoc->getId()); + foreach($docBodies as $docId => $docBody){ $query_data['body'][] = [ 'index' => [ '_index' => config('elasticsearch.index'), '_type' => 'document', - '_id' => (string)$doc->getId() + '_id' => $docId ] ]; - $query_data['body'][] = $this->getDocBody($doc); + $query_data['body'][] = $docBody; } Es::bulk($query_data); } @@ -559,49 +556,43 @@ $this->info('Indexing documents...'); - if ($limit<=0) { - $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage(); - $total = $this->documentRepository->getCount(); - $lastPageEntryCount = $stepSize+1; - } - else { - $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage()); - $total = $limit; - $lastPageEntryCount = $limit % $stepSize; + $limit = (int)$limit; + $total = $this->documentRepository->getCount(); + + if($limit>0) { + $total = min($limit, $total); } - if ($noBulk) - { - $progressBar = $this->output->createProgressBar($total); - } - else - { - $progressBar = $this->output->createProgressBar($lastPage); - } + $progressBar = $this->output->createProgressBar($total); $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); - for ($page=1;$page<=$lastPage;$page++) - { - $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page); - if ($noBulk) - { - foreach ($docs as $i=>$doc){ - if ($page==$lastPage && $i>=$lastPageEntryCount){ - break; - } - $progressBar->setMessage($doc->getId()); - $progressBar->advance(); - $this->indexOne($doc); + $page = 0; + $lastPage = PHP_INT_MAX; + $docIds = []; + + while($page++<$lastPage) { + $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph"); + $lastPage = $docsPaginator->lastPage(); + $docsBodies = []; + foreach($docsPaginator as $docResult) { + $docId = (string)$docResult->getId(); + $progressBar->setMessage($docId); + $progressBar->advance(); + $doc = $this->documentRepository->get($docId); + $docBody = $this->getDocBody($doc); + if($noBulk) { + $this->indexOne($docId, $docBody); + } else { + $docsBodies[$docId] = $docBody; } + $docIds[] = $docId; } - else - { - $progressBar->setMessage('Page '.$page); - $progressBar->advance(); - $this->indexBulk($docs); + if(!$noBulk) { + $this->indexBulk($docsBodies); } } $progressBar->finish(); - $this->info("\nIndexing completed"); + $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).")."); + } }