--- a/server/src/app/Console/Commands/IndexDocuments.php Sun Oct 09 11:44:18 2016 +0200
+++ b/server/src/app/Console/Commands/IndexDocuments.php Tue Oct 11 02:49:59 2016 +0200
@@ -103,12 +103,12 @@
]
]
],
- 'date' => [ 'type' => 'date' ],
- 'geonames_hyerarchy' => [ 'type' => 'string' ],
- 'location' => [ 'type' => 'geo_point' ],
- 'creation_date' => ['type' => 'date'],
- 'language' => ['type' => 'string'],
- 'discourse_types' => ['type' => 'string'],
+ 'date' => [ 'type' => 'date', 'index' => 'not_analyzed'],
+ 'geonames_hierarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'],
+ 'location' => [ 'type' => 'geo_point'],
+ 'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'],
+ 'language' => ['type' => 'string', 'index' => 'not_analyzed'],
+ 'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'],
'subject' => [
'type' => 'nested',
'properties' => [
@@ -117,7 +117,6 @@
'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
]
]
- // TODO: add location information
]
]
]
@@ -473,7 +472,7 @@
'date' => (string)$doc->getModified(),
'location' => $this->getLocation($doc),
'creation_date' => $this->getCreationDate($doc),
- 'language' => $doc->getLanguageValue(),
+ 'language' => $doc->getLanguagesValue(),
'discourse_types' => $this->getDiscourseTypes($doc),
'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
'subject' => $this->getSubjects($doc),
@@ -485,14 +484,13 @@
*
- * @return int (1 if sucess, 0 if error)
+ * @return void
*/
- private function indexOne($resultDoc)
+ private function indexOne($docId, $docBody)
{
- $doc = $this->documentRepository->get($resultDoc->getId());
$query_data = [
'index' => config('elasticsearch.index'),
'type' => 'document',
- 'id' => (string)$doc->getId(),
- 'body' => $this->getDocBody($doc)
+ 'id' => $docId,
+ 'body' => $docBody
];
Es::index($query_data);
}
@@ -502,19 +500,18 @@
*
- * @return int (1 if sucess, 0 if error)
+ * @return void
*/
- private function indexBulk($docs)
+ private function indexBulk($docBodies)
{
$query_data = ['body' => []];
- foreach($docs as $resultDoc){
- $doc = $this->documentRepository->get($resultDoc->getId());
+ foreach($docBodies as $docId => $docBody){
$query_data['body'][] = [
'index' => [
'_index' => config('elasticsearch.index'),
'_type' => 'document',
- '_id' => (string)$doc->getId()
+ '_id' => (string)$docId
]
];
- $query_data['body'][] = $this->getDocBody($doc);
+ $query_data['body'][] = $docBody;
}
Es::bulk($query_data);
}
@@ -559,49 +556,46 @@
$this->info('Indexing documents...');
- if ($limit<=0) {
- $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
- $total = $this->documentRepository->getCount();
- $lastPageEntryCount = $stepSize+1;
- }
- else {
- $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage());
- $total = $limit;
- $lastPageEntryCount = $limit % $stepSize;
+ $limit = (int)$limit;
+ $total = $this->documentRepository->getCount();
+
+ if($limit>0) {
+ $total = min($limit, $total);
}
- if ($noBulk)
- {
- $progressBar = $this->output->createProgressBar($total);
- }
- else
- {
- $progressBar = $this->output->createProgressBar($lastPage);
- }
+ $progressBar = $this->output->createProgressBar($total);
$progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
- for ($page=1;$page<=$lastPage;$page++)
- {
- $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);
- if ($noBulk)
- {
- foreach ($docs as $i=>$doc){
- if ($page==$lastPage && $i>=$lastPageEntryCount){
- break;
- }
- $progressBar->setMessage($doc->getId());
- $progressBar->advance();
- $this->indexOne($doc);
+ $page = 0;
+ $lastPage = PHP_INT_MAX;
+ $docIds = [];
+
+ while($page++<$lastPage && ($limit<=0 || count($docIds)<$limit)) {
+ $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph");
+ $lastPage = $docsPaginator->lastPage();
+ $docsBodies = [];
+ foreach($docsPaginator as $docResult) {
+ if($limit>0 && count($docIds)>=$limit) {
+ break;
+ }
+ $docId = (string)$docResult->getId();
+ $progressBar->setMessage($docId);
+ $progressBar->advance();
+ $doc = $this->documentRepository->get($docId);
+ $docBody = $this->getDocBody($doc);
+ if($noBulk) {
+ $this->indexOne($docId, $docBody);
+ } else {
+ $docsBodies[$docId] = $docBody;
}
+ $docIds[] = $docId;
}
- else
- {
- $progressBar->setMessage('Page '.$page);
- $progressBar->advance();
- $this->indexBulk($docs);
+ if(!$noBulk && count($docsBodies)>0) {
+ $this->indexBulk($docsBodies);
}
}
$progressBar->finish();
- $this->info("\nIndexing completed");
+ $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).").");
+
}
}