101 'type' => 'string', |
101 'type' => 'string', |
102 'index' => 'not_analyzed' |
102 'index' => 'not_analyzed' |
103 ] |
103 ] |
104 ] |
104 ] |
105 ], |
105 ], |
106 'date' => [ 'type' => 'date' ], |
106 'date' => [ 'type' => 'date', 'index' => 'not_analyzed'], |
107 'geonames_hyerarchy' => [ 'type' => 'string' ], |
107 'geonames_hyerarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
108 'location' => [ 'type' => 'geo_point' ], |
108 'location' => [ 'type' => 'geo_point'], |
109 'creation_date' => ['type' => 'date'], |
109 'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'], |
110 'language' => ['type' => 'string'], |
110 'language' => ['type' => 'string', 'index' => 'not_analyzed'], |
111 'discourse_types' => ['type' => 'string'], |
111 'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'], |
112 'subject' => [ |
112 'subject' => [ |
113 'type' => 'nested', |
113 'type' => 'nested', |
114 'properties' => [ |
114 'properties' => [ |
115 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
115 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
116 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
116 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'], |
117 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] |
117 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed'] |
118 ] |
118 ] |
119 ] |
119 ] |
120 // TODO: add location information |
|
121 ] |
120 ] |
122 ] |
121 ] |
123 ] |
122 ] |
124 ]; |
123 ]; |
125 $response = Es::indices()->create($indexParams); |
124 $response = Es::indices()->create($indexParams); |
471 return [ |
470 return [ |
472 'title' => (string)$doc->getTitle(), |
471 'title' => (string)$doc->getTitle(), |
473 'date' => (string)$doc->getModified(), |
472 'date' => (string)$doc->getModified(), |
474 'location' => $this->getLocation($doc), |
473 'location' => $this->getLocation($doc), |
475 'creation_date' => $this->getCreationDate($doc), |
474 'creation_date' => $this->getCreationDate($doc), |
476 'language' => $doc->getLanguageValue(), |
475 'language' => $doc->getLanguagesValue(), |
477 'discourse_types' => $this->getDiscourseTypes($doc), |
476 'discourse_types' => $this->getDiscourseTypes($doc), |
478 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), |
477 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc), |
479 'subject' => $this->getSubjects($doc), |
478 'subject' => $this->getSubjects($doc), |
480 ]; |
479 ]; |
481 } |
480 } |
483 /** |
482 /** |
484 * Index one document into Elasticsearch |
483 * Index one document into Elasticsearch |
485 * |
484 * |
486 * @return int (1 if success, 0 if error) |
485 * @return int (1 if success, 0 if error) |
487 */ |
486 */ |
488 private function indexOne($resultDoc) |
487 private function indexOne($docId, $docBody) |
489 { |
488 { |
490 $doc = $this->documentRepository->get($resultDoc->getId()); |
|
491 $query_data = [ |
489 $query_data = [ |
492 'index' => config('elasticsearch.index'), |
490 'index' => config('elasticsearch.index'), |
493 'type' => 'document', |
491 'type' => 'document', |
494 'id' => (string)$doc->getId(), |
492 'id' => $docId, |
495 'body' => $this->getDocBody($doc) |
493 'body' => $docBody |
496 ]; |
494 ]; |
497 Es::index($query_data); |
495 Es::index($query_data); |
498 } |
496 } |
499 |
497 |
500 /** |
498 /** |
501 * Index multiple documents into Elasticsearch |
499 * Index multiple documents into Elasticsearch |
502 * |
500 * |
503 * @return int (1 if success, 0 if error) |
501 * @return int (1 if success, 0 if error) |
504 */ |
502 */ |
505 private function indexBulk($docs) |
503 private function indexBulk($docBodies) |
506 { |
504 { |
507 $query_data = ['body' => []]; |
505 $query_data = ['body' => []]; |
508 foreach($docs as $resultDoc){ |
506 foreach($docBodies as $docId => $docBody){ |
509 $doc = $this->documentRepository->get($resultDoc->getId()); |
|
510 $query_data['body'][] = [ |
507 $query_data['body'][] = [ |
511 'index' => [ |
508 'index' => [ |
512 '_index' => config('elasticsearch.index'), |
509 '_index' => config('elasticsearch.index'), |
513 '_type' => 'document', |
510 '_type' => 'document', |
514 '_id' => (string)$doc->getId() |
511 '_id' => $docId |
515 ] |
512 ] |
516 ]; |
513 ]; |
517 $query_data['body'][] = $this->getDocBody($doc); |
514 $query_data['body'][] = $docBody; |
518 } |
515 } |
519 Es::bulk($query_data); |
516 Es::bulk($query_data); |
520 } |
517 } |
521 /** |
518 /** |
522 * Execute the console command. |
519 * Execute the console command. |
557 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |
554 $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX')); |
558 } |
555 } |
559 |
556 |
560 $this->info('Indexing documents...'); |
557 $this->info('Indexing documents...'); |
561 |
558 |
562 if ($limit<=0) { |
559 $limit = (int)$limit; |
563 $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage(); |
560 $total = $this->documentRepository->getCount(); |
564 $total = $this->documentRepository->getCount(); |
561 |
565 $lastPageEntryCount = $stepSize+1; |
562 if($limit>0) { |
566 } |
563 $total = min($limit, $total); |
567 else { |
564 } |
568 $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage()); |
565 |
569 $total = $limit; |
566 $progressBar = $this->output->createProgressBar($total); |
570 $lastPageEntryCount = $limit % $stepSize; |
|
571 } |
|
572 |
|
573 if ($noBulk) |
|
574 { |
|
575 $progressBar = $this->output->createProgressBar($total); |
|
576 } |
|
577 else |
|
578 { |
|
579 $progressBar = $this->output->createProgressBar($lastPage); |
|
580 } |
|
581 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
567 $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%'); |
582 |
568 |
583 for ($page=1;$page<=$lastPage;$page++) |
569 $page = 0; |
584 { |
570 $lastPage = PHP_INT_MAX; |
585 $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page); |
571 $docIds = []; |
586 if ($noBulk) |
572 |
587 { |
573 while($page++<$lastPage) { |
588 foreach ($docs as $i=>$doc){ |
574 $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph"); |
589 if ($page==$lastPage && $i>=$lastPageEntryCount){ |
575 $lastPage = $docsPaginator->lastPage(); |
590 break; |
576 $docsBodies = []; |
591 } |
577 foreach($docsPaginator as $docResult) { |
592 $progressBar->setMessage($doc->getId()); |
578 $docId = (string)$docResult->getId(); |
593 $progressBar->advance(); |
579 $progressBar->setMessage($docId); |
594 $this->indexOne($doc); |
|
595 } |
|
596 } |
|
597 else |
|
598 { |
|
599 $progressBar->setMessage('Page '.$page); |
|
600 $progressBar->advance(); |
580 $progressBar->advance(); |
601 $this->indexBulk($docs); |
581 $doc = $this->documentRepository->get($docId); |
|
582 $docBody = $this->getDocBody($doc); |
|
583 if($noBulk) { |
|
584 $this->indexOne($docId, $docBody); |
|
585 } else { |
|
586 $docsBodies[$docId] = $docBody; |
|
587 } |
|
588 $docIds[] = $docId; |
|
589 } |
|
590 if(!$noBulk) { |
|
591 $this->indexBulk($docsBodies); |
602 } |
592 } |
603 } |
593 } |
604 $progressBar->finish(); |
594 $progressBar->finish(); |
605 $this->info("\nIndexing completed"); |
595 $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).")."); |
|
596 |
606 } |
597 } |
607 } |
598 } |