server/src/app/Console/Commands/IndexDocuments.php
changeset 339 766af1228b05
parent 326 226d5b17a119
child 369 796725d33b67
equal deleted inserted replaced
338:4a3899b6a7ed 339:766af1228b05
     1 <?php
     1 <?php
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
     4 
     4 
       
     5 
       
     6 
     5 use Illuminate\Console\Command;
     7 use Illuminate\Console\Command;
       
     8 use EasyRdf\Resource;
       
     9 use EasyRdf\Literal;
       
    10 use EasyRdf\Graph;
       
    11 
       
    12 use Carbon\Carbon;
       
    13 
     6 use GuzzleHttp\Client;
    14 use GuzzleHttp\Client;
       
    15 use GuzzleHttp\Exception\TransferException;
       
    16 use GuzzleHttp\Psr7;
       
    17 
       
    18 use CorpusParole\Libraries\Utils;
     7 use CorpusParole\Repositories\DocumentRepository;
    19 use CorpusParole\Repositories\DocumentRepository;
     8 use CorpusParole\Libraries\CocoonUtils;
    20 use CorpusParole\Libraries\CocoonUtils;
     9 use CorpusParole\Models\GeonamesHierarchy;
    21 use CorpusParole\Models\GeonamesHierarchy;
       
    22 use CorpusParole\Services\BnfResolverInterface;
       
    23 use CorpusParole\Services\LexvoResolverInterface;
    10 use Es;
    24 use Es;
       
    25 use Log;
       
    26 use Cache;
    11 
    27 
    12 class IndexDocuments extends Command
    28 class IndexDocuments extends Command
    13 {
    29 {
    14 
    30 
    15     /**
    31     /**
    18      * @var string
    34      * @var string
    19      */
    35      */
    protected $signature = 'corpus-parole:indexDocuments
                          {--limit=0 : index only the first n documents, 0 (default) means index everything }
                          {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
                          {--step-size=100 : number of documents to retrieve from repository at a time before indexing}
                          {--reset-geo-cache : reset geo cache before indexing}';
    24 
    41 
    25     /**
    42     /**
    26      * The console command description.
    43      * The console command description.
    27      *
    44      *
    28      * @var string
    45      * @var string
    32     /**
    49     /**
    33      * Create a new command instance.
    50      * Create a new command instance.
    34      *
    51      *
    35      * @return void
    52      * @return void
    36      */
    53      */
    public function __construct(
        DocumentRepository $documentRepository,
        Client $httpClient,
        BnfResolverInterface $bnfResolver,
        LexvoResolverInterface $lexvoResolver
    ) {
        // Keep a handle on every injected collaborator before the base
        // command is initialised.
        $this->documentRepository = $documentRepository;
        $this->httpClient = $httpClient;
        $this->bnfResolver = $bnfResolver;
        $this->lexvoResolver = $lexvoResolver;

        parent::__construct();
    }
    43 
    66 
    44 
    67 
    63 
    86 
    64         $indexParams['body'] = [
    87         $indexParams['body'] = [
    65             'settings' => [
    88             'settings' => [
    66                 'number_of_shards' => config('elasticsearch.shards'),
    89                 'number_of_shards' => config('elasticsearch.shards'),
    67                 'number_of_replicas' => config('elasticsearch.replicas'),
    90                 'number_of_replicas' => config('elasticsearch.replicas'),
    68                 'index.mapping.ignore_malformed' => True
    91                 'index.mapping.ignore_malformed' => True,
       
    92                 'index.requests.cache.enable' => True
    69             ],
    93             ],
    70             'mappings' => [
    94             'mappings' => [
    71                 'document' => [
    95                 'document' => [
    72                     'properties' => [
    96                     'properties' => [
    73                         'title' => [
    97                         'title' => [
    77                                     'type' => 'string',
   101                                     'type' => 'string',
    78                                     'index' => 'not_analyzed'
   102                                     'index' => 'not_analyzed'
    79                                 ]
   103                                 ]
    80                             ]
   104                             ]
    81                         ],
   105                         ],
    82                         'date' => [ 'type' => 'date' ],
   106                         'date' => [ 'type' => 'date', 'index' => 'not_analyzed'],
    83                         'geonames_hyerarchy' => [ 'type' => 'string' ],
   107                         'geonames_hyerarchy' => [ 'type' => 'string', 'index' => 'not_analyzed'],
    84                         'location' => [ 'type' => 'geo_point' ]
   108                         'location' => [ 'type' => 'geo_point'],
    85                         // TODO: add location information
   109                         'creation_date' => ['type' => 'date', 'index' => 'not_analyzed'],
       
   110                         'language' => ['type' => 'string', 'index' => 'not_analyzed'],
       
   111                         'discourse_types' => ['type' => 'string', 'index' => 'not_analyzed'],
       
   112                         'subject' => [
       
   113                             'type' => 'nested',
       
   114                             'properties' => [
       
   115                                 'label' => [ 'type' => 'string', 'index' => 'not_analyzed'],
       
   116                                 'code' => [ 'type' => 'string', 'index' => 'not_analyzed'],
       
   117                                 'label_code' => [ 'type' => 'string', 'index' => 'not_analyzed']
       
   118                             ]
       
   119                         ]
    86                     ]
   120                     ]
    87                 ]
   121                 ]
    88             ]
   122             ]
    89         ];
   123         ];
    90         $response = Es::indices()->create($indexParams);
   124         $response = Es::indices()->create($indexParams);
    94         return 1;
   128         return 1;
    95     }
   129     }
    96 
   130 
    97 
   131 
    98     private function getGeonamesHierarchyArray($geonamesid) {
   132     private function getGeonamesHierarchyArray($geonamesid) {
    99         // TODO: Manage this cache !!!
   133 
   100         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
   134         $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
   101         if(is_null($hcache)) {
   135         if(is_null($hcache)) {
   102 
   136 
   103             // TODO: add delay to respect geonames 2k request/hour
   137             // TODO: add delay to respect geonames 2k request/hour
   104             // TODO: manage errors
   138             // TODO: manage errors
   110                       'username' => config('corpusparole.geonames_username') ],
   144                       'username' => config('corpusparole.geonames_username') ],
   111                   'accept' => 'application/json' // TODO: check this
   145                   'accept' => 'application/json' // TODO: check this
   112                 ]
   146                 ]
   113             )->getBody();
   147             )->getBody();
   114             $hjson = json_decode($apiBody);
   148             $hjson = json_decode($apiBody);
   115             $hcache = new GeonamesHierarchy;
   149             $hcache = new GeonamesHierarchy();
   116             $hcache->geonamesid = $geonamesid;
   150             $hcache->geonamesid = $geonamesid;
   117             $hcache->hierarchy = $hjson;
   151             $hcache->hierarchy = $hjson;
   118             $hcache->save();
   152             $hcache->save();
   119         }
   153         }
   120 
   154 
   122         foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
   156         foreach($hcache->hierarchy['geonames'] as $hierarchyElem) {
   123             if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) {
   157             if(in_array($hierarchyElem['fcode'], ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'])) {
   124                 array_push($res, $hierarchyElem['geonameId']);
   158                 array_push($res, $hierarchyElem['geonameId']);
   125             }
   159             }
   126         }
   160         }
   127 
       
   128         return $res;
   161         return $res;
   129 
   162 
   130     }
   163     }
   131 
   164 
   132     /**
   165     /**
   151         return $res;
   184         return $res;
   152 
   185 
   153     }
   186     }
   154 
   187 
   155     /**
   188     /**
       
    189      * Get subjects as { 'label': label, 'code': code, 'label_code': 'label|type|code' } objects.
        
    190      * Handles BnF and Lexvo resources as well as OLAC-typed and plain literals.
       
   191      */
       
   192     private function getSubjects($doc) {
       
   193 
       
   194         $sres = array_reduce($doc->getSubjects(), function($res, $s) {
       
   195             $mBnf = [];
       
   196             $mLexvo = [];
       
   197 
       
   198             if($s instanceof Resource && preg_match(config('corpusparole.bnf_ark_url_regexp'), $s->getUri(), $mBnf) === 1) {
       
   199 
       
   200                 array_push($res, [
       
   201                     'uri' => $mBnf[0],
       
   202                     'code' => $mBnf[1],
       
   203                     'type' => 'bnf'
       
   204                 ]);
       
   205             } elseif($s instanceof Resource && preg_match(config('corpusparole.lexvo_url_regexp'), $s->getUri(), $mLexvo) === 1) {
       
   206                 array_push($res, [
       
   207                     'uri' => $mLexvo[0],
       
   208                     'code' => $mLexvo[1],
       
   209                     'type' => 'lxv'
       
   210                 ]);
       
   211             } elseif($s instanceof Literal && strpos($s->getDatatypeUri(), config('corpusparole.olac_base_url')) === 0 ) {
       
   212                 array_push($res, [
       
   213                     'uri' => $s->getValue(),
       
   214                     'code' => $s->getValue(),
       
   215                     'type' => 'olac'
       
   216                 ]);
       
   217             } elseif($s instanceof Literal) {
       
   218                 array_push($res, [
       
   219                     'uri' => $s->getValue(),
       
   220                     'code' => $s->getValue(),
       
   221                     'type' => 'txt'
       
   222                 ]);
       
   223             }
       
   224             return $res;
       
   225         }, []);
       
   226 
       
   227         $labelsBnf = $this->bnfResolver->getLabels(
       
   228             array_unique(array_reduce(
       
   229                 $sres,
       
   230                 function($r, $so) {
       
   231                     if($so['type'] === 'bnf') {
       
   232                         array_push($r, $so['uri']);
       
   233                     }
       
   234                     return $r;
       
   235                 },[]
       
   236             ))
       
   237         );
       
   238         $labelsLexvo = $this->lexvoResolver->getNames(
       
   239             array_unique(array_reduce(
       
   240                 $sres,
       
   241                 function($r, $so) {
       
   242                     if($so['type'] === 'lxv') {
       
   243                         array_push($r, $so['uri']);
       
   244                     }
       
   245                     return $r;
       
   246                 },[]
       
   247             ))
       
   248         );
       
   249 
       
   250         return array_map(function($so) use ($labelsBnf, $labelsLexvo) {
       
   251             $label = $so['uri'];
       
   252             if($so['type'] === 'bnf') {
       
   253                 $label = $labelsBnf[$label];
       
   254             } elseif ($so['type'] === 'lxv') {
       
   255                 $label = $labelsLexvo[$label];
       
   256             }
       
   257             return [ 'label' => $label, 'code' => $so['code'], 'label_code' =>  $label."|".$so['type']."|".$so['code'] ]; }, $sres
       
   258         );
       
   259     }
       
   260 
       
   261     private function graphResolvCoordinate($loc, $graph) {
       
   262         $latLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#lat>");
       
   263         if(is_null($latLit) || empty($latLit->getValue())) {
       
   264             return null;
       
   265         }
       
   266         $lat = $latLit->getValue();
       
   267 
       
   268         $longLit = $graph->getLiteral($loc, "<http://www.w3.org/2003/01/geo/wgs84_pos#long>");
       
   269         if(is_null($longLit) || empty($longLit->getValue())) {
       
   270             return null;
       
   271         }
       
   272         $long = $longLit->getValue();
       
   273 
       
   274         return [ $lat, $long ];
       
   275     }
       
   276 
       
   277     private function loadGraph($url, $type) {
       
   278         try {
       
   279             $r = $this->httpClient->get($url);
       
   280         } catch (TransferException $e) {
       
   281             $this->error("loadGraph : Error Loading $url");
       
   282             Log::error("loadGraph : Error Loading $url");
       
   283             Log::error("loadGraph : Error request " . Psr7\str($e->getRequest()));
       
   284             if ($e->hasResponse()) {
       
   285                 $this->error("loadGraph : Error response " . Psr7\str($e->getResponse()));
       
   286                 Log::error("loadGraph : Error response " . Psr7\str($e->getResponse()));
       
   287             }
       
   288             return null;
       
   289         }
       
   290         try {
       
   291             $message = (string)$r->getBody();
       
   292             $graph = new Graph($url, $message, $type);
       
   293             return $graph;
       
   294         } catch (EasyRdf\Exception $e) {
       
   295             $this->error("loadGraph : Error parsing $url");
       
   296             Log::error("loadGraph : Error parsing $url");
       
   297             if($e instanceof EasyRdf\Parser\Exception) {
       
   298                 Log::error("loadGraph : Error exception line ".$e->getLine().", column: ".$e->getColumn());
       
   299             }
       
   300             $this->error("loadGraph : Error exception message ".$e->getMessage());
       
   301             Log::error("loadGraph : Error exception message ".$e->getMessage());
       
   302             Log::error("loadGraph : Error content $message");
       
   303             return null;
       
   304         }
       
   305 
       
   306     }
       
   307 
       
   308     private function geonamesResolveCoordinates($loc) {
       
   309         $coords = cache("corpus.geonames.coord.$loc");
       
   310         if(is_null($coords)) {
       
   311             $graph = $this->loadGraph("{$loc}about.rdf", 'rdfxml');
       
   312             $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
       
   313             cache(["corpus.geonames.coord.$loc" => is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
       
   314         }
       
   315         return ($coords===false)?null:$coords;
       
   316     }
       
   317 
       
   318     private function dbpediaResolveCoordinates($loc) {
       
   319         $coords = cache("corpus.dbpedia.coord.$loc");
       
   320         if(is_null($coords)) {
       
   321             $graph = $this->loadGraph("$loc.rdf", 'rdfxml');
       
   322             $coords = is_null($graph)?null:$this->graphResolvCoordinate($loc, $graph);
       
   323             cache(["corpus.dbpedia.coord.$loc"=> is_null($coords)?false:$coords], Carbon::now()->addMinutes(20));
       
   324         }
       
   325         return ($coords===false)?null:$coords;
       
   326     }
       
   327 
       
   328     private function getLocation($doc) {
       
   329 
       
   330         $geoRes = $doc->getGeoInfo();
       
   331 
       
   332         if(is_null($geoRes)) {
       
   333             return null;
       
   334         }
       
   335 
       
   336         $locUrls = [];
       
   337         foreach($geoRes->getRefLocs() as $loc) {
       
   338             if(preg_match(config('corpusparole.geonames_url_regexp'), $loc, $m) === 1) {
       
   339 
       
   340                 if(!array_key_exists('geonames', $locUrls)) {
       
   341                     $locUrls['geonames'] = [];
       
   342                 }
       
   343                 array_push($locUrls['geonames'], "http://sws.geonames.org/$m[1]/");
       
   344 
       
   345             } elseif(preg_match(config('corpusparole.dbpedia_url_regexp'), $loc, $md) === 1) {
       
   346                 if(!array_key_exists('dbpedia', $locUrls)) {
       
   347                     $locUrls['dbpedia'] = [];
       
   348                 }
       
   349                 //$this->line("DBPEDIA MATCH $loc ".print_r($md,true));
       
   350                 array_push($locUrls['dbpedia'], "http://$md[1]/data/$md[4]");
       
   351             }
       
   352         }
       
   353 
       
   354         $coordinates = null;
       
   355         foreach($locUrls as $locType => $locList) {
       
   356             foreach($locList as $locationUrl) {
       
   357                 $coordinates = call_user_func([$this, "${locType}ResolveCoordinates"], $locationUrl);
       
   358                 if(!is_null($coordinates)) {
       
   359                     break;
       
   360                 }
       
   361             }
       
   362         }
       
   363 
       
   364         if(is_null($coordinates)) {
       
   365             $coordinates = [$geoRes->getLatitudeValue(), $geoRes->getLongitudeValue()];
       
   366         }
       
   367 
       
   368         if(empty($coordinates[0]) || empty($coordinates[1])) {
       
   369             return null;
       
   370         } else {
       
   371             return [floatval($coordinates[0]), floatval($coordinates[1])];
       
   372         }
       
   373 
       
   374     }
       
   375 
       
   376     private function getCreationDate($doc) {
       
   377 
       
   378         $created = $doc->getCreated();
       
   379         if(is_null($created)) {
       
   380             return null;
       
   381         }
       
   382         $dateType = $created->getDatatypeUri();
       
   383         $res = null;
       
   384 
       
   385         if($dateType === "http://purl.org/dc/terms/Period") {
       
   386             $res = $this->processPeriod($created->getValue());
       
   387         }
       
   388         elseif($dateType === "http://purl.org/dc/terms/W3CDTF") {
       
   389             $res = $this->processDate($created->getValue());
       
   390         }
       
   391 
       
   392         return $res;
       
   393 
       
   394     }
       
   395 
       
   396     private function extractDate($dateStr) {
       
   397         if(preg_match("/^\\d{4}$/", $dateStr) === 1) {
       
   398             $dateStr = "$dateStr-1-1";
       
   399         }
       
   400         $date = date_create($dateStr);
       
   401         if($date === false ) {
       
   402             Log::warning("DateStatsController:extractYear bad format for date $dateStr");
       
   403             return null;
       
   404         }
       
   405         return $date;
       
   406     }
       
   407 
       
   408     private function processPeriod($periodStr) {
       
   409         $start = null;
       
   410         $end = null;
       
   411         foreach(explode(";", $periodStr) as $elem) {
       
   412             $elem = trim($elem);
       
   413             if(strpos($elem, 'start=') === 0) {
       
   414                 $startDate = $this->extractDate(trim(substr($elem, 6)));
       
   415                 if(is_null($startDate)) {
       
   416                     return null;
       
   417                 }
       
   418                 $start = intval($startDate->format("Y"));
       
   419                 if($start === false) {
       
   420                     return null;
       
   421                 }
       
   422             } elseif(strpos($elem, 'end=') === 0) {
       
   423                 $endDate = $this->extractDate(trim(substr($elem, 4)));
       
   424                 if(is_null($endDate)) {
       
   425                     return null;
       
   426                 }
       
   427                 $end = intval($endDate->format("Y"));
       
   428                 if($end === false) {
       
   429                     return null;
       
   430                 }
       
   431             }
       
   432         }
       
   433 
       
   434         if(is_null($start) || is_null($end) || $start>$end ) {
       
   435             Log::warning("Bad format for $periodStr");
       
   436             return null;
       
   437         }
       
   438 
       
   439         return array_map(function($y) {
       
   440             return \DateTime::createFromFormat("Y", "$y")->format(\DateTime::W3C);
       
   441         }, range($start, $end));
       
   442     }
       
   443 
       
   444     private function processDate($dateStr) {
       
   445         $date = $this->extractDate($dateStr);
       
   446         if(is_null($date))  {
       
   447             return null;
       
   448         } else {
       
   449             return $date->format(\DateTime::W3C);
       
   450         }
       
   451     }
       
   452 
       
   453     private function getDiscourseTypes($doc) {
       
   454         return array_reduce($doc->getDiscourseTypes(), function($res, $d) {
       
   455             $val = null;
       
   456             if($d instanceof Resource) {
       
   457                 $val = $d->getUri();
       
   458             } elseif($d instanceof Literal) {
       
   459                 $datatype = $d->getDatatypeURI();
       
   460                 $val = (!empty($datatype)?"$datatype#":"").$d->getValue();
       
   461             }
       
   462             if(!empty($val)) {
       
   463                 array_push($res,$val);
       
   464             }
       
   465             return $res;
       
   466         }, []);
       
   467     }
       
   468 
       
   469     private function getDocBody($doc) {
       
   470         return [
       
   471             'title' => (string)$doc->getTitle(),
       
   472             'date' => (string)$doc->getModified(),
       
   473             'location' => $this->getLocation($doc),
       
   474             'creation_date' => $this->getCreationDate($doc),
       
   475             'language' => $doc->getLanguagesValue(),
       
   476             'discourse_types' => $this->getDiscourseTypes($doc),
       
   477             'geonames_hierarchy' => $this->getGeonamesHierarchy($doc),
       
   478             'subject' => $this->getSubjects($doc),
       
   479         ];
       
   480     }
       
   481 
       
   482     /**
   156      * Index one document into Elasticsearch
   483      * Index one document into Elasticsearch
   157      *
   484      *
   158      * @return int (1 if sucess, 0 if error)
   485      * @return int (1 if sucess, 0 if error)
   159      */
   486      */
   160     private function indexOne($resultDoc)
   487     private function indexOne($docId, $docBody)
   161     {
   488     {
   162         $doc = $this->documentRepository->get($resultDoc->getId());
       
   163         $query_data = [
   489         $query_data = [
   164             'index' => config('elasticsearch.index'),
   490             'index' => config('elasticsearch.index'),
   165             'type' => 'document',
   491             'type' => 'document',
   166             'id' => (string)$doc->getId(),
   492             'id' => $docId,
   167             'body' => [
   493             'body' => $docBody
   168                 'title' => (string)$doc->getTitle(),
       
   169                 'date' => (string)$doc->getModified(),
       
   170                 'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
       
   171             ]
       
   172         ];
   494         ];
   173         Es::index($query_data);
   495         Es::index($query_data);
   174     }
   496     }
   175 
   497 
   176     /**
   498     /**
   177      * Index multiple document into Elasticsearch
   499      * Index multiple document into Elasticsearch
   178      *
   500      *
   179      * @return int (1 if sucess, 0 if error)
   501      * @return int (1 if sucess, 0 if error)
   180      */
   502      */
   181      private function indexBulk($docs)
   503      private function indexBulk($docBodies)
   182      {
   504      {
   183           $query_data = ['body' => []];
   505           $query_data = ['body' => []];
   184           foreach($docs as $doc){
   506           foreach($docBodies as $docId => $docBody){
   185               $query_data['body'][] = [
   507               $query_data['body'][] = [
   186                   'index' => [
   508                   'index' => [
   187                       '_index' => config('elasticsearch.index'),
   509                       '_index' => config('elasticsearch.index'),
   188                       '_type' => 'document',
   510                       '_type' => 'document',
   189                       '_id' => (string)$doc->getId()
   511                       '_id' => $docId
   190                   ]
   512                   ]
   191               ];
   513               ];
   192               $query_data['body'][] = [
   514               $query_data['body'][] = $docBody;
   193                   'title' => (string)$doc->getTitle(),
       
   194                   'date' => (string)$doc->getModified()
       
   195               ];
       
   196           }
   515           }
   197           Es::bulk($query_data);
   516           Es::bulk($query_data);
   198      }
   517      }
   199     /**
   518     /**
   200      * Execute the console command.
   519      * Execute the console command.
   218             $this->comment(' - Indexing only the first '.$limit.' documents');
   537             $this->comment(' - Indexing only the first '.$limit.' documents');
   219         }
   538         }
   220         $stepSize = $this->option('step-size');
   539         $stepSize = $this->option('step-size');
   221         $this->comment(' - Indexing with step size of '.$stepSize);
   540         $this->comment(' - Indexing with step size of '.$stepSize);
   222 
   541 
       
   542         if($this->option('reset-geo-cache', false)) {
       
   543             // delete all rows in GeonamesHierarchy
       
   544             GeonamesHierarchy::getQuery()->delete();
       
   545             $this->comment('Geonames cache reset!');
       
   546         }
       
   547 
   223         $this->info('Resetting index...');
   548         $this->info('Resetting index...');
   224         $success = $this->resetIndex();
   549         $success = $this->resetIndex();
   225         if($success==1){
   550         if($success==1){
   226             $this->comment('Index reset!');
   551             $this->comment('Index reset!');
   227         }
   552         }
   229             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
   554             $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
   230         }
   555         }
   231 
   556 
   232         $this->info('Indexing documents...');
   557         $this->info('Indexing documents...');
   233 
   558 
   234         if ($limit<=0) {
   559         $limit = (int)$limit;
   235             $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
   560         $total = $this->documentRepository->getCount();
   236             $total = $this->documentRepository->getCount();
   561 
   237             $lastPageEntryCount = $stepSize+1;
   562         if($limit>0) {
   238         }
   563             $total = min($limit, $total);
   239         else {
   564         }
   240             $lastPage = min((int)($limit/$stepSize)+1, $this->documentRepository->paginateAll($stepSize, 'page')->lastPage());
   565 
   241             $total = $limit;
   566         $progressBar = $this->output->createProgressBar($total);
   242             $lastPageEntryCount = $limit % $stepSize;
       
   243         }
       
   244 
       
   245         if ($noBulk)
       
   246         {
       
   247             $progressBar = $this->output->createProgressBar($total);
       
   248         }
       
   249         else
       
   250         {
       
   251             $progressBar = $this->output->createProgressBar($lastPage);
       
   252         }
       
   253         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   567         $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
   254 
   568 
   255         for ($page=1;$page<=$lastPage;$page++)
   569         $page = 0;
   256         {
   570         $lastPage = PHP_INT_MAX;
   257             $docs = $this->documentRepository->paginateAll($stepSize, 'page', $page);
   571         $docIds = [];
   258             if ($noBulk)
   572 
   259             {
   573         while($page++<$lastPage) {
   260                 foreach ($docs as $i=>$doc){
   574             $docsPaginator = $this->documentRepository->paginate(null, $stepSize, config('corpusparole.pagination_page_param'), $page, "_graph");
   261                     if ($page==$lastPage && $i>=$lastPageEntryCount){
   575             $lastPage = $docsPaginator->lastPage();
   262                         break;
   576             $docsBodies = [];
   263                     }
   577             foreach($docsPaginator as $docResult) {
   264                     $this->indexOne($doc);
   578                 $docId = (string)$docResult->getId();
   265                     $progressBar->advance();
   579                 $progressBar->setMessage($docId);
   266                     $progressBar->setMessage($doc->getId());
       
   267                 }
       
   268             }
       
   269             else
       
   270             {
       
   271                 $this->indexBulk($docs);
       
   272                 $progressBar->advance();
   580                 $progressBar->advance();
   273                 $progressBar->setMessage('Page '.$page);
   581                 $doc = $this->documentRepository->get($docId);
       
   582                 $docBody = $this->getDocBody($doc);
       
   583                 if($noBulk) {
       
   584                     $this->indexOne($docId, $docBody);
       
   585                 } else {
       
   586                     $docsBodies[$docId] = $docBody;
       
   587                 }
       
   588                 $docIds[] = $docId;
       
   589             }
       
   590             if(!$noBulk) {
       
   591                 $this->indexBulk($docsBodies);
   274             }
   592             }
   275         }
   593         }
   276         $progressBar->finish();
   594         $progressBar->finish();
   277         $this->info('Indexing completed');
   595         $this->info("\nIndexing completed for " . count(array_unique($docIds))." documents (of ".count($docIds).").");
       
   596 
   278     }
   597     }
   279 }
   598 }