server/src/app/Console/Commands/IndexDocuments.php
changeset 314 f5690d918358
parent 308 e032d686d88e
child 320 0fce13da58af
equal deleted inserted replaced
313:706f10bcdc3c 314:f5690d918358
     1 <?php
     1 <?php
     2 
     2 
     3 namespace CorpusParole\Console\Commands;
     3 namespace CorpusParole\Console\Commands;
     4 
     4 
     5 use Illuminate\Console\Command;
     5 use Illuminate\Console\Command;
       
     6 use GuzzleHttp\Client;
     6 use CorpusParole\Repositories\DocumentRepository;
     7 use CorpusParole\Repositories\DocumentRepository;
       
     8 use CorpusParole\Libraries\CocoonUtils;
       
     9 use CorpusParole\Models\GeonamesHierarchy;
     7 use Es;
    10 use Es;
     8 
    11 
     9 class IndexDocuments extends Command
    12 class IndexDocuments extends Command
    10 {
    13 {
    11 
    14 
    /**
     * Create a new command instance.
     *
     * Both dependencies are kept on the instance before the parent
     * constructor runs.
     *
     * @param DocumentRepository $documentRepository repository used to load the documents to index
     * @param Client $httpClient HTTP client used to query the geonames hierarchy web service
     * @return void
     */
    public function __construct(DocumentRepository $documentRepository, Client $httpClient)
    {
        $this->documentRepository = $documentRepository;
        $this->httpClient = $httpClient;

        parent::__construct();
    }
    39 
    43 
    40 
    44 
    41 
    45 
    53             $response = Es::indices()->delete($indexParams);
    57             $response = Es::indices()->delete($indexParams);
    54             if($response['acknowledged']!=1){
    58             if($response['acknowledged']!=1){
    55                 return 0;
    59                 return 0;
    56             }
    60             }
    57         }
    61         }
       
    62         // Note: removed the "'store' => True" parameters on fields and use _source on record instead
       
    63 
    58         $indexParams['body'] = [
    64         $indexParams['body'] = [
    59             'settings' => [
    65             'settings' => [
    60                 'number_of_shards' => conf('elasticsearch.shards'),
    66                 'number_of_shards' => config('elasticsearch.shards'),
    61                 'number_of_replicas' => conf('elasticsearch.replicas'),
    67                 'number_of_replicas' => config('elasticsearch.replicas'),
    62                 'index.mapping.ignore_malformed' => True
    68                 'index.mapping.ignore_malformed' => True
    63             ],
    69             ],
    64             'mappings' => [
    70             'mappings' => [
    65                 'document' => [
    71                 'document' => [
    66                     'properties' => [
    72                     'properties' => [
    67                         'title' => [
    73                         'title' => [
    68                             'type' => 'string',
    74                             'type' => 'string',
    69                             'store' => True,
       
    70                             'fields' => [
    75                             'fields' => [
    71                                 'raw' => [
    76                                 'raw' => [
    72                                     'type' => 'string',
    77                                     'type' => 'string',
    73                                     'index' => 'not_analyzed'
    78                                     'index' => 'not_analyzed'
    74                                 ]
    79                                 ]
    75                             ]
    80                             ]
    76                         ],
    81                         ],
    77                         'date' => [
    82                         'date' => [ 'type' => 'date' ],
    78                             'type' => 'date',
    83                         'geonames_hyerarchy' => [ 'type' => 'string' ],
    79                             'store' => True
    84                         'location' => [ 'type' => 'geo_point' ]
    80                         ]
    85                         // TODO: add location information
    81                     ]
    86                     ]
    82                 ]
    87                 ]
    83             ]
    88             ]
    84         ];
    89         ];
    85         $response = Es::indices()->create($indexParams);
    90         $response = Es::indices()->create($indexParams);
    87             return 0;
    92             return 0;
    88         }
    93         }
    89         return 1;
    94         return 1;
    90     }
    95     }
    91 
    96 
       
    97 
       
    /**
     * Return the ids of the indexable levels of a Geonames place hierarchy.
     *
     * The hierarchy is looked up in the GeonamesHierarchy cache first. On a
     * miss it is fetched from the configured Geonames hierarchy web service
     * and stored in the cache. Only elements whose feature code is a
     * continent (CONT), a political entity (PCL*) or a first-order
     * administrative division (ADM1) are kept.
     *
     * @param mixed $geonamesid Geonames identifier of the place
     * @return array list of geonames ids; empty when no usable hierarchy is available
     */
    private function getGeonamesHierarchyArray($geonamesid) {
        // TODO: Manage this cache !!!
        $hcache = GeonamesHierarchy::where('geonamesid', $geonamesid)->first();
        if(is_null($hcache)) {

            // TODO: add delay to respect geonames 2k request/hour

            $apiBody = $this->httpClient->get(
                config('corpusparole.geonames_hierarchy_webservice_url'),
                [
                    'query' => [
                        'geonameId' => $geonamesid,
                        'username' => config('corpusparole.geonames_username')
                    ],
                    // Guzzle only sends request headers given under 'headers';
                    // a top-level 'accept' entry is not a request option.
                    'headers' => [ 'Accept' => 'application/json' ]
                ]
            )->getBody();

            // Decode to nested arrays: the hierarchy is read with array access below.
            $hjson = json_decode((string)$apiBody, true);
            if(!is_array($hjson) || !isset($hjson['geonames']) || !is_array($hjson['geonames'])) {
                // Answer without a 'geonames' list (invalid JSON, error payload...):
                // do not cache it, so that a later run can retry.
                return [];
            }

            $hcache = new GeonamesHierarchy;
            $hcache->geonamesid = $geonamesid;
            $hcache->hierarchy = $hjson;
            $hcache->save();
        }

        $hierarchy = $hcache->hierarchy;
        if(!is_array($hierarchy) || !isset($hierarchy['geonames']) || !is_array($hierarchy['geonames'])) {
            return [];
        }

        $keptFcodes = ['CONT','PCLI', 'PCL','PCLD', 'PCLF', 'PCLH', 'PCLIX', 'PCLIS', 'ADM1'];
        $res = [];
        foreach($hierarchy['geonames'] as $hierarchyElem) {
            if(!isset($hierarchyElem['fcode']) || !isset($hierarchyElem['geonameId'])) {
                continue;
            }
            if(in_array($hierarchyElem['fcode'], $keptFcodes, true)) {
                $res[] = $hierarchyElem['geonameId'];
            }
        }

        return $res;
    }

    /**
     * Get geonames hierarchy data for a document.
     *
     * Aggregates, without duplicates, the hierarchy ids of every geonames
     * location attached to the document.
     *
     * @param mixed $doc document exposing getGeoInfo()
     * @return array list of geonames ids (sequentially indexed)
     */
    private function getGeonamesHierarchy($doc) {
        $geoRes = $doc->getGeoInfo();
        if(is_null($geoRes)) {
            return [];
        }

        // aggregate hierarchy list from geonames results
        $res = [];
        foreach($geoRes->getGeonamesLocs() as $gurl) {
            $geonamesId = CocoonUtils::getGeonamesidFromUrl($gurl);
            if(is_null($geonamesId)) {
                continue;
            }
            $res = array_merge($res, $this->getGeonamesHierarchyArray($geonamesId));
        }

        // array_unique() preserves keys and may leave gaps; reindex so the
        // result is serialized as a JSON array and not as a JSON object.
        return array_values(array_unique($res));
    }
       
   154 
    /**
     * Index one document into Elasticsearch.
     *
     * The full document is reloaded from the document repository using the
     * id of the given result, then sent to the configured index with its
     * title, modification date and geonames hierarchy ids.
     *
     * NOTE(review): the index mapping declares the field as
     * 'geonames_hyerarchy' while this method writes 'geonames_hierarchy';
     * the two names should be reconciled.
     *
     * @param mixed $resultDoc object exposing getId() for the document to index
     * @return void no value is returned; NOTE(review): an earlier version of
     *              this comment announced an int status (1 success, 0 error)
     */
    private function indexOne($resultDoc)
    {
        $doc = $this->documentRepository->get($resultDoc->getId());

        $indexName = config('elasticsearch.index');
        $documentId = (string)$doc->getId();
        $body = [
            'title' => (string)$doc->getTitle(),
            'date' => (string)$doc->getModified(),
            'geonames_hierarchy' => $this->getGeonamesHierarchy($doc)
        ];

        Es::index([
            'index' => $indexName,
            'type' => 'document',
            'id' => $documentId,
            'body' => $body
        ]);
    }
   110 
   175 
   117      {
   182      {
   118           $query_data = ['body' => []];
   183           $query_data = ['body' => []];
   119           foreach($docs as $doc){
   184           foreach($docs as $doc){
   120               $query_data['body'][] = [
   185               $query_data['body'][] = [
   121                   'index' => [
   186                   'index' => [
   122                       '_index' => conf('elasticsearch.index'),
   187                       '_index' => config('elasticsearch.index'),
   123                       '_type' => 'document',
   188                       '_type' => 'document',
   124                       '_id' => (string)$doc->getId()
   189                       '_id' => (string)$doc->getId()
   125                   ]
   190                   ]
   126               ];
   191               ];
   127               $query_data['body'][] = [
   192               $query_data['body'][] = [