Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
author durandn
Tue, 15 Dec 2015 10:40:00 +0100
changeset 24 de47e8f66e8b
parent 23 037687868bc4
child 25 4ce76c9e7729
Added command "corpus-parole:indexDocuments" to index documents into ElasticSearch
server/src/.env.example
server/src/app/Console/Commands/IndexDocuments.php
server/src/app/Console/Kernel.php
server/src/composer.json
server/src/composer.lock
server/src/config/app.php
server/src/config/elasticsearch.php
--- a/server/src/.env.example	Thu Dec 10 16:05:53 2015 +0100
+++ b/server/src/.env.example	Tue Dec 15 10:40:00 2015 +0100
@@ -25,3 +25,9 @@
 CORPUSPAROLE_SESAME_UPDATE_URL=
 
 EASYRDF_HTTP_CLIENT_TIMEOUT=20
+
+ELASTICSEARCH_URL=
+ELASTICSEARCH_LOG_PATH='/logs/elasticsearch.log'
+ELASTICSEARCH_INDEX='corpus'
+ELASTICSEARCH_SHARDS=5
+ELASTICSEARCH_REPLICAS=1
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/app/Console/Commands/IndexDocuments.php	Tue Dec 15 10:40:00 2015 +0100
@@ -0,0 +1,214 @@
+<?php
+
+namespace CorpusParole\Console\Commands;
+
+use Illuminate\Console\Command;
+use CorpusParole\Repositories\DocumentRepository;
+use Es;
+
+class IndexDocuments extends Command
+{
+
+    /**
+     * The name and signature of the console command.
+     *
+     * @var string
+     */
+    protected $signature = 'corpus-parole:indexDocuments
+                          {--limit=0 : index only the first n documents, 0 (default) means index everything }
+                          {--no-bulk : index documents one by one instead of using ElasticSearch bulk indexing }
+                          {--step-size=100 : number of documents to retrieve from repository at a time before indexing}';
+
+    /**
+     * The console command description.
+     *
+     * @var string
+     */
+    protected $description = 'Index documents into ElasticSearch.';
+
+    /**
+     * Repository the documents to index are read from.
+     *
+     * @var DocumentRepository
+     */
+    protected $documentRepository;
+
+    /**
+     * Create a new command instance.
+     *
+     * @param DocumentRepository $documentRepository source of the documents to index
+     */
+    public function __construct(DocumentRepository $documentRepository)
+    {
+        $this->documentRepository = $documentRepository;
+        parent::__construct();
+    }
+
+    /**
+     * Drop the Elasticsearch index if it exists, then recreate it with the
+     * settings and the "document" mapping used by this command.
+     *
+     * @return bool true if every request was acknowledged, false otherwise
+     */
+    private function resetIndex()
+    {
+        $indexParams = [
+            'index' => env('ELASTICSEARCH_INDEX')
+        ];
+        if (Es::indices()->exists($indexParams)) {
+            $response = Es::indices()->delete($indexParams);
+            if (empty($response['acknowledged'])) {
+                return false;
+            }
+        }
+        $indexParams['body'] = [
+            'settings' => [
+                'number_of_shards' => env('ELASTICSEARCH_SHARDS'),
+                'number_of_replicas' => env('ELASTICSEARCH_REPLICAS'),
+                'index.mapping.ignore_malformed' => true
+            ],
+            'mappings' => [
+                'document' => [
+                    'properties' => [
+                        'title' => [
+                            'type' => 'string',
+                            'store' => true,
+                            'fields' => [
+                                'raw' => [
+                                    'type' => 'string',
+                                    'index' => 'not_analyzed'
+                                ]
+                            ]
+                        ],
+                        'date' => [
+                            'type' => 'date',
+                            'store' => true
+                        ]
+                    ]
+                ]
+            ]
+        ];
+        $response = Es::indices()->create($indexParams);
+        return !empty($response['acknowledged']);
+    }
+
+    /**
+     * Index one document into Elasticsearch.
+     *
+     * @param mixed $doc document exposing getId(), getTitle() and getModified()
+     * @return void
+     */
+    private function indexOne($doc)
+    {
+        Es::index([
+            'index' => env('ELASTICSEARCH_INDEX'),
+            'type' => 'document',
+            'id' => (string)$doc->getId(),
+            'body' => [
+                'title' => (string)$doc->getTitle(),
+                'date' => (string)$doc->getModified()
+            ]
+        ]);
+    }
+
+    /**
+     * Index several documents into Elasticsearch with a single bulk request.
+     *
+     * @param array|\Traversable $docs documents to index
+     * @return void
+     */
+    private function indexBulk($docs)
+    {
+        $query_data = ['body' => []];
+        foreach ($docs as $doc) {
+            $query_data['body'][] = [
+                'index' => [
+                    '_index' => env('ELASTICSEARCH_INDEX'),
+                    '_type' => 'document',
+                    '_id' => (string)$doc->getId()
+                ]
+            ];
+            $query_data['body'][] = [
+                'title' => (string)$doc->getTitle(),
+                'date' => (string)$doc->getModified()
+            ];
+        }
+        // Skip the request when there is nothing to send (empty page).
+        if (!empty($query_data['body'])) {
+            Es::bulk($query_data);
+        }
+    }
+
+    /**
+     * Execute the console command: reset the index, then index the
+     * repository documents page by page, honouring --limit in both modes.
+     *
+     * @return int|null 1 if the options are invalid or the index reset failed
+     */
+    public function handle()
+    {
+        $noBulk = $this->option('no-bulk');
+        $limit = (int)$this->option('limit');
+        $stepSize = (int)$this->option('step-size');
+        if ($stepSize < 1) {
+            $this->error('Option --step-size must be a positive integer');
+            return 1;
+        }
+
+        $this->info('Options:');
+        $this->comment($noBulk ? ' - Indexing without bulk insert' : ' - Indexing using bulk insert');
+        if ($limit > 0) {
+            $this->comment(' - Indexing only the first '.$limit.' documents');
+        }
+        $this->comment(' - Indexing with step size of '.$stepSize);
+
+        $this->info('Resetting index...');
+        if (!$this->resetIndex()) {
+            // Do not index into an index that could not be recreated.
+            $this->error('Error resetting index ' . env('ELASTICSEARCH_INDEX'));
+            return 1;
+        }
+        $this->comment('Index reset!');
+
+        $this->info('Indexing documents...');
+
+        $lastPage = $this->documentRepository->paginateAll($stepSize, 'page')->lastPage();
+        $total = $this->documentRepository->getCount();
+        $remaining = PHP_INT_MAX;
+        if ($limit > 0) {
+            // Cap by what the repository holds so that a limit larger than
+            // the corpus neither truncates the last page nor skews the bar.
+            $lastPage = min((int)ceil($limit / $stepSize), $lastPage);
+            $total = min($limit, $total);
+            $remaining = $limit;
+        }
+
+        $progressBar = $this->output->createProgressBar($noBulk ? $total : $lastPage);
+        $progressBar->setFormat(' %current%/%max% [%bar%] %percent:3s%% - %message%');
+
+        for ($page = 1; $page <= $lastPage; $page++) {
+            $batch = [];
+            foreach ($this->documentRepository->paginateAll($stepSize, 'page', $page) as $doc) {
+                if (count($batch) >= $remaining) {
+                    break;
+                }
+                $batch[] = $doc;
+            }
+            $remaining -= count($batch);
+
+            if ($noBulk) {
+                foreach ($batch as $doc) {
+                    $progressBar->setMessage($doc->getId());
+                    $this->indexOne($doc);
+                    $progressBar->advance();
+                }
+            } else {
+                $progressBar->setMessage('Page '.$page);
+                $this->indexBulk($batch);
+                $progressBar->advance();
+            }
+        }
+        $progressBar->finish();
+        $this->info('Indexing completed');
+    }
+}
--- a/server/src/app/Console/Kernel.php	Thu Dec 10 16:05:53 2015 +0100
+++ b/server/src/app/Console/Kernel.php	Tue Dec 15 10:40:00 2015 +0100
@@ -15,6 +15,7 @@
     protected $commands = [
         'CorpusParole\Console\Commands\Inspire',
         'CorpusParole\Console\Commands\ImportCocoonRDF',
+        'CorpusParole\Console\Commands\IndexDocuments',
     ];
 
     /**
--- a/server/src/composer.json	Thu Dec 10 16:05:53 2015 +0100
+++ b/server/src/composer.json	Tue Dec 15 10:40:00 2015 +0100
@@ -10,7 +10,8 @@
 		"ml/json-ld": "*",
 		"caseyamcl/phpoaipmh": "~2.4",
 		"guzzlehttp/guzzle":   "~6.0",
-		"laravelcollective/html": "5.1.*"
+		"laravelcollective/html": "5.1.*",
+		"shift31/laravel-elasticsearch": "~1.0"
 	},
 	"require-dev": {
 		"phpunit/phpunit": "~5.0",
--- a/server/src/composer.lock	Thu Dec 10 16:05:53 2015 +0100
+++ b/server/src/composer.lock	Tue Dec 15 10:40:00 2015 +0100
@@ -4,8 +4,8 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
         "This file is @generated automatically"
     ],
-    "hash": "814fed9487bfb758c37b3f322650ade6",
-    "content-hash": "80322b0e38b8cc26e4ef4caa3a13894f",
+    "hash": "e7a4d2a203d50a598d3a7c5b4c3605b4",
+    "content-hash": "aa3d65578938af3883cada1736fa5d64",
     "packages": [
         {
             "name": "caseyamcl/phpoaipmh",
@@ -286,7 +286,7 @@
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/njh/easyrdf/zipball/02864f8996311ff7c6619be70fc1ecf6f3165c48",
+                "url": "https://api.github.com/repos/njh/easyrdf/zipball/1371c8af2abd3a948c50962b2212fce1912000e7",
                 "reference": "02864f8996311ff7c6619be70fc1ecf6f3165c48",
                 "shasum": ""
             },
@@ -344,6 +344,156 @@
             "time": "2015-10-07 11:58:54"
         },
         {
+            "name": "elasticsearch/elasticsearch",
+            "version": "v1.4.1",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/elastic/elasticsearch-php.git",
+                "reference": "3a5573cf3223d5646a76a5f8ce938048ca340680"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/elastic/elasticsearch-php/zipball/3a5573cf3223d5646a76a5f8ce938048ca340680",
+                "reference": "3a5573cf3223d5646a76a5f8ce938048ca340680",
+                "shasum": ""
+            },
+            "require": {
+                "ext-curl": "*",
+                "guzzle/guzzle": "~3.0",
+                "monolog/monolog": "~1.11",
+                "php": ">=5.3.9",
+                "pimple/pimple": "~3.0",
+                "psr/log": "~1.0"
+            },
+            "require-dev": {
+                "athletic/athletic": "~0.1",
+                "cpliakas/git-wrapper": "~1.0",
+                "mikey179/vfsstream": "~1.2",
+                "mockery/mockery": "0.9.4",
+                "phpunit/phpunit": "3.7.*",
+                "satooshi/php-coveralls": "dev-master",
+                "symfony/yaml": "2.4.3 as 2.4.2",
+                "twig/twig": "1.*"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "Elasticsearch\\": "src/Elasticsearch/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "Apache 2"
+            ],
+            "authors": [
+                {
+                    "name": "Zachary Tong"
+                }
+            ],
+            "description": "PHP Client for Elasticsearch",
+            "keywords": [
+                "client",
+                "elasticsearch",
+                "search"
+            ],
+            "time": "2015-09-17 22:01:44"
+        },
+        {
+            "name": "guzzle/guzzle",
+            "version": "v3.9.3",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/guzzle/guzzle3.git",
+                "reference": "0645b70d953bc1c067bbc8d5bc53194706b628d9"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/guzzle/guzzle3/zipball/0645b70d953bc1c067bbc8d5bc53194706b628d9",
+                "reference": "0645b70d953bc1c067bbc8d5bc53194706b628d9",
+                "shasum": ""
+            },
+            "require": {
+                "ext-curl": "*",
+                "php": ">=5.3.3",
+                "symfony/event-dispatcher": "~2.1"
+            },
+            "replace": {
+                "guzzle/batch": "self.version",
+                "guzzle/cache": "self.version",
+                "guzzle/common": "self.version",
+                "guzzle/http": "self.version",
+                "guzzle/inflection": "self.version",
+                "guzzle/iterator": "self.version",
+                "guzzle/log": "self.version",
+                "guzzle/parser": "self.version",
+                "guzzle/plugin": "self.version",
+                "guzzle/plugin-async": "self.version",
+                "guzzle/plugin-backoff": "self.version",
+                "guzzle/plugin-cache": "self.version",
+                "guzzle/plugin-cookie": "self.version",
+                "guzzle/plugin-curlauth": "self.version",
+                "guzzle/plugin-error-response": "self.version",
+                "guzzle/plugin-history": "self.version",
+                "guzzle/plugin-log": "self.version",
+                "guzzle/plugin-md5": "self.version",
+                "guzzle/plugin-mock": "self.version",
+                "guzzle/plugin-oauth": "self.version",
+                "guzzle/service": "self.version",
+                "guzzle/stream": "self.version"
+            },
+            "require-dev": {
+                "doctrine/cache": "~1.3",
+                "monolog/monolog": "~1.0",
+                "phpunit/phpunit": "3.7.*",
+                "psr/log": "~1.0",
+                "symfony/class-loader": "~2.1",
+                "zendframework/zend-cache": "2.*,<2.3",
+                "zendframework/zend-log": "2.*,<2.3"
+            },
+            "suggest": {
+                "guzzlehttp/guzzle": "Guzzle 5 has moved to a new package name. The package you have installed, Guzzle 3, is deprecated."
+            },
+            "type": "library",
+            "extra": {
+                "branch-alias": {
+                    "dev-master": "3.9-dev"
+                }
+            },
+            "autoload": {
+                "psr-0": {
+                    "Guzzle": "src/",
+                    "Guzzle\\Tests": "tests/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Michael Dowling",
+                    "email": "mtdowling@gmail.com",
+                    "homepage": "https://github.com/mtdowling"
+                },
+                {
+                    "name": "Guzzle Community",
+                    "homepage": "https://github.com/guzzle/guzzle/contributors"
+                }
+            ],
+            "description": "PHP HTTP client. This library is deprecated in favor of https://packagist.org/packages/guzzlehttp/guzzle",
+            "homepage": "http://guzzlephp.org/",
+            "keywords": [
+                "client",
+                "curl",
+                "framework",
+                "http",
+                "http client",
+                "rest",
+                "web service"
+            ],
+            "time": "2015-03-18 18:23:50"
+        },
+        {
             "name": "guzzlehttp/guzzle",
             "version": "6.1.0",
             "source": {
@@ -1230,6 +1380,52 @@
             "time": "2015-09-19 14:15:08"
         },
         {
+            "name": "pimple/pimple",
+            "version": "v3.0.2",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/silexphp/Pimple.git",
+                "reference": "a30f7d6e57565a2e1a316e1baf2a483f788b258a"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/silexphp/Pimple/zipball/a30f7d6e57565a2e1a316e1baf2a483f788b258a",
+                "reference": "a30f7d6e57565a2e1a316e1baf2a483f788b258a",
+                "shasum": ""
+            },
+            "require": {
+                "php": ">=5.3.0"
+            },
+            "type": "library",
+            "extra": {
+                "branch-alias": {
+                    "dev-master": "3.0.x-dev"
+                }
+            },
+            "autoload": {
+                "psr-0": {
+                    "Pimple": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Fabien Potencier",
+                    "email": "fabien@symfony.com"
+                }
+            ],
+            "description": "Pimple, a simple Dependency Injection Container",
+            "homepage": "http://pimple.sensiolabs.org",
+            "keywords": [
+                "container",
+                "dependency injection"
+            ],
+            "time": "2015-09-11 15:10:35"
+        },
+        {
             "name": "psr/http-message",
             "version": "1.0",
             "source": {
@@ -1389,6 +1585,44 @@
             "time": "2015-07-16 15:26:57"
         },
         {
+            "name": "shift31/laravel-elasticsearch",
+            "version": "1.3.4",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/shift31/laravel-elasticsearch.git",
+                "reference": "5341998fc1d87a21ee44e44482d07de297b1b515"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/shift31/laravel-elasticsearch/zipball/5341998fc1d87a21ee44e44482d07de297b1b515",
+                "reference": "5341998fc1d87a21ee44e44482d07de297b1b515",
+                "shasum": ""
+            },
+            "require": {
+                "elasticsearch/elasticsearch": "~1.3",
+                "illuminate/support": "~4|~5",
+                "php": ">=5.3.0"
+            },
+            "type": "library",
+            "autoload": {
+                "classmap": [
+                    "src/migrations"
+                ],
+                "psr-0": {
+                    "Shift31\\LaravelElasticsearch": "src/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "authors": [
+                {
+                    "name": "Shift 31 Consulting",
+                    "email": "code@shift31.com"
+                }
+            ],
+            "description": "A Laravel Service Provider for the Elasticsearch API client",
+            "time": "2015-08-10 02:30:03"
+        },
+        {
             "name": "swiftmailer/swiftmailer",
             "version": "v5.4.1",
             "source": {
@@ -2350,7 +2584,7 @@
             },
             "dist": {
                 "type": "zip",
-                "url": "https://api.github.com/repos/padraic/mockery/zipball/bb19cd92e91aee51c344d8bd1a453dc85de47c21",
+                "url": "https://api.github.com/repos/padraic/mockery/zipball/15f6bb72f66df6857b957bedd09dc4fd0ace3b03",
                 "reference": "bb19cd92e91aee51c344d8bd1a453dc85de47c21",
                 "shasum": ""
             },
--- a/server/src/config/app.php	Thu Dec 10 16:05:53 2015 +0100
+++ b/server/src/config/app.php	Tue Dec 15 10:40:00 2015 +0100
@@ -141,6 +141,7 @@
          * Additional service providers
          */
         'Collective\Html\HtmlServiceProvider',
+        'Shift31\LaravelElasticsearch\ElasticsearchServiceProvider',
 
         /*
          * Application Service Providers...
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/server/src/config/elasticsearch.php	Tue Dec 15 10:40:00 2015 +0100
@@ -0,0 +1,11 @@
+<?php
+
+use Monolog\Logger;
+
+return [
+    // Single Elasticsearch host, taken from the environment.
+    'hosts' => [env('ELASTICSEARCH_URL')],
+    // Log file path, appended to the application's storage directory.
+    'logPath' => storage_path() . env('ELASTICSEARCH_LOG_PATH'),
+    'logLevel' => Logger::INFO,
+];