web/lib/Zend/Search/Lucene/Index/SegmentMerger.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Index
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: SegmentMerger.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 /** Zend_Search_Lucene_Index_SegmentInfo */
       
    24 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';
       
    25 
       
    26 
       
    27 /**
       
    28  * @category   Zend
       
    29  * @package    Zend_Search_Lucene
       
    30  * @subpackage Index
       
    31  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    32  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    33  */
       
    34 class Zend_Search_Lucene_Index_SegmentMerger
       
    35 {
       
    36     /**
       
    37      * Target segment writer
       
    38      *
       
    39      * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
       
    40      */
       
    41     private $_writer;
       
    42 
       
    43     /**
       
    44      * Number of docs in a new segment
       
    45      *
       
    46      * @var integer
       
    47      */
       
    48     private $_docCount;
       
    49 
       
    50     /**
       
    51      * A set of segments to be merged
       
    52      *
       
    53      * @var array Zend_Search_Lucene_Index_SegmentInfo
       
    54      */
       
    55     private $_segmentInfos = array();
       
    56 
       
    57     /**
       
    58      * Flag to signal, that merge is already done
       
    59      *
       
    60      * @var boolean
       
    61      */
       
    62     private $_mergeDone = false;
       
    63 
       
    64     /**
       
    65      * Field map
       
    66      * [<segment_name>][<field_number>] => <target_field_number>
       
    67      *
       
    68      * @var array
       
    69      */
       
    70     private $_fieldsMap = array();
       
    71 
       
    72 
       
    73 
       
    74     /**
       
    75      * Object constructor.
       
    76      *
       
    77      * Creates new segment merger with $directory as target to merge segments into
       
    78      * and $name as a name of new segment
       
    79      *
       
    80      * @param Zend_Search_Lucene_Storage_Directory $directory
       
    81      * @param string $name
       
    82      */
       
    83     public function __construct($directory, $name)
       
    84     {
       
    85         /** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
       
    86         require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
       
    87         $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
       
    88     }
       
    89 
       
    90 
       
    91     /**
       
    92      * Add segmnet to a collection of segments to be merged
       
    93      *
       
    94      * @param Zend_Search_Lucene_Index_SegmentInfo $segment
       
    95      */
       
    96     public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
       
    97     {
       
    98         $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
       
    99     }
       
   100 
       
   101 
       
   102     /**
       
   103      * Do merge.
       
   104      *
       
   105      * Returns number of documents in newly created segment
       
   106      *
       
   107      * @return Zend_Search_Lucene_Index_SegmentInfo
       
   108      * @throws Zend_Search_Lucene_Exception
       
   109      */
       
   110     public function merge()
       
   111     {
       
   112         if ($this->_mergeDone) {
       
   113             require_once 'Zend/Search/Lucene/Exception.php';
       
   114             throw new Zend_Search_Lucene_Exception('Merge is already done.');
       
   115         }
       
   116 
       
   117         if (count($this->_segmentInfos) < 1) {
       
   118             require_once 'Zend/Search/Lucene/Exception.php';
       
   119             throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
       
   120                                                  . count($this->_segmentInfos)
       
   121                                                  . ').');
       
   122         }
       
   123 
       
   124         $this->_mergeFields();
       
   125         $this->_mergeNorms();
       
   126         $this->_mergeStoredFields();
       
   127         $this->_mergeTerms();
       
   128 
       
   129         $this->_mergeDone = true;
       
   130 
       
   131         return $this->_writer->close();
       
   132     }
       
   133 
       
   134 
       
   135     /**
       
   136      * Merge fields information
       
   137      */
       
   138     private function _mergeFields()
       
   139     {
       
   140         foreach ($this->_segmentInfos as $segName => $segmentInfo) {
       
   141             foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
       
   142                 $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
       
   143             }
       
   144         }
       
   145     }
       
   146 
       
   147     /**
       
   148      * Merge field's normalization factors
       
   149      */
       
   150     private function _mergeNorms()
       
   151     {
       
   152         foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
       
   153             if ($fieldInfo->isIndexed) {
       
   154                 foreach ($this->_segmentInfos as $segName => $segmentInfo) {
       
   155                     if ($segmentInfo->hasDeletions()) {
       
   156                         $srcNorm = $segmentInfo->normVector($fieldInfo->name);
       
   157                         $norm    = '';
       
   158                         $docs    = $segmentInfo->count();
       
   159                         for ($count = 0; $count < $docs; $count++) {
       
   160                             if (!$segmentInfo->isDeleted($count)) {
       
   161                                 $norm .= $srcNorm[$count];
       
   162                             }
       
   163                         }
       
   164                         $this->_writer->addNorm($fieldInfo->name, $norm);
       
   165                     } else {
       
   166                         $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
       
   167                     }
       
   168                 }
       
   169             }
       
   170         }
       
   171     }
       
   172 
       
   173     /**
       
   174      * Merge fields information
       
   175      */
       
   176     private function _mergeStoredFields()
       
   177     {
       
   178         $this->_docCount = 0;
       
   179 
       
   180         foreach ($this->_segmentInfos as $segName => $segmentInfo) {
       
   181             $fdtFile = $segmentInfo->openCompoundFile('.fdt');
       
   182 
       
   183             for ($count = 0; $count < $segmentInfo->count(); $count++) {
       
   184                 $fieldCount = $fdtFile->readVInt();
       
   185                 $storedFields = array();
       
   186 
       
   187                 for ($count2 = 0; $count2 < $fieldCount; $count2++) {
       
   188                     $fieldNum = $fdtFile->readVInt();
       
   189                     $bits = $fdtFile->readByte();
       
   190                     $fieldInfo = $segmentInfo->getField($fieldNum);
       
   191 
       
   192                     if (!($bits & 2)) { // Text data
       
   193                         $storedFields[] =
       
   194                                  new Zend_Search_Lucene_Field($fieldInfo->name,
       
   195                                                               $fdtFile->readString(),
       
   196                                                               'UTF-8',
       
   197                                                               true,
       
   198                                                               $fieldInfo->isIndexed,
       
   199                                                               $bits & 1 );
       
   200                     } else {            // Binary data
       
   201                         $storedFields[] =
       
   202                                  new Zend_Search_Lucene_Field($fieldInfo->name,
       
   203                                                               $fdtFile->readBinary(),
       
   204                                                               '',
       
   205                                                               true,
       
   206                                                               $fieldInfo->isIndexed,
       
   207                                                               $bits & 1,
       
   208                                                               true);
       
   209                     }
       
   210                 }
       
   211 
       
   212                 if (!$segmentInfo->isDeleted($count)) {
       
   213                     $this->_docCount++;
       
   214                     $this->_writer->addStoredFields($storedFields);
       
   215                 }
       
   216             }
       
   217         }
       
   218     }
       
   219 
       
   220 
       
   221     /**
       
   222      * Merge fields information
       
   223      */
       
   224     private function _mergeTerms()
       
   225     {
       
   226         /** Zend_Search_Lucene_Index_TermsPriorityQueue */
       
   227         require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';
       
   228 
       
   229         $segmentInfoQueue = new Zend_Search_Lucene_Index_TermsPriorityQueue();
       
   230 
       
   231         $segmentStartId = 0;
       
   232         foreach ($this->_segmentInfos as $segName => $segmentInfo) {
       
   233             $segmentStartId = $segmentInfo->resetTermsStream($segmentStartId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);
       
   234 
       
   235             // Skip "empty" segments
       
   236             if ($segmentInfo->currentTerm() !== null) {
       
   237                 $segmentInfoQueue->put($segmentInfo);
       
   238             }
       
   239         }
       
   240 
       
   241         $this->_writer->initializeDictionaryFiles();
       
   242 
       
   243         $termDocs = array();
       
   244         while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
       
   245             // Merge positions array
       
   246             $termDocs += $segmentInfo->currentTermPositions();
       
   247 
       
   248             if ($segmentInfoQueue->top() === null ||
       
   249                 $segmentInfoQueue->top()->currentTerm()->key() !=
       
   250                             $segmentInfo->currentTerm()->key()) {
       
   251                 // We got new term
       
   252                 ksort($termDocs, SORT_NUMERIC);
       
   253 
       
   254                 // Add term if it's contained in any document
       
   255                 if (count($termDocs) > 0) {
       
   256                     $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
       
   257                 }
       
   258                 $termDocs = array();
       
   259             }
       
   260 
       
   261             $segmentInfo->nextTerm();
       
   262             // check, if segment dictionary is finished
       
   263             if ($segmentInfo->currentTerm() !== null) {
       
   264                 // Put segment back into the priority queue
       
   265                 $segmentInfoQueue->put($segmentInfo);
       
   266             }
       
   267         }
       
   268 
       
   269         $this->_writer->closeDictionaryFiles();
       
   270     }
       
   271 }