web/lib/Zend/Search/Lucene/Index/SegmentInfo.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Index
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: SegmentInfo.php 22987 2010-09-21 10:39:53Z alexander $
       
    21  */
       
    22 
       
    23 /** Zend_Search_Lucene_Index_TermsStream_Interface */
       
    24 require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php';
       
    25 
       
    26 
       
    27 /** Zend_Search_Lucene_Search_Similarity */
       
    28 require_once 'Zend/Search/Lucene/Search/Similarity.php';
       
    29 
       
    30 /** Zend_Search_Lucene_Index_FieldInfo */
       
    31 require_once 'Zend/Search/Lucene/Index/FieldInfo.php';
       
    32 
       
    33 /** Zend_Search_Lucene_Index_Term */
       
    34 require_once 'Zend/Search/Lucene/Index/Term.php';
       
    35 
       
    36 /** Zend_Search_Lucene_Index_TermInfo */
       
    37 require_once 'Zend/Search/Lucene/Index/TermInfo.php';
       
    38 
       
    39 /**
       
    40  * @category   Zend
       
    41  * @package    Zend_Search_Lucene
       
    42  * @subpackage Index
       
    43  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    44  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    45  */
       
    46 class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface
       
    47 {
       
    48     /**
       
    49      * "Full scan vs fetch" boundary.
       
    50      *
       
    51      * If filter selectivity is less than this value, then full scan is performed
       
    52      * (since term entries fetching has some additional overhead).
       
    53      */
       
    54     const FULL_SCAN_VS_FETCH_BOUNDARY = 5;
       
    55 
       
    56     /**
       
    57      * Number of docs in a segment
       
    58      *
       
    59      * @var integer
       
    60      */
       
    61     private $_docCount;
       
    62 
       
    63     /**
       
    64      * Segment name
       
    65      *
       
    66      * @var string
       
    67      */
       
    68     private $_name;
       
    69 
       
    70     /**
       
    71      * Term Dictionary Index
       
    72      *
       
    73      * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because
       
    74      * of performance considerations)
       
    75      * [0] -> $termValue
       
    76      * [1] -> $termFieldNum
       
    77      *
       
    78      * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos
       
    79      *
       
    80      * @var array
       
    81      */
       
    82     private $_termDictionary;
       
    83 
       
    84     /**
       
    85      * Term Dictionary Index TermInfos
       
    86      *
       
    87      * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because
       
    88      * of performance considerations)
       
    89      * [0] -> $docFreq
       
    90      * [1] -> $freqPointer
       
    91      * [2] -> $proxPointer
       
    92      * [3] -> $skipOffset
       
    93      * [4] -> $indexPointer
       
    94      *
       
    95      * @var array
       
    96      */
       
    97     private $_termDictionaryInfos;
       
    98 
       
    99     /**
       
   100      * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment
       
   101      *
       
   102      * @var array
       
   103      */
       
   104     private $_fields;
       
   105 
       
   106     /**
       
   107      * Field positions in a dictionary.
       
    108      * (Term dictionary contains fields ordered by names)
       
   109      *
       
   110      * @var array
       
   111      */
       
   112     private $_fieldsDicPositions;
       
   113 
       
   114 
       
   115     /**
       
   116      * Associative array where the key is the file name and the value is data offset
       
    117      * in a compound segment file (.cfs).
       
   118      *
       
   119      * @var array
       
   120      */
       
   121     private $_segFiles;
       
   122 
       
   123     /**
       
    124      * Associative array where the key is the file name and the value is file size (.cfs).
       
   125      *
       
   126      * @var array
       
   127      */
       
   128     private $_segFileSizes;
       
   129 
       
   130     /**
       
   131      * Delete file generation number
       
   132      *
       
   133      * -2 means autodetect latest delete generation
       
   134      * -1 means 'there is no delete file'
       
   135      *  0 means pre-2.1 format delete file
       
   136      *  X specifies used delete file
       
   137      *
       
   138      * @var integer
       
   139      */
       
   140     private $_delGen;
       
   141 
       
   142     /**
       
   143      * Segment has single norms file
       
   144      *
       
   145      * If true then one .nrm file is used for all fields
       
   146      * Otherwise .fN files are used
       
   147      *
       
   148      * @var boolean
       
   149      */
       
   150     private $_hasSingleNormFile;
       
   151 
       
   152     /**
       
   153      * Use compound segment file (*.cfs) to collect all other segment files
       
   154      * (excluding .del files)
       
   155      *
       
   156      * @var boolean
       
   157      */
       
   158     private $_isCompound;
       
   159 
       
   160 
       
   161     /**
       
   162      * File system adapter.
       
   163      *
       
   164      * @var Zend_Search_Lucene_Storage_Directory_Filesystem
       
   165      */
       
   166     private $_directory;
       
   167 
       
   168     /**
       
   169      * Normalization factors.
       
   170      * An array fieldName => normVector
       
   171      * normVector is a binary string.
       
   172      * Each byte corresponds to an indexed document in a segment and
       
   173      * encodes normalization factor (float value, encoded by
       
   174      * Zend_Search_Lucene_Search_Similarity::encodeNorm())
       
   175      *
       
   176      * @var array
       
   177      */
       
   178     private $_norms = array();
       
   179 
       
   180     /**
       
   181      * List of deleted documents.
       
   182      * bitset if bitset extension is loaded or array otherwise.
       
   183      *
       
   184      * @var mixed
       
   185      */
       
   186     private $_deleted = null;
       
   187 
       
   188     /**
       
   189      * $this->_deleted update flag
       
   190      *
       
   191      * @var boolean
       
   192      */
       
   193     private $_deletedDirty = false;
       
   194 
       
   195     /**
       
   196      * True if segment uses shared doc store
       
   197      *
       
   198      * @var boolean
       
   199      */
       
   200     private $_usesSharedDocStore;
       
   201 
       
   202     /*
       
   203      * Shared doc store options.
       
    204      * It's an associative array with the following items:
       
   205      * - 'offset'     => $docStoreOffset           The starting document in the shared doc store files where this segment's documents begin
       
   206      * - 'segment'    => $docStoreSegment          The name of the segment that has the shared doc store files.
       
   207      * - 'isCompound' => $docStoreIsCompoundFile   True, if compound file format is used for the shared doc store files (.cfx file).
       
   208      */
       
   209     private $_sharedDocStoreOptions;
       
   210 
       
   211 
       
   212     /**
       
   213      * Zend_Search_Lucene_Index_SegmentInfo constructor
       
   214      *
       
   215      * @param Zend_Search_Lucene_Storage_Directory $directory
       
   216      * @param string     $name
       
   217      * @param integer    $docCount
       
   218      * @param integer    $delGen
       
   219      * @param array|null $docStoreOptions
       
   220      * @param boolean    $hasSingleNormFile
       
   221      * @param boolean    $isCompound
       
   222      */
       
   223     public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
       
   224     {
       
   225         $this->_directory = $directory;
       
   226         $this->_name      = $name;
       
   227         $this->_docCount  = $docCount;
       
   228 
       
   229         if ($docStoreOptions !== null) {
       
   230             $this->_usesSharedDocStore    = true;
       
   231             $this->_sharedDocStoreOptions = $docStoreOptions;
       
   232 
       
   233             if ($docStoreOptions['isCompound']) {
       
   234                 $cfxFile       = $this->_directory->getFileObject($docStoreOptions['segment'] . '.cfx');
       
   235                 $cfxFilesCount = $cfxFile->readVInt();
       
   236 
       
   237                 $cfxFiles     = array();
       
   238                 $cfxFileSizes = array();
       
   239 
       
   240                 for ($count = 0; $count < $cfxFilesCount; $count++) {
       
   241                     $dataOffset = $cfxFile->readLong();
       
   242                     if ($count != 0) {
       
   243                         $cfxFileSizes[$fileName] = $dataOffset - end($cfxFiles);
       
   244                     }
       
   245                     $fileName            = $cfxFile->readString();
       
   246                     $cfxFiles[$fileName] = $dataOffset;
       
   247                 }
       
   248                 if ($count != 0) {
       
   249                     $cfxFileSizes[$fileName] = $this->_directory->fileLength($docStoreOptions['segment'] . '.cfx') - $dataOffset;
       
   250                 }
       
   251 
       
   252                 $this->_sharedDocStoreOptions['files']     = $cfxFiles;
       
   253                 $this->_sharedDocStoreOptions['fileSizes'] = $cfxFileSizes;
       
   254             }
       
   255         }
       
   256 
       
   257         $this->_hasSingleNormFile = $hasSingleNormFile;
       
   258         $this->_delGen            = $delGen;
       
   259         $this->_termDictionary    = null;
       
   260 
       
   261 
       
   262         if ($isCompound !== null) {
       
   263             $this->_isCompound    = $isCompound;
       
   264         } else {
       
   265             // It's a pre-2.1 segment or isCompound is set to 'unknown'
       
   266             // Detect if segment uses compound file
       
   267             require_once 'Zend/Search/Lucene/Exception.php';
       
   268             try {
       
   269                 // Try to open compound file
       
   270                 $this->_directory->getFileObject($name . '.cfs');
       
   271 
       
   272                 // Compound file is found
       
   273                 $this->_isCompound = true;
       
   274             } catch (Zend_Search_Lucene_Exception $e) {
       
   275                 if (strpos($e->getMessage(), 'is not readable') !== false) {
       
   276                     // Compound file is not found or is not readable
       
   277                     $this->_isCompound = false;
       
   278                 } else {
       
   279                     throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
       
   280                 }
       
   281             }
       
   282         }
       
   283 
       
   284         $this->_segFiles = array();
       
   285         if ($this->_isCompound) {
       
   286             $cfsFile = $this->_directory->getFileObject($name . '.cfs');
       
   287             $segFilesCount = $cfsFile->readVInt();
       
   288 
       
   289             for ($count = 0; $count < $segFilesCount; $count++) {
       
   290                 $dataOffset = $cfsFile->readLong();
       
   291                 if ($count != 0) {
       
   292                     $this->_segFileSizes[$fileName] = $dataOffset - end($this->_segFiles);
       
   293                 }
       
   294                 $fileName = $cfsFile->readString();
       
   295                 $this->_segFiles[$fileName] = $dataOffset;
       
   296             }
       
   297             if ($count != 0) {
       
   298                 $this->_segFileSizes[$fileName] = $this->_directory->fileLength($name . '.cfs') - $dataOffset;
       
   299             }
       
   300         }
       
   301 
       
   302         $fnmFile = $this->openCompoundFile('.fnm');
       
   303         $fieldsCount = $fnmFile->readVInt();
       
   304         $fieldNames = array();
       
   305         $fieldNums  = array();
       
   306         $this->_fields = array();
       
   307 
       
   308         for ($count=0; $count < $fieldsCount; $count++) {
       
   309             $fieldName = $fnmFile->readString();
       
   310             $fieldBits = $fnmFile->readByte();
       
   311             $this->_fields[$count] = new Zend_Search_Lucene_Index_FieldInfo($fieldName,
       
   312                                                                             $fieldBits & 0x01 /* field is indexed */,
       
   313                                                                             $count,
       
   314                                                                             $fieldBits & 0x02 /* termvectors are stored */,
       
   315                                                                             $fieldBits & 0x10 /* norms are omitted */,
       
   316                                                                             $fieldBits & 0x20 /* payloads are stored */);
       
   317             if ($fieldBits & 0x10) {
       
   318                 // norms are omitted for the indexed field
       
   319                 $this->_norms[$count] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
       
   320             }
       
   321 
       
   322             $fieldNums[$count]  = $count;
       
   323             $fieldNames[$count] = $fieldName;
       
   324         }
       
   325         array_multisort($fieldNames, SORT_ASC, SORT_REGULAR, $fieldNums);
       
   326         $this->_fieldsDicPositions = array_flip($fieldNums);
       
   327 
       
   328         if ($this->_delGen == -2) {
       
   329             // SegmentInfo constructor is invoked from index writer
       
   330             // Autodetect current delete file generation number
       
   331             $this->_delGen = $this->_detectLatestDelGen();
       
   332         }
       
   333 
       
   334         // Load deletions
       
   335         $this->_deleted = $this->_loadDelFile();
       
   336     }
       
   337 
       
   338     /**
       
   339      * Load detetions file
       
   340      *
       
   341      * Returns bitset or an array depending on bitset extension availability
       
   342      *
       
   343      * @return mixed
       
   344      * @throws Zend_Search_Lucene_Exception
       
   345      */
       
   346     private function _loadDelFile()
       
   347     {
       
   348         if ($this->_delGen == -1) {
       
   349             // There is no delete file for this segment
       
   350             return null;
       
   351         } else if ($this->_delGen == 0) {
       
   352             // It's a segment with pre-2.1 format delete file
       
   353             // Try to load deletions file
       
   354             return $this->_loadPre21DelFile();
       
   355         } else {
       
   356             // It's 2.1+ format deleteions file
       
   357             return $this->_load21DelFile();
       
   358         }
       
   359     }
       
   360 
       
   361     /**
       
   362      * Load pre-2.1 detetions file
       
   363      *
       
   364      * Returns bitset or an array depending on bitset extension availability
       
   365      *
       
   366      * @return mixed
       
   367      * @throws Zend_Search_Lucene_Exception
       
   368      */
       
   369     private function _loadPre21DelFile()
       
   370     {
       
   371         require_once 'Zend/Search/Lucene/Exception.php';
       
   372         try {
       
   373             // '.del' files always stored in a separate file
       
   374             // Segment compound is not used
       
   375             $delFile = $this->_directory->getFileObject($this->_name . '.del');
       
   376 
       
   377             $byteCount = $delFile->readInt();
       
   378             $byteCount = ceil($byteCount/8);
       
   379             $bitCount  = $delFile->readInt();
       
   380 
       
   381             if ($bitCount == 0) {
       
   382                 $delBytes = '';
       
   383             } else {
       
   384                 $delBytes = $delFile->readBytes($byteCount);
       
   385             }
       
   386 
       
   387             if (extension_loaded('bitset')) {
       
   388                 return $delBytes;
       
   389             } else {
       
   390                 $deletions = array();
       
   391                 for ($count = 0; $count < $byteCount; $count++) {
       
   392                     $byte = ord($delBytes[$count]);
       
   393                     for ($bit = 0; $bit < 8; $bit++) {
       
   394                         if ($byte & (1<<$bit)) {
       
   395                             $deletions[$count*8 + $bit] = 1;
       
   396                         }
       
   397                     }
       
   398                 }
       
   399 
       
   400                 return $deletions;
       
   401             }
       
   402         } catch(Zend_Search_Lucene_Exception $e) {
       
   403             if (strpos($e->getMessage(), 'is not readable') === false) {
       
   404                 throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
       
   405             }
       
   406             // There is no deletion file
       
   407             $this->_delGen = -1;
       
   408 
       
   409             return null;
       
   410         }
       
   411     }
       
   412 
       
   413     /**
       
   414      * Load 2.1+ format detetions file
       
   415      *
       
   416      * Returns bitset or an array depending on bitset extension availability
       
   417      *
       
   418      * @return mixed
       
   419      */
       
   420     private function _load21DelFile()
       
   421     {
       
   422         $delFile = $this->_directory->getFileObject($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
       
   423 
       
   424         $format = $delFile->readInt();
       
   425 
       
   426         if ($format == (int)0xFFFFFFFF) {
       
   427             if (extension_loaded('bitset')) {
       
   428                 $deletions = bitset_empty();
       
   429             } else {
       
   430                 $deletions = array();
       
   431             }
       
   432 
       
   433             $byteCount = $delFile->readInt();
       
   434             $bitCount  = $delFile->readInt();
       
   435 
       
   436             $delFileSize = $this->_directory->fileLength($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
       
   437             $byteNum = 0;
       
   438 
       
   439             do {
       
   440                 $dgap = $delFile->readVInt();
       
   441                 $nonZeroByte = $delFile->readByte();
       
   442 
       
   443                 $byteNum += $dgap;
       
   444 
       
   445 
       
   446                 if (extension_loaded('bitset')) {
       
   447                     for ($bit = 0; $bit < 8; $bit++) {
       
   448                         if ($nonZeroByte & (1<<$bit)) {
       
   449                             bitset_incl($deletions, $byteNum*8 + $bit);
       
   450                         }
       
   451                     }
       
   452                     return $deletions;
       
   453                 } else {
       
   454                     for ($bit = 0; $bit < 8; $bit++) {
       
   455                         if ($nonZeroByte & (1<<$bit)) {
       
   456                             $deletions[$byteNum*8 + $bit] = 1;
       
   457                         }
       
   458                     }
       
   459                     return (count($deletions) > 0) ? $deletions : null;
       
   460                 }
       
   461 
       
   462             } while ($delFile->tell() < $delFileSize);
       
   463         } else {
       
   464             // $format is actually byte count
       
   465             $byteCount = ceil($format/8);
       
   466             $bitCount  = $delFile->readInt();
       
   467 
       
   468             if ($bitCount == 0) {
       
   469                 $delBytes = '';
       
   470             } else {
       
   471                 $delBytes = $delFile->readBytes($byteCount);
       
   472             }
       
   473 
       
   474             if (extension_loaded('bitset')) {
       
   475                 return $delBytes;
       
   476             } else {
       
   477                 $deletions = array();
       
   478                 for ($count = 0; $count < $byteCount; $count++) {
       
   479                     $byte = ord($delBytes[$count]);
       
   480                     for ($bit = 0; $bit < 8; $bit++) {
       
   481                         if ($byte & (1<<$bit)) {
       
   482                             $deletions[$count*8 + $bit] = 1;
       
   483                         }
       
   484                     }
       
   485                 }
       
   486 
       
   487                 return (count($deletions) > 0) ? $deletions : null;
       
   488             }
       
   489         }
       
   490     }
       
   491 
       
   492     /**
       
   493      * Opens index file stoted within compound index file
       
   494      *
       
   495      * @param string $extension
       
   496      * @param boolean $shareHandler
       
   497      * @throws Zend_Search_Lucene_Exception
       
   498      * @return Zend_Search_Lucene_Storage_File
       
   499      */
       
   500     public function openCompoundFile($extension, $shareHandler = true)
       
   501     {
       
   502         if (($extension == '.fdx'  || $extension == '.fdt')  &&  $this->_usesSharedDocStore) {
       
   503             $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
       
   504             $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';
       
   505 
       
   506             if (!$this->_sharedDocStoreOptions['isCompound']) {
       
   507                 $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
       
   508                 $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);
       
   509 
       
   510                 if ($extension == '.fdx') {
       
   511                     // '.fdx' file is requested
       
   512                     return $fdxFile;
       
   513                 } else {
       
   514                     // '.fdt' file is requested
       
   515                     $fdtStartOffset = $fdxFile->readLong();
       
   516 
       
   517                     $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
       
   518                     $fdtFile->seek($fdtStartOffset, SEEK_CUR);
       
   519 
       
   520                     return $fdtFile;
       
   521                 }
       
   522             }
       
   523 
       
   524             if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) {
       
   525                 require_once 'Zend/Search/Lucene/Exception.php';
       
   526                 throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
       
   527                                        . $fdxFName . ' file.' );
       
   528             }
       
   529             if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) {
       
   530                 require_once 'Zend/Search/Lucene/Exception.php';
       
   531                 throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
       
   532                                        . $fdtFName . ' file.' );
       
   533             }
       
   534 
       
   535             // Open shared docstore segment file
       
   536             $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
       
   537             // Seek to the start of '.fdx' file within compound file
       
   538             $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
       
   539             // Seek to the start of current segment documents section
       
   540             $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);
       
   541 
       
   542             if ($extension == '.fdx') {
       
   543                 // '.fdx' file is requested
       
   544                 return $cfxFile;
       
   545             } else {
       
   546                 // '.fdt' file is requested
       
   547                 $fdtStartOffset = $cfxFile->readLong();
       
   548 
       
   549                 // Seek to the start of '.fdt' file within compound file
       
   550                 $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
       
   551                 // Seek to the start of current segment documents section
       
   552                 $cfxFile->seek($fdtStartOffset, SEEK_CUR);
       
   553 
       
   554                 return $fdtFile;
       
   555             }
       
   556         }
       
   557 
       
   558         $filename = $this->_name . $extension;
       
   559 
       
   560         if (!$this->_isCompound) {
       
   561             return $this->_directory->getFileObject($filename, $shareHandler);
       
   562         }
       
   563 
       
   564         if( !isset($this->_segFiles[$filename]) ) {
       
   565             require_once 'Zend/Search/Lucene/Exception.php';
       
   566             throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
       
   567                                        . $filename . ' file.' );
       
   568         }
       
   569 
       
   570         $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
       
   571         $file->seek($this->_segFiles[$filename]);
       
   572         return $file;
       
   573     }
       
   574 
       
   575     /**
       
   576      * Get compound file length
       
   577      *
       
   578      * @param string $extension
       
   579      * @return integer
       
   580      */
       
   581     public function compoundFileLength($extension)
       
   582     {
       
   583         if (($extension == '.fdx'  || $extension == '.fdt')  &&  $this->_usesSharedDocStore) {
       
   584             $filename = $this->_sharedDocStoreOptions['segment'] . $extension;
       
   585 
       
   586             if (!$this->_sharedDocStoreOptions['isCompound']) {
       
   587                 return $this->_directory->fileLength($filename);
       
   588             }
       
   589 
       
   590             if( !isset($this->_sharedDocStoreOptions['fileSizes'][$filename]) ) {
       
   591                 require_once 'Zend/Search/Lucene/Exception.php';
       
   592                 throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
       
   593                                            . $filename . ' file.' );
       
   594             }
       
   595 
       
   596             return $this->_sharedDocStoreOptions['fileSizes'][$filename];
       
   597         }
       
   598 
       
   599 
       
   600         $filename = $this->_name . $extension;
       
   601 
       
   602         // Try to get common file first
       
   603         if ($this->_directory->fileExists($filename)) {
       
   604             return $this->_directory->fileLength($filename);
       
   605         }
       
   606 
       
   607         if( !isset($this->_segFileSizes[$filename]) ) {
       
   608             require_once 'Zend/Search/Lucene/Exception.php';
       
   609             throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
       
   610                                        . $filename . ' file.' );
       
   611         }
       
   612 
       
   613         return $this->_segFileSizes[$filename];
       
   614     }
       
   615 
       
   616     /**
       
   617      * Returns field index or -1 if field is not found
       
   618      *
       
   619      * @param string $fieldName
       
   620      * @return integer
       
   621      */
       
   622     public function getFieldNum($fieldName)
       
   623     {
       
   624         foreach( $this->_fields as $field ) {
       
   625             if( $field->name == $fieldName ) {
       
   626                 return $field->number;
       
   627             }
       
   628         }
       
   629 
       
   630         return -1;
       
   631     }
       
   632 
       
   633     /**
       
   634      * Returns field info for specified field
       
   635      *
       
   636      * @param integer $fieldNum
       
   637      * @return Zend_Search_Lucene_Index_FieldInfo
       
   638      */
       
   639     public function getField($fieldNum)
       
   640     {
       
   641         return $this->_fields[$fieldNum];
       
   642     }
       
   643 
       
   644     /**
       
   645      * Returns array of fields.
       
   646      * if $indexed parameter is true, then returns only indexed fields.
       
   647      *
       
   648      * @param boolean $indexed
       
   649      * @return array
       
   650      */
       
   651     public function getFields($indexed = false)
       
   652     {
       
   653         $result = array();
       
   654         foreach( $this->_fields as $field ) {
       
   655             if( (!$indexed) || $field->isIndexed ) {
       
   656                 $result[ $field->name ] = $field->name;
       
   657             }
       
   658         }
       
   659         return $result;
       
   660     }
       
   661 
       
   662     /**
       
   663      * Returns array of FieldInfo objects.
       
   664      *
       
   665      * @return array
       
   666      */
       
   667     public function getFieldInfos()
       
   668     {
       
   669         return $this->_fields;
       
   670     }
       
   671 
       
   672     /**
       
   673      * Returns actual deletions file generation number.
       
   674      *
       
   675      * @return integer
       
   676      */
       
   677     public function getDelGen()
       
   678     {
       
   679         return $this->_delGen;
       
   680     }
       
   681 
       
   682     /**
       
   683      * Returns the total number of documents in this segment (including deleted documents).
       
   684      *
       
   685      * @return integer
       
   686      */
       
   687     public function count()
       
   688     {
       
   689         return $this->_docCount;
       
   690     }
       
   691 
       
   692     /**
       
   693      * Returns number of deleted documents.
       
   694      *
       
   695      * @return integer
       
   696      */
       
   697     private function _deletedCount()
       
   698     {
       
   699         if ($this->_deleted === null) {
       
   700             return 0;
       
   701         }
       
   702 
       
   703         if (extension_loaded('bitset')) {
       
   704             return count(bitset_to_array($this->_deleted));
       
   705         } else {
       
   706             return count($this->_deleted);
       
   707         }
       
   708     }
       
   709 
       
   710     /**
       
   711      * Returns the total number of non-deleted documents in this segment.
       
   712      *
       
   713      * @return integer
       
   714      */
       
   715     public function numDocs()
       
   716     {
       
   717         if ($this->hasDeletions()) {
       
   718             return $this->_docCount - $this->_deletedCount();
       
   719         } else {
       
   720             return $this->_docCount;
       
   721         }
       
   722     }
       
   723 
       
   724     /**
       
   725      * Get field position in a fields dictionary
       
   726      *
       
   727      * @param integer $fieldNum
       
   728      * @return integer
       
   729      */
       
   730     private function _getFieldPosition($fieldNum) {
       
   731         // Treat values which are not in a translation table as a 'direct value'
       
   732         return isset($this->_fieldsDicPositions[$fieldNum]) ?
       
   733                            $this->_fieldsDicPositions[$fieldNum] : $fieldNum;
       
   734     }
       
   735 
       
   736     /**
       
   737      * Return segment name
       
   738      *
       
   739      * @return string
       
   740      */
       
   741     public function getName()
       
   742     {
       
   743         return $this->_name;
       
   744     }
       
   745 
       
   746 
       
   747     /**
       
   748      * TermInfo cache
       
   749      *
       
   750      * Size is 1024.
       
   751      * Numbers are used instead of class constants because of performance considerations
       
   752      *
       
   753      * @var array
       
   754      */
       
   755     private $_termInfoCache = array();
       
   756 
       
   757     private function _cleanUpTermInfoCache()
       
   758     {
       
   759         // Clean 256 term infos
       
   760         foreach ($this->_termInfoCache as $key => $termInfo) {
       
   761             unset($this->_termInfoCache[$key]);
       
   762 
       
   763             // leave 768 last used term infos
       
   764             if (count($this->_termInfoCache) == 768) {
       
   765                 break;
       
   766             }
       
   767         }
       
   768     }
       
   769 
       
   770     /**
       
   771      * Load terms dictionary index
       
   772      *
       
   773      * @throws Zend_Search_Lucene_Exception
       
   774      */
       
   775     private function _loadDictionaryIndex()
       
   776     {
       
   777         // Check, if index is already serialized
       
   778         if ($this->_directory->fileExists($this->_name . '.sti')) {
       
   779             // Load serialized dictionary index data
       
   780             $stiFile = $this->_directory->getFileObject($this->_name . '.sti');
       
   781             $stiFileData = $stiFile->readBytes($this->_directory->fileLength($this->_name . '.sti'));
       
   782 
       
   783             // Load dictionary index data
       
   784             if (($unserializedData = @unserialize($stiFileData)) !== false) {
       
   785                 list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
       
   786                 return;
       
   787             }
       
   788         }
       
   789 
       
   790         // Load data from .tii file and generate .sti file
       
   791 
       
   792         // Prefetch dictionary index data
       
   793         $tiiFile = $this->openCompoundFile('.tii');
       
   794         $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));
       
   795 
       
   796         /** Zend_Search_Lucene_Index_DictionaryLoader */
       
   797         require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';
       
   798 
       
   799         // Load dictionary index data
       
   800         list($this->_termDictionary, $this->_termDictionaryInfos) =
       
   801                     Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);
       
   802 
       
   803         $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
       
   804         $stiFile = $this->_directory->createFile($this->_name . '.sti');
       
   805         $stiFile->writeBytes($stiFileData);
       
   806     }
       
   807 
       
   808     /**
       
   809      * Scans terms dictionary and returns term info
       
   810      *
       
   811      * @param Zend_Search_Lucene_Index_Term $term
       
   812      * @return Zend_Search_Lucene_Index_TermInfo
       
   813      */
       
   814     public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
       
   815     {
       
   816         $termKey = $term->key();
       
   817         if (isset($this->_termInfoCache[$termKey])) {
       
   818             $termInfo = $this->_termInfoCache[$termKey];
       
   819 
       
   820             // Move termInfo to the end of cache
       
   821             unset($this->_termInfoCache[$termKey]);
       
   822             $this->_termInfoCache[$termKey] = $termInfo;
       
   823 
       
   824             return $termInfo;
       
   825         }
       
   826 
       
   827 
       
   828         if ($this->_termDictionary === null) {
       
   829             $this->_loadDictionaryIndex();
       
   830         }
       
   831 
       
   832         $searchField = $this->getFieldNum($term->field);
       
   833 
       
   834         if ($searchField == -1) {
       
   835             return null;
       
   836         }
       
   837         $searchDicField = $this->_getFieldPosition($searchField);
       
   838 
       
   839         // search for appropriate value in dictionary
       
   840         $lowIndex = 0;
       
   841         $highIndex = count($this->_termDictionary)-1;
       
   842         while ($highIndex >= $lowIndex) {
       
   843             // $mid = ($highIndex - $lowIndex)/2;
       
   844             $mid = ($highIndex + $lowIndex) >> 1;
       
   845             $midTerm = $this->_termDictionary[$mid];
       
   846 
       
   847             $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
       
   848             $delta = $searchDicField - $fieldNum;
       
   849             if ($delta == 0) {
       
   850                 $delta = strcmp($term->text, $midTerm[1] /* text */);
       
   851             }
       
   852 
       
   853             if ($delta < 0) {
       
   854                 $highIndex = $mid-1;
       
   855             } elseif ($delta > 0) {
       
   856                 $lowIndex  = $mid+1;
       
   857             } else {
       
   858                 // return $this->_termDictionaryInfos[$mid]; // We got it!
       
   859                 $a = $this->_termDictionaryInfos[$mid];
       
   860                 $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);
       
   861 
       
   862                 // Put loaded termInfo into cache
       
   863                 $this->_termInfoCache[$termKey] = $termInfo;
       
   864 
       
   865                 return $termInfo;
       
   866             }
       
   867         }
       
   868 
       
   869         if ($highIndex == -1) {
       
   870             // Term is out of the dictionary range
       
   871             return null;
       
   872         }
       
   873 
       
   874         $prevPosition = $highIndex;
       
   875         $prevTerm = $this->_termDictionary[$prevPosition];
       
   876         $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
       
   877 
       
   878         $tisFile = $this->openCompoundFile('.tis');
       
   879         $tiVersion = $tisFile->readInt();
       
   880         if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
       
   881             $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
       
   882             require_once 'Zend/Search/Lucene/Exception.php';
       
   883             throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
       
   884         }
       
   885 
       
   886         $termCount     = $tisFile->readLong();
       
   887         $indexInterval = $tisFile->readInt();
       
   888         $skipInterval  = $tisFile->readInt();
       
   889         if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
       
   890             $maxSkipLevels = $tisFile->readInt();
       
   891         }
       
   892 
       
   893         $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);
       
   894 
       
   895         $termValue    = $prevTerm[1] /* text */;
       
   896         $termFieldNum = $prevTerm[0] /* field */;
       
   897         $freqPointer = $prevTermInfo[1] /* freqPointer */;
       
   898         $proxPointer = $prevTermInfo[2] /* proxPointer */;
       
   899         for ($count = $prevPosition*$indexInterval + 1;
       
   900              $count <= $termCount &&
       
   901              ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
       
   902               ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
       
   903                strcmp($termValue, $term->text) < 0) );
       
   904              $count++) {
       
   905             $termPrefixLength = $tisFile->readVInt();
       
   906             $termSuffix       = $tisFile->readString();
       
   907             $termFieldNum     = $tisFile->readVInt();
       
   908             $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;
       
   909 
       
   910             $docFreq      = $tisFile->readVInt();
       
   911             $freqPointer += $tisFile->readVInt();
       
   912             $proxPointer += $tisFile->readVInt();
       
   913             if( $docFreq >= $skipInterval ) {
       
   914                 $skipOffset = $tisFile->readVInt();
       
   915             } else {
       
   916                 $skipOffset = 0;
       
   917             }
       
   918         }
       
   919 
       
   920         if ($termFieldNum == $searchField && $termValue == $term->text) {
       
   921             $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
       
   922         } else {
       
   923             $termInfo = null;
       
   924         }
       
   925 
       
   926         // Put loaded termInfo into cache
       
   927         $this->_termInfoCache[$termKey] = $termInfo;
       
   928 
       
   929         if (count($this->_termInfoCache) == 1024) {
       
   930             $this->_cleanUpTermInfoCache();
       
   931         }
       
   932 
       
   933         return $termInfo;
       
   934     }
       
   935 
       
   936     /**
       
   937      * Returns IDs of all the documents containing term.
       
   938      *
       
   939      * @param Zend_Search_Lucene_Index_Term $term
       
   940      * @param integer $shift
       
   941      * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
       
   942      * @return array
       
   943      */
       
   944     public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
       
   945     {
       
   946         $termInfo = $this->getTermInfo($term);
       
   947 
       
   948         if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
       
   949             if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
   950                 $docsFilter->segmentFilters[$this->_name] = array();
       
   951             }
       
   952             return array();
       
   953         }
       
   954 
       
   955         $frqFile = $this->openCompoundFile('.frq');
       
   956         $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
       
   957         $docId  = 0;
       
   958         $result = array();
       
   959 
       
   960         if ($docsFilter !== null) {
       
   961             if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
   962                 require_once 'Zend/Search/Lucene/Exception.php';
       
   963                 throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
       
   964             }
       
   965 
       
   966             if (isset($docsFilter->segmentFilters[$this->_name])) {
       
   967                 // Filter already has some data for the current segment
       
   968 
       
   969                 // Make short name for the filter (which doesn't need additional dereferencing)
       
   970                 $filter = &$docsFilter->segmentFilters[$this->_name];
       
   971 
       
   972                 // Check if filter is not empty
       
   973                 if (count($filter) == 0) {
       
   974                     return array();
       
   975                 }
       
   976 
       
   977                 if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) {
       
   978                     // Perform fetching
       
   979 // ---------------------------------------------------------------
       
   980                     $updatedFilterData = array();
       
   981 
       
   982                     for( $count=0; $count < $termInfo->docFreq; $count++ ) {
       
   983                         $docDelta = $frqFile->readVInt();
       
   984                         if( $docDelta % 2 == 1 ) {
       
   985                             $docId += ($docDelta-1)/2;
       
   986                         } else {
       
   987                             $docId += $docDelta/2;
       
   988                             // read freq
       
   989                             $frqFile->readVInt();
       
   990                         }
       
   991 
       
   992                         if (isset($filter[$docId])) {
       
   993                            $result[] = $shift + $docId;
       
   994                            $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
   995                         }
       
   996                     }
       
   997                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
   998 // ---------------------------------------------------------------
       
   999                 } else {
       
  1000                     // Perform full scan
       
  1001                     $updatedFilterData = array();
       
  1002 
       
  1003                     for( $count=0; $count < $termInfo->docFreq; $count++ ) {
       
  1004                         $docDelta = $frqFile->readVInt();
       
  1005                         if( $docDelta % 2 == 1 ) {
       
  1006                             $docId += ($docDelta-1)/2;
       
  1007                         } else {
       
  1008                             $docId += $docDelta/2;
       
  1009                             // read freq
       
  1010                             $frqFile->readVInt();
       
  1011                         }
       
  1012 
       
  1013                         if (isset($filter[$docId])) {
       
  1014                            $result[] = $shift + $docId;
       
  1015                            $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1016                         }
       
  1017                     }
       
  1018                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
  1019                 }
       
  1020             } else {
       
  1021                 // Filter is present, but doesn't has data for the current segment yet
       
  1022                 $filterData = array();
       
  1023                 for( $count=0; $count < $termInfo->docFreq; $count++ ) {
       
  1024                     $docDelta = $frqFile->readVInt();
       
  1025                     if( $docDelta % 2 == 1 ) {
       
  1026                         $docId += ($docDelta-1)/2;
       
  1027                     } else {
       
  1028                         $docId += $docDelta/2;
       
  1029                         // read freq
       
  1030                         $frqFile->readVInt();
       
  1031                     }
       
  1032 
       
  1033                     $result[] = $shift + $docId;
       
  1034                     $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1035                 }
       
  1036                 $docsFilter->segmentFilters[$this->_name] = $filterData;
       
  1037             }
       
  1038         } else {
       
  1039             for( $count=0; $count < $termInfo->docFreq; $count++ ) {
       
  1040                 $docDelta = $frqFile->readVInt();
       
  1041                 if( $docDelta % 2 == 1 ) {
       
  1042                     $docId += ($docDelta-1)/2;
       
  1043                 } else {
       
  1044                     $docId += $docDelta/2;
       
  1045                     // read freq
       
  1046                     $frqFile->readVInt();
       
  1047                 }
       
  1048 
       
  1049                 $result[] = $shift + $docId;
       
  1050             }
       
  1051         }
       
  1052 
       
  1053         return $result;
       
  1054     }
       
  1055 
       
  1056     /**
       
  1057      * Returns term freqs array.
       
  1058      * Result array structure: array(docId => freq, ...)
       
  1059      *
       
  1060      * @param Zend_Search_Lucene_Index_Term $term
       
  1061      * @param integer $shift
       
  1062      * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
       
  1063      * @return Zend_Search_Lucene_Index_TermInfo
       
  1064      */
       
  1065     public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
       
  1066     {
       
  1067         $termInfo = $this->getTermInfo($term);
       
  1068 
       
  1069         if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
       
  1070             if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
  1071                 $docsFilter->segmentFilters[$this->_name] = array();
       
  1072             }
       
  1073             return array();
       
  1074         }
       
  1075 
       
  1076         $frqFile = $this->openCompoundFile('.frq');
       
  1077         $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
       
  1078         $result = array();
       
  1079         $docId = 0;
       
  1080 
       
  1081         $result = array();
       
  1082 
       
  1083         if ($docsFilter !== null) {
       
  1084             if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
  1085                 require_once 'Zend/Search/Lucene/Exception.php';
       
  1086                 throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
       
  1087             }
       
  1088 
       
  1089             if (isset($docsFilter->segmentFilters[$this->_name])) {
       
  1090                 // Filter already has some data for the current segment
       
  1091 
       
  1092                 // Make short name for the filter (which doesn't need additional dereferencing)
       
  1093                 $filter = &$docsFilter->segmentFilters[$this->_name];
       
  1094 
       
  1095                 // Check if filter is not empty
       
  1096                 if (count($filter) == 0) {
       
  1097                     return array();
       
  1098                 }
       
  1099 
       
  1100 
       
  1101                 if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) {
       
  1102                     // Perform fetching
       
  1103 // ---------------------------------------------------------------
       
  1104                     $updatedFilterData = array();
       
  1105 
       
  1106                     for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1107                         $docDelta = $frqFile->readVInt();
       
  1108                         if ($docDelta % 2 == 1) {
       
  1109                             $docId += ($docDelta-1)/2;
       
  1110                             if (isset($filter[$docId])) {
       
  1111                                 $result[$shift + $docId] = 1;
       
  1112                                 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1113                             }
       
  1114                         } else {
       
  1115                             $docId += $docDelta/2;
       
  1116                             $freq = $frqFile->readVInt();
       
  1117                             if (isset($filter[$docId])) {
       
  1118                                 $result[$shift + $docId] = $freq;
       
  1119                                 $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1120                             }
       
  1121                         }
       
  1122                     }
       
  1123                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
  1124 // ---------------------------------------------------------------
       
  1125                 } else {
       
  1126                     // Perform full scan
       
  1127                     $updatedFilterData = array();
       
  1128 
       
  1129                     for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1130                         $docDelta = $frqFile->readVInt();
       
  1131                         if ($docDelta % 2 == 1) {
       
  1132                             $docId += ($docDelta-1)/2;
       
  1133                             if (isset($filter[$docId])) {
       
  1134                                 $result[$shift + $docId] = 1;
       
  1135                                 $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here
       
  1136                             }
       
  1137                         } else {
       
  1138                             $docId += $docDelta/2;
       
  1139                             $freq = $frqFile->readVInt();
       
  1140                             if (isset($filter[$docId])) {
       
  1141                                 $result[$shift + $docId] = $freq;
       
  1142                                 $updatedFilterData[$docId] = 1; // 1 is just some constant value, so we don't need additional var dereference here
       
  1143                             }
       
  1144                         }
       
  1145                     }
       
  1146                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
  1147                 }
       
  1148             } else {
       
  1149                 // Filter doesn't has data for current segment
       
  1150                 $filterData = array();
       
  1151 
       
  1152                 for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1153                     $docDelta = $frqFile->readVInt();
       
  1154                     if ($docDelta % 2 == 1) {
       
  1155                         $docId += ($docDelta-1)/2;
       
  1156                         $result[$shift + $docId] = 1;
       
  1157                         $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1158                     } else {
       
  1159                         $docId += $docDelta/2;
       
  1160                         $result[$shift + $docId] = $frqFile->readVInt();
       
  1161                         $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1162                     }
       
  1163                 }
       
  1164 
       
  1165                 $docsFilter->segmentFilters[$this->_name] = $filterData;
       
  1166             }
       
  1167         } else {
       
  1168             for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1169                 $docDelta = $frqFile->readVInt();
       
  1170                 if ($docDelta % 2 == 1) {
       
  1171                     $docId += ($docDelta-1)/2;
       
  1172                     $result[$shift + $docId] = 1;
       
  1173                 } else {
       
  1174                     $docId += $docDelta/2;
       
  1175                     $result[$shift + $docId] = $frqFile->readVInt();
       
  1176                 }
       
  1177             }
       
  1178         }
       
  1179 
       
  1180         return $result;
       
  1181     }
       
  1182 
       
  1183     /**
       
  1184      * Returns term positions array.
       
  1185      * Result array structure: array(docId => array(pos1, pos2, ...), ...)
       
  1186      *
       
  1187      * @param Zend_Search_Lucene_Index_Term $term
       
  1188      * @param integer $shift
       
  1189      * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
       
  1190      * @return Zend_Search_Lucene_Index_TermInfo
       
  1191      */
       
  1192     public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
       
  1193     {
       
  1194         $termInfo = $this->getTermInfo($term);
       
  1195 
       
  1196         if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
       
  1197             if ($docsFilter !== null  &&  $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
  1198                 $docsFilter->segmentFilters[$this->_name] = array();
       
  1199             }
       
  1200             return array();
       
  1201         }
       
  1202 
       
  1203         $frqFile = $this->openCompoundFile('.frq');
       
  1204         $frqFile->seek($termInfo->freqPointer,SEEK_CUR);
       
  1205 
       
  1206         $docId = 0;
       
  1207         $freqs = array();
       
  1208 
       
  1209 
       
  1210         if ($docsFilter !== null) {
       
  1211             if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
       
  1212                 require_once 'Zend/Search/Lucene/Exception.php';
       
  1213                 throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
       
  1214             }
       
  1215 
       
  1216             if (isset($docsFilter->segmentFilters[$this->_name])) {
       
  1217                 // Filter already has some data for the current segment
       
  1218 
       
  1219                 // Make short name for the filter (which doesn't need additional dereferencing)
       
  1220                 $filter = &$docsFilter->segmentFilters[$this->_name];
       
  1221 
       
  1222                 // Check if filter is not empty
       
  1223                 if (count($filter) == 0) {
       
  1224                     return array();
       
  1225                 }
       
  1226 
       
  1227                 if ($this->_docCount/count($filter) < self::FULL_SCAN_VS_FETCH_BOUNDARY) {
       
  1228                     // Perform fetching
       
  1229 // ---------------------------------------------------------------
       
  1230                     for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1231                         $docDelta = $frqFile->readVInt();
       
  1232                         if ($docDelta % 2 == 1) {
       
  1233                             $docId += ($docDelta-1)/2;
       
  1234                             $freqs[$docId] = 1;
       
  1235                         } else {
       
  1236                             $docId += $docDelta/2;
       
  1237                             $freqs[$docId] = $frqFile->readVInt();
       
  1238                         }
       
  1239                     }
       
  1240 
       
  1241                     $updatedFilterData = array();
       
  1242                     $result = array();
       
  1243                     $prxFile = $this->openCompoundFile('.prx');
       
  1244                     $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
       
  1245                     foreach ($freqs as $docId => $freq) {
       
  1246                         $termPosition = 0;
       
  1247                         $positions = array();
       
  1248 
       
  1249                         // we have to read .prx file to get right position for next doc
       
  1250                         // even filter doesn't match current document
       
  1251                         for ($count = 0; $count < $freq; $count++ ) {
       
  1252                             $termPosition += $prxFile->readVInt();
       
  1253                             $positions[] = $termPosition;
       
  1254                         }
       
  1255 
       
  1256                         // Include into updated filter and into result only if doc is matched by filter
       
  1257                         if (isset($filter[$docId])) {
       
  1258                             $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1259                             $result[$shift + $docId] = $positions;
       
  1260                         }
       
  1261                     }
       
  1262 
       
  1263                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
  1264 // ---------------------------------------------------------------
       
  1265                 } else {
       
  1266                     // Perform full scan
       
  1267                     for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1268                         $docDelta = $frqFile->readVInt();
       
  1269                         if ($docDelta % 2 == 1) {
       
  1270                             $docId += ($docDelta-1)/2;
       
  1271                             $freqs[$docId] = 1;
       
  1272                         } else {
       
  1273                             $docId += $docDelta/2;
       
  1274                             $freqs[$docId] = $frqFile->readVInt();
       
  1275                         }
       
  1276                     }
       
  1277 
       
  1278                     $updatedFilterData = array();
       
  1279                     $result = array();
       
  1280                     $prxFile = $this->openCompoundFile('.prx');
       
  1281                     $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
       
  1282                     foreach ($freqs as $docId => $freq) {
       
  1283                         $termPosition = 0;
       
  1284                         $positions = array();
       
  1285 
       
  1286                         // we have to read .prx file to get right position for next doc
       
  1287                         // even filter doesn't match current document
       
  1288                         for ($count = 0; $count < $freq; $count++ ) {
       
  1289                             $termPosition += $prxFile->readVInt();
       
  1290                             $positions[] = $termPosition;
       
  1291                         }
       
  1292 
       
  1293                         // Include into updated filter and into result only if doc is matched by filter
       
  1294                         if (isset($filter[$docId])) {
       
  1295                             $updatedFilterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1296                             $result[$shift + $docId] = $positions;
       
  1297                         }
       
  1298                     }
       
  1299 
       
  1300                     $docsFilter->segmentFilters[$this->_name] = $updatedFilterData;
       
  1301                 }
       
  1302             } else {
       
  1303                 // Filter doesn't has data for current segment
       
  1304                 for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1305                     $docDelta = $frqFile->readVInt();
       
  1306                     if ($docDelta % 2 == 1) {
       
  1307                         $docId += ($docDelta-1)/2;
       
  1308                         $freqs[$docId] = 1;
       
  1309                     } else {
       
  1310                         $docId += $docDelta/2;
       
  1311                         $freqs[$docId] = $frqFile->readVInt();
       
  1312                     }
       
  1313                 }
       
  1314 
       
  1315                 $filterData = array();
       
  1316                 $result = array();
       
  1317                 $prxFile = $this->openCompoundFile('.prx');
       
  1318                 $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
       
  1319                 foreach ($freqs as $docId => $freq) {
       
  1320                     $filterData[$docId] = 1; // 1 is just a some constant value, so we don't need additional var dereference here
       
  1321 
       
  1322                     $termPosition = 0;
       
  1323                     $positions = array();
       
  1324 
       
  1325                     for ($count = 0; $count < $freq; $count++ ) {
       
  1326                         $termPosition += $prxFile->readVInt();
       
  1327                         $positions[] = $termPosition;
       
  1328                     }
       
  1329 
       
  1330                     $result[$shift + $docId] = $positions;
       
  1331                 }
       
  1332 
       
  1333                 $docsFilter->segmentFilters[$this->_name] = $filterData;
       
  1334             }
       
  1335         } else {
       
  1336             for ($count = 0; $count < $termInfo->docFreq; $count++) {
       
  1337                 $docDelta = $frqFile->readVInt();
       
  1338                 if ($docDelta % 2 == 1) {
       
  1339                     $docId += ($docDelta-1)/2;
       
  1340                     $freqs[$docId] = 1;
       
  1341                 } else {
       
  1342                     $docId += $docDelta/2;
       
  1343                     $freqs[$docId] = $frqFile->readVInt();
       
  1344                 }
       
  1345             }
       
  1346 
       
  1347             $result = array();
       
  1348             $prxFile = $this->openCompoundFile('.prx');
       
  1349             $prxFile->seek($termInfo->proxPointer, SEEK_CUR);
       
  1350             foreach ($freqs as $docId => $freq) {
       
  1351                 $termPosition = 0;
       
  1352                 $positions = array();
       
  1353 
       
  1354                 for ($count = 0; $count < $freq; $count++ ) {
       
  1355                     $termPosition += $prxFile->readVInt();
       
  1356                     $positions[] = $termPosition;
       
  1357                 }
       
  1358 
       
  1359                 $result[$shift + $docId] = $positions;
       
  1360             }
       
  1361         }
       
  1362 
       
  1363         return $result;
       
  1364     }
       
  1365 
       
  1366     /**
       
  1367      * Load normalizatin factors from an index file
       
  1368      *
       
  1369      * @param integer $fieldNum
       
  1370      * @throws Zend_Search_Lucene_Exception
       
  1371      */
       
  1372     private function _loadNorm($fieldNum)
       
  1373     {
       
  1374         if ($this->_hasSingleNormFile) {
       
  1375             $normfFile = $this->openCompoundFile('.nrm');
       
  1376 
       
  1377             $header              = $normfFile->readBytes(3);
       
  1378             $headerFormatVersion = $normfFile->readByte();
       
  1379 
       
  1380             if ($header != 'NRM'  ||  $headerFormatVersion != (int)0xFF) {
       
  1381                 require_once 'Zend/Search/Lucene/Exception.php';
       
  1382                 throw new  Zend_Search_Lucene_Exception('Wrong norms file format.');
       
  1383             }
       
  1384 
       
  1385             foreach ($this->_fields as $fNum => $fieldInfo) {
       
  1386                 if ($fieldInfo->isIndexed) {
       
  1387                     $this->_norms[$fNum] = $normfFile->readBytes($this->_docCount);
       
  1388                 }
       
  1389             }
       
  1390         } else {
       
  1391             $fFile = $this->openCompoundFile('.f' . $fieldNum);
       
  1392             $this->_norms[$fieldNum] = $fFile->readBytes($this->_docCount);
       
  1393         }
       
  1394     }
       
  1395 
       
  1396     /**
       
  1397      * Returns normalization factor for specified documents
       
  1398      *
       
  1399      * @param integer $id
       
  1400      * @param string $fieldName
       
  1401      * @return float
       
  1402      */
       
  1403     public function norm($id, $fieldName)
       
  1404     {
       
  1405         $fieldNum = $this->getFieldNum($fieldName);
       
  1406 
       
  1407         if ( !($this->_fields[$fieldNum]->isIndexed) ) {
       
  1408             return null;
       
  1409         }
       
  1410 
       
  1411         if (!isset($this->_norms[$fieldNum])) {
       
  1412             $this->_loadNorm($fieldNum);
       
  1413         }
       
  1414 
       
  1415         return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
       
  1416     }
       
  1417 
       
  1418     /**
       
  1419      * Returns norm vector, encoded in a byte string
       
  1420      *
       
  1421      * @param string $fieldName
       
  1422      * @return string
       
  1423      */
       
  1424     public function normVector($fieldName)
       
  1425     {
       
  1426         $fieldNum = $this->getFieldNum($fieldName);
       
  1427 
       
  1428         if ($fieldNum == -1  ||  !($this->_fields[$fieldNum]->isIndexed)) {
       
  1429             $similarity = Zend_Search_Lucene_Search_Similarity::getDefault();
       
  1430 
       
  1431             return str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )),
       
  1432                               $this->_docCount);
       
  1433         }
       
  1434 
       
  1435         if (!isset($this->_norms[$fieldNum])) {
       
  1436             $this->_loadNorm($fieldNum);
       
  1437         }
       
  1438 
       
  1439         return $this->_norms[$fieldNum];
       
  1440     }
       
  1441 
       
  1442 
       
  1443     /**
       
  1444      * Returns true if any documents have been deleted from this index segment.
       
  1445      *
       
  1446      * @return boolean
       
  1447      */
       
  1448     public function hasDeletions()
       
  1449     {
       
  1450         return $this->_deleted !== null;
       
  1451     }
       
  1452 
       
  1453 
       
  1454     /**
       
  1455      * Returns true if segment has single norms file.
       
  1456      *
       
  1457      * @return boolean
       
  1458      */
       
  1459     public function hasSingleNormFile()
       
  1460     {
       
  1461         return $this->_hasSingleNormFile ? true : false;
       
  1462     }
       
  1463 
       
  1464     /**
       
  1465      * Returns true if segment is stored using compound segment file.
       
  1466      *
       
  1467      * @return boolean
       
  1468      */
       
  1469     public function isCompound()
       
  1470     {
       
  1471         return $this->_isCompound;
       
  1472     }
       
  1473 
       
  1474     /**
       
  1475      * Deletes a document from the index segment.
       
  1476      * $id is an internal document id
       
  1477      *
       
  1478      * @param integer
       
  1479      */
       
  1480     public function delete($id)
       
  1481     {
       
  1482         $this->_deletedDirty = true;
       
  1483 
       
  1484         if (extension_loaded('bitset')) {
       
  1485             if ($this->_deleted === null) {
       
  1486                 $this->_deleted = bitset_empty($id);
       
  1487             }
       
  1488             bitset_incl($this->_deleted, $id);
       
  1489         } else {
       
  1490             if ($this->_deleted === null) {
       
  1491                 $this->_deleted = array();
       
  1492             }
       
  1493 
       
  1494             $this->_deleted[$id] = 1;
       
  1495         }
       
  1496     }
       
  1497 
       
  1498     /**
       
  1499      * Checks, that document is deleted
       
  1500      *
       
  1501      * @param integer
       
  1502      * @return boolean
       
  1503      */
       
  1504     public function isDeleted($id)
       
  1505     {
       
  1506         if ($this->_deleted === null) {
       
  1507             return false;
       
  1508         }
       
  1509 
       
  1510         if (extension_loaded('bitset')) {
       
  1511             return bitset_in($this->_deleted, $id);
       
  1512         } else {
       
  1513             return isset($this->_deleted[$id]);
       
  1514         }
       
  1515     }
       
  1516 
       
  1517     /**
       
  1518      * Detect latest delete generation
       
  1519      *
       
  1520      * Is actualy used from writeChanges() method or from the constructor if it's invoked from
       
  1521      * Index writer. In both cases index write lock is already obtained, so we shouldn't care
       
  1522      * about it
       
  1523      *
       
  1524      * @return integer
       
  1525      */
       
  1526     private function _detectLatestDelGen()
       
  1527     {
       
  1528         $delFileList = array();
       
  1529         foreach ($this->_directory->fileList() as $file) {
       
  1530             if ($file == $this->_name . '.del') {
       
  1531                 // Matches <segment_name>.del file name
       
  1532                 $delFileList[] = 0;
       
  1533             } else if (preg_match('/^' . $this->_name . '_([a-zA-Z0-9]+)\.del$/i', $file, $matches)) {
       
  1534                 // Matches <segment_name>_NNN.del file names
       
  1535                 $delFileList[] = (int)base_convert($matches[1], 36, 10);
       
  1536             }
       
  1537         }
       
  1538 
       
  1539         if (count($delFileList) == 0) {
       
  1540             // There is no deletions file for current segment in the directory
       
  1541             // Set deletions file generation number to 1
       
  1542             return -1;
       
  1543         } else {
       
  1544             // There are some deletions files for current segment in the directory
       
  1545             // Set deletions file generation number to the highest nuber
       
  1546             return max($delFileList);
       
  1547         }
       
  1548     }
       
  1549 
       
  1550     /**
       
  1551      * Write changes if it's necessary.
       
  1552      *
       
  1553      * This method must be invoked only from the Writer _updateSegments() method,
       
  1554      * so index Write lock has to be already obtained.
       
  1555      *
       
  1556      * @internal
       
  1557      * @throws Zend_Search_Lucene_Exceptions
       
  1558      */
       
  1559     public function writeChanges()
       
  1560     {
       
  1561         // Get new generation number
       
  1562         $latestDelGen = $this->_detectLatestDelGen();
       
  1563 
       
  1564         if (!$this->_deletedDirty) {
       
  1565             // There was no deletions by current process
       
  1566 
       
  1567             if ($latestDelGen == $this->_delGen) {
       
  1568                 // Delete file hasn't been updated by any concurrent process
       
  1569                 return;
       
  1570             } else if ($latestDelGen > $this->_delGen) {
       
  1571                 // Delete file has been updated by some concurrent process
       
  1572                 // Reload deletions file
       
  1573                 $this->_delGen  = $latestDelGen;
       
  1574                 $this->_deleted = $this->_loadDelFile();
       
  1575 
       
  1576                 return;
       
  1577             } else {
       
  1578                 require_once 'Zend/Search/Lucene/Exception.php';
       
  1579                 throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
       
  1580             }
       
  1581         }
       
  1582 
       
  1583         if ($latestDelGen > $this->_delGen) {
       
  1584             // Merge current deletions with latest deletions file
       
  1585             $this->_delGen = $latestDelGen;
       
  1586 
       
  1587             $latestDelete = $this->_loadDelFile();
       
  1588 
       
  1589             if (extension_loaded('bitset')) {
       
  1590                 $this->_deleted = bitset_union($this->_deleted, $latestDelete);
       
  1591             } else {
       
  1592                 $this->_deleted += $latestDelete;
       
  1593             }
       
  1594         }
       
  1595 
       
  1596         if (extension_loaded('bitset')) {
       
  1597             $delBytes = $this->_deleted;
       
  1598             $bitCount = count(bitset_to_array($delBytes));
       
  1599         } else {
       
  1600             $byteCount = floor($this->_docCount/8)+1;
       
  1601             $delBytes = str_repeat(chr(0), $byteCount);
       
  1602             for ($count = 0; $count < $byteCount; $count++) {
       
  1603                 $byte = 0;
       
  1604                 for ($bit = 0; $bit < 8; $bit++) {
       
  1605                     if (isset($this->_deleted[$count*8 + $bit])) {
       
  1606                         $byte |= (1<<$bit);
       
  1607                     }
       
  1608                 }
       
  1609                 $delBytes[$count] = chr($byte);
       
  1610             }
       
  1611             $bitCount = count($this->_deleted);
       
  1612         }
       
  1613 
       
  1614         if ($this->_delGen == -1) {
       
  1615             // Set delete file generation number to 1
       
  1616             $this->_delGen = 1;
       
  1617         } else {
       
  1618             // Increase delete file generation number by 1
       
  1619             $this->_delGen++;
       
  1620         }
       
  1621 
       
  1622         $delFile = $this->_directory->createFile($this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del');
       
  1623         $delFile->writeInt($this->_docCount);
       
  1624         $delFile->writeInt($bitCount);
       
  1625         $delFile->writeBytes($delBytes);
       
  1626 
       
  1627         $this->_deletedDirty = false;
       
  1628     }
       
  1629 
       
  1630 
       
  1631     /**
       
  1632      * Term Dictionary File object for stream like terms reading
       
  1633      *
       
  1634      * @var Zend_Search_Lucene_Storage_File
       
  1635      */
       
  1636     private $_tisFile = null;
       
  1637 
       
  1638     /**
       
  1639      * Actual offset of the .tis file data
       
  1640      *
       
  1641      * @var integer
       
  1642      */
       
  1643     private $_tisFileOffset;
       
  1644 
       
  1645     /**
       
  1646      * Frequencies File object for stream like terms reading
       
  1647      *
       
  1648      * @var Zend_Search_Lucene_Storage_File
       
  1649      */
       
  1650     private $_frqFile = null;
       
  1651 
       
  1652     /**
       
  1653      * Actual offset of the .frq file data
       
  1654      *
       
  1655      * @var integer
       
  1656      */
       
  1657     private $_frqFileOffset;
       
  1658 
       
  1659     /**
       
  1660      * Positions File object for stream like terms reading
       
  1661      *
       
  1662      * @var Zend_Search_Lucene_Storage_File
       
  1663      */
       
  1664     private $_prxFile = null;
       
  1665 
       
  1666     /**
       
  1667      * Actual offset of the .prx file in the compound file
       
  1668      *
       
  1669      * @var integer
       
  1670      */
       
  1671     private $_prxFileOffset;
       
  1672 
       
  1673 
       
  1674     /**
       
  1675      * Actual number of terms in term stream
       
  1676      *
       
  1677      * @var integer
       
  1678      */
       
  1679     private $_termCount = 0;
       
  1680 
       
  1681     /**
       
  1682      * Overall number of terms in term stream
       
  1683      *
       
  1684      * @var integer
       
  1685      */
       
  1686     private $_termNum = 0;
       
  1687 
       
  1688     /**
       
  1689      * Segment index interval
       
  1690      *
       
  1691      * @var integer
       
  1692      */
       
  1693     private $_indexInterval;
       
  1694 
       
  1695     /**
       
  1696      * Segment skip interval
       
  1697      *
       
  1698      * @var integer
       
  1699      */
       
  1700     private $_skipInterval;
       
  1701 
       
  1702     /**
       
  1703      * Last TermInfo in a terms stream
       
  1704      *
       
  1705      * @var Zend_Search_Lucene_Index_TermInfo
       
  1706      */
       
  1707     private $_lastTermInfo = null;
       
  1708 
       
  1709     /**
       
  1710      * Last Term in a terms stream
       
  1711      *
       
  1712      * @var Zend_Search_Lucene_Index_Term
       
  1713      */
       
  1714     private $_lastTerm = null;
       
  1715 
       
  1716     /**
       
  1717      * Map of the document IDs
       
  1718      * Used to get new docID after removing deleted documents.
       
  1719      * It's not very effective from memory usage point of view,
       
  1720      * but much faster than other methods
       
  1721      *
       
  1722      * @var array|null
       
  1723      */
       
  1724     private $_docMap = null;
       
  1725 
       
  1726     /**
       
  1727      * An array of all term positions in the documents.
       
  1728      * Array structure: array( docId => array( pos1, pos2, ...), ...)
       
  1729      *
       
  1730      * Is set to null if term positions loading has to be skipped
       
  1731      *
       
  1732      * @var array|null
       
  1733      */
       
  1734     private $_lastTermPositions;
       
  1735 
       
  1736 
       
  1737     /**
       
  1738      * Terms scan mode
       
  1739      *
       
  1740      * Values:
       
  1741      *
       
  1742      * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
       
  1743      * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
       
  1744      * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
       
  1745      *                       document numbers are compacted (shifted if segment has deleted documents)
       
  1746      *
       
  1747      * @var integer
       
  1748      */
       
  1749     private $_termsScanMode;
       
  1750 
       
  1751     /** Scan modes */
       
  1752     const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
       
  1753     const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
       
  1754     const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
       
  1755                                 // document numbers are compacted (shifted if segment contains deleted documents)
       
  1756 
       
  1757     /**
       
  1758      * Reset terms stream
       
  1759      *
       
  1760      * $startId - id for the fist document
       
  1761      * $compact - remove deleted documents
       
  1762      *
       
  1763      * Returns start document id for the next segment
       
  1764      *
       
  1765      * @param integer $startId
       
  1766      * @param integer $mode
       
  1767      * @throws Zend_Search_Lucene_Exception
       
  1768      * @return integer
       
  1769      */
       
  1770     public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
       
  1771     {
       
  1772         /**
       
  1773          * SegmentInfo->resetTermsStream() method actually takes two optional parameters:
       
  1774          *   $startId (default value is 0)
       
  1775          *   $mode (default value is self::SM_TERMS_ONLY)
       
  1776          */
       
  1777         $argList = func_get_args();
       
  1778         if (count($argList) > 2) {
       
  1779             require_once 'Zend/Search/Lucene/Exception.php';
       
  1780             throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
       
  1781         } else if (count($argList) == 2) {
       
  1782             $startId = $argList[0];
       
  1783             $mode    = $argList[1];
       
  1784         } else if (count($argList) == 1) {
       
  1785             $startId = $argList[0];
       
  1786             $mode    = self::SM_TERMS_ONLY;
       
  1787         } else {
       
  1788             $startId = 0;
       
  1789             $mode    = self::SM_TERMS_ONLY;
       
  1790         }
       
  1791 
       
  1792         if ($this->_tisFile !== null) {
       
  1793             $this->_tisFile = null;
       
  1794         }
       
  1795 
       
  1796         $this->_tisFile = $this->openCompoundFile('.tis', false);
       
  1797         $this->_tisFileOffset = $this->_tisFile->tell();
       
  1798 
       
  1799         $tiVersion = $this->_tisFile->readInt();
       
  1800         if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
       
  1801             $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
       
  1802             require_once 'Zend/Search/Lucene/Exception.php';
       
  1803             throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
       
  1804         }
       
  1805 
       
  1806         $this->_termCount     =
       
  1807               $this->_termNum = $this->_tisFile->readLong(); // Read terms count
       
  1808         $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
       
  1809         $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
       
  1810         if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
       
  1811             $maxSkipLevels = $this->_tisFile->readInt();
       
  1812         }
       
  1813 
       
  1814         if ($this->_frqFile !== null) {
       
  1815             $this->_frqFile = null;
       
  1816         }
       
  1817         if ($this->_prxFile !== null) {
       
  1818             $this->_prxFile = null;
       
  1819         }
       
  1820         $this->_docMap = array();
       
  1821 
       
  1822         $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
       
  1823         $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
       
  1824         $this->_lastTermPositions = null;
       
  1825 
       
  1826         $this->_termsScanMode = $mode;
       
  1827 
       
  1828         switch ($mode) {
       
  1829             case self::SM_TERMS_ONLY:
       
  1830                 // Do nothing
       
  1831                 break;
       
  1832 
       
  1833             case self::SM_FULL_INFO:
       
  1834                 // break intentionally omitted
       
  1835             case self::SM_MERGE_INFO:
       
  1836                 $this->_frqFile = $this->openCompoundFile('.frq', false);
       
  1837                 $this->_frqFileOffset = $this->_frqFile->tell();
       
  1838 
       
  1839                 $this->_prxFile = $this->openCompoundFile('.prx', false);
       
  1840                 $this->_prxFileOffset = $this->_prxFile->tell();
       
  1841 
       
  1842                 for ($count = 0; $count < $this->_docCount; $count++) {
       
  1843                     if (!$this->isDeleted($count)) {
       
  1844                         $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
       
  1845                     }
       
  1846                 }
       
  1847                 break;
       
  1848 
       
  1849             default:
       
  1850                 require_once 'Zend/Search/Lucene/Exception.php';
       
  1851                 throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
       
  1852                 break;
       
  1853         }
       
  1854 
       
  1855         // Calculate next segment start id (since $this->_docMap structure may be cleaned by $this->nextTerm() call)
       
  1856         $nextSegmentStartId = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
       
  1857         $this->nextTerm();
       
  1858 
       
  1859         return $nextSegmentStartId;
       
  1860     }
       
  1861 
       
  1862 
       
  1863     /**
       
  1864      * Skip terms stream up to the specified term preffix.
       
  1865      *
       
  1866      * Prefix contains fully specified field info and portion of searched term
       
  1867      *
       
  1868      * @param Zend_Search_Lucene_Index_Term $prefix
       
  1869      * @throws Zend_Search_Lucene_Exception
       
  1870      */
       
  1871     public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
       
  1872     {
       
  1873         if ($this->_termDictionary === null) {
       
  1874             $this->_loadDictionaryIndex();
       
  1875         }
       
  1876 
       
  1877         $searchField = $this->getFieldNum($prefix->field);
       
  1878 
       
  1879         if ($searchField == -1) {
       
  1880             /**
       
  1881              * Field is not presented in this segment
       
  1882              * Go to the end of dictionary
       
  1883              */
       
  1884             $this->_tisFile = null;
       
  1885             $this->_frqFile = null;
       
  1886             $this->_prxFile = null;
       
  1887 
       
  1888             $this->_lastTerm          = null;
       
  1889             $this->_lastTermInfo      = null;
       
  1890             $this->_lastTermPositions = null;
       
  1891 
       
  1892             return;
       
  1893         }
       
  1894         $searchDicField = $this->_getFieldPosition($searchField);
       
  1895 
       
  1896         // search for appropriate value in dictionary
       
  1897         $lowIndex = 0;
       
  1898         $highIndex = count($this->_termDictionary)-1;
       
  1899         while ($highIndex >= $lowIndex) {
       
  1900             // $mid = ($highIndex - $lowIndex)/2;
       
  1901             $mid = ($highIndex + $lowIndex) >> 1;
       
  1902             $midTerm = $this->_termDictionary[$mid];
       
  1903 
       
  1904             $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
       
  1905             $delta = $searchDicField - $fieldNum;
       
  1906             if ($delta == 0) {
       
  1907                 $delta = strcmp($prefix->text, $midTerm[1] /* text */);
       
  1908             }
       
  1909 
       
  1910             if ($delta < 0) {
       
  1911                 $highIndex = $mid-1;
       
  1912             } elseif ($delta > 0) {
       
  1913                 $lowIndex  = $mid+1;
       
  1914             } else {
       
  1915                 // We have reached term we are looking for
       
  1916                 break;
       
  1917             }
       
  1918         }
       
  1919 
       
  1920         if ($highIndex == -1) {
       
  1921             // Term is out of the dictionary range
       
  1922             $this->_tisFile = null;
       
  1923             $this->_frqFile = null;
       
  1924             $this->_prxFile = null;
       
  1925 
       
  1926             $this->_lastTerm          = null;
       
  1927             $this->_lastTermInfo      = null;
       
  1928             $this->_lastTermPositions = null;
       
  1929 
       
  1930             return;
       
  1931         }
       
  1932 
       
  1933         $prevPosition = $highIndex;
       
  1934         $prevTerm = $this->_termDictionary[$prevPosition];
       
  1935         $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];
       
  1936 
       
  1937         if ($this->_tisFile === null) {
       
  1938             // The end of terms stream is reached and terms dictionary file is closed
       
  1939             // Perform mini-reset operation
       
  1940             $this->_tisFile = $this->openCompoundFile('.tis', false);
       
  1941 
       
  1942             if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
       
  1943                 $this->_frqFile = $this->openCompoundFile('.frq', false);
       
  1944                 $this->_prxFile = $this->openCompoundFile('.prx', false);
       
  1945             }
       
  1946         }
       
  1947         $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);
       
  1948 
       
  1949         $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
       
  1950                                                                  ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
       
  1951         $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
       
  1952                                                                      $prevTermInfo[1] /* freqPointer */,
       
  1953                                                                      $prevTermInfo[2] /* proxPointer */,
       
  1954                                                                      $prevTermInfo[3] /* skipOffset */);
       
  1955         $this->_termCount  =  $this->_termNum - $prevPosition*$this->_indexInterval;
       
  1956 
       
  1957         if ($highIndex == 0) {
       
  1958             // skip start entry
       
  1959             $this->nextTerm();
       
  1960         } else if ($prefix->field == $this->_lastTerm->field  &&  $prefix->text  == $this->_lastTerm->text) {
       
  1961             // We got exact match in the dictionary index
       
  1962 
       
  1963             if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
       
  1964                 $this->_lastTermPositions = array();
       
  1965 
       
  1966                 $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
       
  1967                 $freqs = array();   $docId = 0;
       
  1968                 for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
       
  1969                     $docDelta = $this->_frqFile->readVInt();
       
  1970                     if( $docDelta % 2 == 1 ) {
       
  1971                         $docId += ($docDelta-1)/2;
       
  1972                         $freqs[ $docId ] = 1;
       
  1973                     } else {
       
  1974                         $docId += $docDelta/2;
       
  1975                         $freqs[ $docId ] = $this->_frqFile->readVInt();
       
  1976                     }
       
  1977                 }
       
  1978 
       
  1979                 $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
       
  1980                 foreach ($freqs as $docId => $freq) {
       
  1981                     $termPosition = 0;  $positions = array();
       
  1982 
       
  1983                     for ($count = 0; $count < $freq; $count++ ) {
       
  1984                         $termPosition += $this->_prxFile->readVInt();
       
  1985                         $positions[] = $termPosition;
       
  1986                     }
       
  1987 
       
  1988                     if (isset($this->_docMap[$docId])) {
       
  1989                         $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
       
  1990                     }
       
  1991                 }
       
  1992             }
       
  1993 
       
  1994             return;
       
  1995         }
       
  1996 
       
  1997         // Search term matching specified prefix
       
  1998         while ($this->_lastTerm !== null) {
       
  1999             if ( strcmp($this->_lastTerm->field, $prefix->field) > 0  ||
       
  2000                  ($prefix->field == $this->_lastTerm->field  &&  strcmp($this->_lastTerm->text, $prefix->text) >= 0) ) {
       
  2001                     // Current term matches or greate than the pattern
       
  2002                     return;
       
  2003             }
       
  2004 
       
  2005             $this->nextTerm();
       
  2006         }
       
  2007     }
       
  2008 
       
  2009 
       
  2010     /**
       
  2011      * Scans terms dictionary and returns next term
       
  2012      *
       
  2013      * @return Zend_Search_Lucene_Index_Term|null
       
  2014      */
       
  2015     public function nextTerm()
       
  2016     {
       
  2017         if ($this->_tisFile === null  ||  $this->_termCount == 0) {
       
  2018             $this->_lastTerm          = null;
       
  2019             $this->_lastTermInfo      = null;
       
  2020             $this->_lastTermPositions = null;
       
  2021             $this->_docMap            = null;
       
  2022 
       
  2023             // may be necessary for "empty" segment
       
  2024             $this->_tisFile = null;
       
  2025             $this->_frqFile = null;
       
  2026             $this->_prxFile = null;
       
  2027 
       
  2028             return null;
       
  2029         }
       
  2030 
       
  2031         $termPrefixLength = $this->_tisFile->readVInt();
       
  2032         $termSuffix       = $this->_tisFile->readString();
       
  2033         $termFieldNum     = $this->_tisFile->readVInt();
       
  2034         $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $termPrefixLength) . $termSuffix;
       
  2035 
       
  2036         $this->_lastTerm = new Zend_Search_Lucene_Index_Term($termValue, $this->_fields[$termFieldNum]->name);
       
  2037 
       
  2038         $docFreq     = $this->_tisFile->readVInt();
       
  2039         $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
       
  2040         $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
       
  2041         if ($docFreq >= $this->_skipInterval) {
       
  2042             $skipOffset = $this->_tisFile->readVInt();
       
  2043         } else {
       
  2044             $skipOffset = 0;
       
  2045         }
       
  2046 
       
  2047         $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
       
  2048 
       
  2049 
       
  2050         if ($this->_termsScanMode == self::SM_FULL_INFO  ||  $this->_termsScanMode == self::SM_MERGE_INFO) {
       
  2051             $this->_lastTermPositions = array();
       
  2052 
       
  2053             $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
       
  2054             $freqs = array();   $docId = 0;
       
  2055             for( $count = 0; $count < $this->_lastTermInfo->docFreq; $count++ ) {
       
  2056                 $docDelta = $this->_frqFile->readVInt();
       
  2057                 if( $docDelta % 2 == 1 ) {
       
  2058                     $docId += ($docDelta-1)/2;
       
  2059                     $freqs[ $docId ] = 1;
       
  2060                 } else {
       
  2061                     $docId += $docDelta/2;
       
  2062                     $freqs[ $docId ] = $this->_frqFile->readVInt();
       
  2063                 }
       
  2064             }
       
  2065 
       
  2066             $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
       
  2067             foreach ($freqs as $docId => $freq) {
       
  2068                 $termPosition = 0;  $positions = array();
       
  2069 
       
  2070                 for ($count = 0; $count < $freq; $count++ ) {
       
  2071                     $termPosition += $this->_prxFile->readVInt();
       
  2072                     $positions[] = $termPosition;
       
  2073                 }
       
  2074 
       
  2075                 if (isset($this->_docMap[$docId])) {
       
  2076                     $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
       
  2077                 }
       
  2078             }
       
  2079         }
       
  2080 
       
  2081         $this->_termCount--;
       
  2082         if ($this->_termCount == 0) {
       
  2083             $this->_tisFile = null;
       
  2084             $this->_frqFile = null;
       
  2085             $this->_prxFile = null;
       
  2086         }
       
  2087 
       
  2088         return $this->_lastTerm;
       
  2089     }
       
  2090 
       
  2091     /**
       
  2092      * Close terms stream
       
  2093      *
       
  2094      * Should be used for resources clean up if stream is not read up to the end
       
  2095      */
       
  2096     public function closeTermsStream()
       
  2097     {
       
  2098         $this->_tisFile = null;
       
  2099         $this->_frqFile = null;
       
  2100         $this->_prxFile = null;
       
  2101 
       
  2102         $this->_lastTerm          = null;
       
  2103         $this->_lastTermInfo      = null;
       
  2104         $this->_lastTermPositions = null;
       
  2105 
       
  2106         $this->_docMap            = null;
       
  2107     }
       
  2108 
       
  2109 
       
  2110     /**
       
  2111      * Returns term in current position
       
  2112      *
       
  2113      * @return Zend_Search_Lucene_Index_Term|null
       
  2114      */
       
  2115     public function currentTerm()
       
  2116     {
       
  2117         return $this->_lastTerm;
       
  2118     }
       
  2119 
       
  2120 
       
  2121     /**
       
  2122      * Returns an array of all term positions in the documents.
       
  2123      * Return array structure: array( docId => array( pos1, pos2, ...), ...)
       
  2124      *
       
  2125      * @return array
       
  2126      */
       
  2127     public function currentTermPositions()
       
  2128     {
       
  2129         return $this->_lastTermPositions;
       
  2130     }
       
  2131 }
       
  2132