web/lib/Zend/Search/Lucene/Index/DictionaryLoader.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Index
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: DictionaryLoader.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 /**
       
    24  * Dictionary loader
       
    25  *
       
     26  * It's a dummy class which was created to encapsulate poorly structured code.
       
     27  * Manual "method inlining" is performed to speed up the dictionary index loading operation,
       
     28  * which is a major bottleneck for search performance.
       
    29  *
       
    30  *
       
    31  * @category   Zend
       
    32  * @package    Zend_Search_Lucene
       
    33  * @subpackage Index
       
    34  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    35  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    36  */
       
    37 class Zend_Search_Lucene_Index_DictionaryLoader
       
    38 {
       
    39     /**
       
    40      * Dictionary index loader.
       
    41      *
       
    42      * It takes a string which is actually <segment_name>.tii index file data and
       
    43      * returns two arrays - term and tremInfo lists.
       
    44      *
       
    45      * See Zend_Search_Lucene_Index_SegmintInfo class for details
       
    46      *
       
    47      * @param string $data
       
    48      * @return array
       
    49      * @throws Zend_Search_Lucene_Exception
       
    50      */
       
    51     public static function load($data)
       
    52     {
       
    53         $termDictionary = array();
       
    54         $termInfos      = array();
       
    55         $pos = 0;
       
    56 
       
    57         // $tiVersion = $tiiFile->readInt();
       
    58         $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
       
    59         $pos += 4;
       
    60         if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
       
    61             $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
       
    62                 require_once 'Zend/Search/Lucene/Exception.php';
       
    63                 throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
       
    64         }
       
    65 
       
    66         // $indexTermCount = $tiiFile->readLong();
       
    67         if (PHP_INT_SIZE > 4) {
       
    68             $indexTermCount = ord($data[$pos]) << 56  |
       
    69                               ord($data[$pos+1]) << 48  |
       
    70                               ord($data[$pos+2]) << 40  |
       
    71                               ord($data[$pos+3]) << 32  |
       
    72                               ord($data[$pos+4]) << 24  |
       
    73                               ord($data[$pos+5]) << 16  |
       
    74                               ord($data[$pos+6]) << 8   |
       
    75                               ord($data[$pos+7]);
       
    76         } else {
       
    77             if ((ord($data[$pos])            != 0) ||
       
    78                 (ord($data[$pos+1])          != 0) ||
       
    79                 (ord($data[$pos+2])          != 0) ||
       
    80                 (ord($data[$pos+3])          != 0) ||
       
    81                 ((ord($data[$pos+4]) & 0x80) != 0)) {
       
    82                     require_once 'Zend/Search/Lucene/Exception.php';
       
    83                     throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
       
    84                  }
       
    85 
       
    86             $indexTermCount = ord($data[$pos+4]) << 24  |
       
    87                               ord($data[$pos+5]) << 16  |
       
    88                               ord($data[$pos+6]) << 8   |
       
    89                               ord($data[$pos+7]);
       
    90         }
       
    91         $pos += 8;
       
    92 
       
    93         //                  $tiiFile->readInt();  // IndexInterval
       
    94         $pos += 4;
       
    95 
       
    96         // $skipInterval   = $tiiFile->readInt();
       
    97         $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
       
    98         $pos += 4;
       
    99         if ($indexTermCount < 1) {
       
   100             require_once 'Zend/Search/Lucene/Exception.php';
       
   101             throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
       
   102         }
       
   103 
       
   104         if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
       
   105             /* Skip MaxSkipLevels value */
       
   106             $pos += 4;
       
   107         }
       
   108 
       
   109         $prevTerm     = '';
       
   110         $freqPointer  =  0;
       
   111         $proxPointer  =  0;
       
   112         $indexPointer =  0;
       
   113         for ($count = 0; $count < $indexTermCount; $count++) {
       
   114             //$termPrefixLength = $tiiFile->readVInt();
       
   115             $nbyte = ord($data[$pos++]);
       
   116             $termPrefixLength = $nbyte & 0x7F;
       
   117             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   118                 $nbyte = ord($data[$pos++]);
       
   119                 $termPrefixLength |= ($nbyte & 0x7F) << $shift;
       
   120             }
       
   121 
       
   122             // $termSuffix       = $tiiFile->readString();
       
   123             $nbyte = ord($data[$pos++]);
       
   124             $len = $nbyte & 0x7F;
       
   125             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   126                 $nbyte = ord($data[$pos++]);
       
   127                 $len |= ($nbyte & 0x7F) << $shift;
       
   128             }
       
   129             if ($len == 0) {
       
   130                 $termSuffix = '';
       
   131             } else {
       
   132                 $termSuffix = substr($data, $pos, $len);
       
   133                 $pos += $len;
       
   134                 for ($count1 = 0; $count1 < $len; $count1++ ) {
       
   135                     if (( ord($termSuffix[$count1]) & 0xC0 ) == 0xC0) {
       
   136                         $addBytes = 1;
       
   137                         if (ord($termSuffix[$count1]) & 0x20 ) {
       
   138                             $addBytes++;
       
   139 
       
   140                             // Never used for Java Lucene created index.
       
   141                             // Java2 doesn't encode strings in four bytes
       
   142                             if (ord($termSuffix[$count1]) & 0x10 ) {
       
   143                                 $addBytes++;
       
   144                             }
       
   145                         }
       
   146                         $termSuffix .= substr($data, $pos, $addBytes);
       
   147                         $pos += $addBytes;
       
   148                         $len += $addBytes;
       
   149 
       
   150                         // Check for null character. Java2 encodes null character
       
   151                         // in two bytes.
       
   152                         if (ord($termSuffix[$count1]) == 0xC0 &&
       
   153                             ord($termSuffix[$count1+1]) == 0x80   ) {
       
   154                             $termSuffix[$count1] = 0;
       
   155                             $termSuffix = substr($termSuffix,0,$count1+1)
       
   156                                         . substr($termSuffix,$count1+2);
       
   157                         }
       
   158                         $count1 += $addBytes;
       
   159                     }
       
   160                 }
       
   161             }
       
   162 
       
   163             // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
       
   164             $pb = 0; $pc = 0;
       
   165             while ($pb < strlen($prevTerm)  &&  $pc < $termPrefixLength) {
       
   166                 $charBytes = 1;
       
   167                 if ((ord($prevTerm[$pb]) & 0xC0) == 0xC0) {
       
   168                     $charBytes++;
       
   169                     if (ord($prevTerm[$pb]) & 0x20 ) {
       
   170                         $charBytes++;
       
   171                         if (ord($prevTerm[$pb]) & 0x10 ) {
       
   172                             $charBytes++;
       
   173                         }
       
   174                     }
       
   175                 }
       
   176 
       
   177                 if ($pb + $charBytes > strlen($data)) {
       
   178                     // wrong character
       
   179                     break;
       
   180                 }
       
   181 
       
   182                 $pc++;
       
   183                 $pb += $charBytes;
       
   184             }
       
   185             $termValue = substr($prevTerm, 0, $pb) . $termSuffix;
       
   186 
       
   187             // $termFieldNum     = $tiiFile->readVInt();
       
   188             $nbyte = ord($data[$pos++]);
       
   189             $termFieldNum = $nbyte & 0x7F;
       
   190             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   191                 $nbyte = ord($data[$pos++]);
       
   192                 $termFieldNum |= ($nbyte & 0x7F) << $shift;
       
   193             }
       
   194 
       
   195             // $docFreq          = $tiiFile->readVInt();
       
   196             $nbyte = ord($data[$pos++]);
       
   197             $docFreq = $nbyte & 0x7F;
       
   198             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   199                 $nbyte = ord($data[$pos++]);
       
   200                 $docFreq |= ($nbyte & 0x7F) << $shift;
       
   201             }
       
   202 
       
   203             // $freqPointer     += $tiiFile->readVInt();
       
   204             $nbyte = ord($data[$pos++]);
       
   205             $vint = $nbyte & 0x7F;
       
   206             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   207                 $nbyte = ord($data[$pos++]);
       
   208                 $vint |= ($nbyte & 0x7F) << $shift;
       
   209             }
       
   210             $freqPointer += $vint;
       
   211 
       
   212             // $proxPointer     += $tiiFile->readVInt();
       
   213             $nbyte = ord($data[$pos++]);
       
   214             $vint = $nbyte & 0x7F;
       
   215             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   216                 $nbyte = ord($data[$pos++]);
       
   217                 $vint |= ($nbyte & 0x7F) << $shift;
       
   218             }
       
   219             $proxPointer += $vint;
       
   220 
       
   221             if( $docFreq >= $skipInterval ) {
       
   222                 // $skipDelta = $tiiFile->readVInt();
       
   223                 $nbyte = ord($data[$pos++]);
       
   224                 $vint = $nbyte & 0x7F;
       
   225                 for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   226                     $nbyte = ord($data[$pos++]);
       
   227                     $vint |= ($nbyte & 0x7F) << $shift;
       
   228                 }
       
   229                 $skipDelta = $vint;
       
   230             } else {
       
   231                 $skipDelta = 0;
       
   232             }
       
   233 
       
   234             // $indexPointer += $tiiFile->readVInt();
       
   235             $nbyte = ord($data[$pos++]);
       
   236             $vint = $nbyte & 0x7F;
       
   237             for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
       
   238                 $nbyte = ord($data[$pos++]);
       
   239                 $vint |= ($nbyte & 0x7F) << $shift;
       
   240             }
       
   241             $indexPointer += $vint;
       
   242 
       
   243 
       
   244             // $this->_termDictionary[] =  new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
       
   245             $termDictionary[] = array($termFieldNum, $termValue);
       
   246 
       
   247             $termInfos[] =
       
   248                  // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
       
   249                  array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
       
   250 
       
   251             $prevTerm = $termValue;
       
   252         }
       
   253 
       
   254         // Check special index entry mark
       
   255         if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
       
   256             require_once 'Zend/Search/Lucene/Exception.php';
       
   257             throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
       
   258         }
       
   259 
       
   260         if (PHP_INT_SIZE > 4) {
       
   261             // Treat 64-bit 0xFFFFFFFF as -1
       
   262             $termDictionary[0][0] = -1;
       
   263         }
       
   264 
       
   265         return array($termDictionary, $termInfos);
       
   266     }
       
   267 }
       
   268