|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Index |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: DictionaryLoader.php 20096 2010-01-06 02:05:09Z bkarwin $ |
|
21 */ |
|
22 |
|
/**
 * Dictionary loader
 *
 * A helper class that isolates deliberately unstructured code.
 * Reader method calls are manually inlined here to speed up loading of the
 * dictionary index, which is a major bottleneck for search performance.
 * The commented-out statements inside load() show the call each inlined
 * fragment replaces.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_DictionaryLoader
{
    /**
     * Dictionary index loader.
     *
     * Takes a string holding the raw contents of a <segment_name>.tii index
     * file and returns two parallel lists: terms and term infos.
     *
     * The result is array($termDictionary, $termInfos) where
     *  - each $termDictionary entry is array($termFieldNum, $termValue);
     *  - each $termInfos entry is
     *    array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer).
     *
     * The first dictionary entry is the special index mark; its field number
     * is validated against 0xFFFFFFFF and reported as -1 on 64-bit platforms.
     *
     * See Zend_Search_Lucene_Index_SegmentInfo class for details
     *
     * @param string $data
     * @return array
     * @throws Zend_Search_Lucene_Exception if the data is truncated or is not
     *                                      a valid term info index
     */
    public static function load($data)
    {
        $termDictionary = array();
        $termInfos      = array();
        $dataLength     = strlen($data);
        $pos            = 0;

        // Fixed header: version (4) + term count (8) + index interval (4) + skip interval (4).
        // Reject shorter input up front instead of reading undefined string offsets.
        if ($dataLength < 20) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $tiVersion = $tiiFile->readInt();
        $tiVersion = ord($data[0]) << 24 | ord($data[1]) << 16 | ord($data[2]) << 8  | ord($data[3]);
        $pos += 4;
        if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */ &&
            $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        // $indexTermCount = $tiiFile->readLong();
        if (PHP_INT_SIZE > 4) {
            $indexTermCount = ord($data[$pos])   << 56 |
                              ord($data[$pos+1]) << 48 |
                              ord($data[$pos+2]) << 40 |
                              ord($data[$pos+3]) << 32 |
                              ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        } else {
            // 32-bit integers: the upper 33 bits of the long must be zero
            if ((ord($data[$pos])            != 0) ||
                (ord($data[$pos+1])          != 0) ||
                (ord($data[$pos+2])          != 0) ||
                (ord($data[$pos+3])          != 0) ||
                ((ord($data[$pos+4]) & 0x80) != 0)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Largest supported segment size (for 32-bit mode) is 2Gb');
            }

            $indexTermCount = ord($data[$pos+4]) << 24 |
                              ord($data[$pos+5]) << 16 |
                              ord($data[$pos+6]) << 8  |
                              ord($data[$pos+7]);
        }
        $pos += 8;

        //                  $tiiFile->readInt();  // IndexInterval
        $pos += 4;

        // $skipInterval   = $tiiFile->readInt();
        $skipInterval = ord($data[$pos]) << 24 | ord($data[$pos+1]) << 16 | ord($data[$pos+2]) << 8  | ord($data[$pos+3]);
        $pos += 4;
        if ($indexTermCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of terms in a term dictionary index');
        }

        if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
            /* Skip MaxSkipLevels value */
            $pos += 4;

            if ($pos > $dataLength) {
                // Header of a 2.1+ file is four bytes longer than the minimum checked above
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
            }
        }

        $prevTerm       = '';
        $prevTermLength = 0;
        $freqPointer    = 0;
        $proxPointer    = 0;
        $indexPointer   = 0;
        for ($count = 0; $count < $indexTermCount; $count++) {
            //$termPrefixLength = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termPrefixLength = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termPrefixLength |= ($nbyte & 0x7F) << $shift;
            }

            // $termSuffix       = $tiiFile->readString();
            $nbyte = ord($data[$pos++]);
            $len = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $len |= ($nbyte & 0x7F) << $shift;
            }
            // $len is a character count, and every character occupies at least
            // one byte, so it can never exceed the number of remaining bytes.
            if ($len > $dataLength - $pos) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
            }
            if ($len == 0) {
                $termSuffix = '';
            } else {
                // Optimistically take one byte per character, then pull in the
                // continuation bytes of every multi-byte sequence found.
                // From here on $len tracks the byte length of $termSuffix.
                $termSuffix = substr($data, $pos, $len);
                $pos += $len;
                for ($count1 = 0; $count1 < $len; $count1++ ) {
                    $leadByte = ord($termSuffix[$count1]);
                    if (($leadByte & 0xC0) == 0xC0) {
                        $addBytes = 1;
                        if ($leadByte & 0x20) {
                            $addBytes++;

                            // Never used for Java Lucene created index.
                            // Java2 doesn't encode strings in four bytes
                            if ($leadByte & 0x10) {
                                $addBytes++;
                            }
                        }
                        $termSuffix .= substr($data, $pos, $addBytes);
                        $pos += $addBytes;
                        $len += $addBytes;

                        // Check for null character. Java2 encodes null character
                        // in two bytes.
                        if ($leadByte == 0xC0 &&
                            ord($termSuffix[$count1+1]) == 0x80) {
                            // Collapse the pair into a real NUL byte. Assigning
                            // integer 0 to a string offset would store the digit '0'.
                            $termSuffix = substr($termSuffix, 0, $count1)
                                        . "\0"
                                        . substr($termSuffix, $count1+2);
                            // The string is one byte shorter now and $count1
                            // already points at a complete one-byte character.
                            $len--;
                        } else {
                            // Step over the continuation bytes just consumed
                            $count1 += $addBytes;
                        }
                    }
                }
            }

            // $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($prevTerm, $termPrefixLength) . $termSuffix;
            // $pb counts bytes, $pc counts characters of the shared prefix
            $pb = 0; $pc = 0;
            while ($pb < $prevTermLength  &&  $pc < $termPrefixLength) {
                $charBytes = 1;
                $leadByte  = ord($prevTerm[$pb]);
                if (($leadByte & 0xC0) == 0xC0) {
                    $charBytes++;
                    if ($leadByte & 0x20) {
                        $charBytes++;
                        if ($leadByte & 0x10) {
                            $charBytes++;
                        }
                    }
                }

                if ($pb + $charBytes > $prevTermLength) {
                    // wrong character (sequence runs past the end of the previous term)
                    break;
                }

                $pc++;
                $pb += $charBytes;
            }
            $termValue = substr($prevTerm, 0, $pb) . $termSuffix;

            // $termFieldNum     = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $termFieldNum = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $termFieldNum |= ($nbyte & 0x7F) << $shift;
            }

            // $docFreq          = $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $docFreq = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $docFreq |= ($nbyte & 0x7F) << $shift;
            }

            // $freqPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $freqPointer += $vint;

            // $proxPointer     += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $proxPointer += $vint;

            // The skip delta is only stored when docFreq reaches the skip interval
            if( $docFreq >= $skipInterval ) {
                // $skipDelta = $tiiFile->readVInt();
                $nbyte = ord($data[$pos++]);
                $vint = $nbyte & 0x7F;
                for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                    $nbyte = ord($data[$pos++]);
                    $vint |= ($nbyte & 0x7F) << $shift;
                }
                $skipDelta = $vint;
            } else {
                $skipDelta = 0;
            }

            // $indexPointer    += $tiiFile->readVInt();
            $nbyte = ord($data[$pos++]);
            $vint = $nbyte & 0x7F;
            for ($shift=7; ($nbyte & 0x80) != 0; $shift += 7) {
                $nbyte = ord($data[$pos++]);
                $vint |= ($nbyte & 0x7F) << $shift;
            }
            $indexPointer += $vint;

            if ($pos > $dataLength) {
                // The entry ran past the end of the data: the file is truncated
                // or the declared term count is wrong.
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
            }

            // $this->_termDictionary[] = new Zend_Search_Lucene_Index_Term($termValue, $termFieldNum);
            $termDictionary[] = array($termFieldNum, $termValue);

            $termInfos[] =
                 // new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);
                 array($docFreq, $freqPointer, $proxPointer, $skipDelta, $indexPointer);

            $prevTerm       = $termValue;
            $prevTermLength = strlen($termValue);
        }

        // Check special index entry mark
        if ($termDictionary[0][0] != (int)0xFFFFFFFF) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong TermInfoIndexFile file format');
        }

        if (PHP_INT_SIZE > 4) {
            // Treat 64-bit 0xFFFFFFFF as -1
            $termDictionary[0][0] = -1;
        }

        return array($termDictionary, $termInfos);
    }
}
|
268 |