|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Index |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: SegmentInfo.php 22987 2010-09-21 10:39:53Z alexander $ |
|
21 */ |
|
22 |
|
23 /** Zend_Search_Lucene_Index_TermsStream_Interface */ |
|
24 require_once 'Zend/Search/Lucene/Index/TermsStream/Interface.php'; |
|
25 |
|
26 |
|
27 /** Zend_Search_Lucene_Search_Similarity */ |
|
28 require_once 'Zend/Search/Lucene/Search/Similarity.php'; |
|
29 |
|
30 /** Zend_Search_Lucene_Index_FieldInfo */ |
|
31 require_once 'Zend/Search/Lucene/Index/FieldInfo.php'; |
|
32 |
|
33 /** Zend_Search_Lucene_Index_Term */ |
|
34 require_once 'Zend/Search/Lucene/Index/Term.php'; |
|
35 |
|
36 /** Zend_Search_Lucene_Index_TermInfo */ |
|
37 require_once 'Zend/Search/Lucene/Index/TermInfo.php'; |
|
38 |
|
39 /** |
|
40 * @category Zend |
|
41 * @package Zend_Search_Lucene |
|
42 * @subpackage Index |
|
43 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
44 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
45 */ |
|
46 class Zend_Search_Lucene_Index_SegmentInfo implements Zend_Search_Lucene_Index_TermsStream_Interface |
|
47 { |
|
48 /** |
|
49 * "Full scan vs fetch" boundary. |
|
50 * |
|
51 * If filter selectivity is less than this value, then full scan is performed |
|
52 * (since term entries fetching has some additional overhead). |
|
53 */ |
|
54 const FULL_SCAN_VS_FETCH_BOUNDARY = 5; |
|
55 |
|
56 /** |
|
57 * Number of docs in a segment |
|
58 * |
|
59 * @var integer |
|
60 */ |
|
61 private $_docCount; |
|
62 |
|
63 /** |
|
64 * Segment name |
|
65 * |
|
66 * @var string |
|
67 */ |
|
68 private $_name; |
|
69 |
|
70 /** |
|
71 * Term Dictionary Index |
|
72 * |
|
73 * Array of arrays (Zend_Search_Lucene_Index_Term objects are represented as arrays because |
|
74 * of performance considerations) |
|
75 * [0] -> $termFieldNum |
|
76 * [1] -> $termValue |
|
77 * |
|
78 * Corresponding Zend_Search_Lucene_Index_TermInfo object stored in the $_termDictionaryInfos |
|
79 * |
|
80 * @var array |
|
81 */ |
|
82 private $_termDictionary; |
|
83 |
|
84 /** |
|
85 * Term Dictionary Index TermInfos |
|
86 * |
|
87 * Array of arrays (Zend_Search_Lucene_Index_TermInfo objects are represented as arrays because |
|
88 * of performance considerations) |
|
89 * [0] -> $docFreq |
|
90 * [1] -> $freqPointer |
|
91 * [2] -> $proxPointer |
|
92 * [3] -> $skipOffset |
|
93 * [4] -> $indexPointer |
|
94 * |
|
95 * @var array |
|
96 */ |
|
97 private $_termDictionaryInfos; |
|
98 |
|
99 /** |
|
100 * Segment fields. Array of Zend_Search_Lucene_Index_FieldInfo objects for this segment |
|
101 * |
|
102 * @var array |
|
103 */ |
|
104 private $_fields; |
|
105 |
|
106 /** |
|
107 * Field positions in a dictionary. |
|
108 * (Term dictionary contains fields ordered by names) |
|
109 * |
|
110 * @var array |
|
111 */ |
|
112 private $_fieldsDicPositions; |
|
113 |
|
114 |
|
115 /** |
|
116 * Associative array where the key is the file name and the value is data offset |
|
117 * in a compound segment file (.cfs). |
|
118 * |
|
119 * @var array |
|
120 */ |
|
121 private $_segFiles; |
|
122 |
|
123 /** |
|
124 * Associative array where the key is the file name and the value is file size (.cfs). |
|
125 * |
|
126 * @var array |
|
127 */ |
|
128 private $_segFileSizes; |
|
129 |
|
130 /** |
|
131 * Delete file generation number |
|
132 * |
|
133 * -2 means autodetect latest delete generation |
|
134 * -1 means 'there is no delete file' |
|
135 * 0 means pre-2.1 format delete file |
|
136 * X specifies used delete file |
|
137 * |
|
138 * @var integer |
|
139 */ |
|
140 private $_delGen; |
|
141 |
|
142 /** |
|
143 * Segment has single norms file |
|
144 * |
|
145 * If true then one .nrm file is used for all fields |
|
146 * Otherwise .fN files are used |
|
147 * |
|
148 * @var boolean |
|
149 */ |
|
150 private $_hasSingleNormFile; |
|
151 |
|
152 /** |
|
153 * Use compound segment file (*.cfs) to collect all other segment files |
|
154 * (excluding .del files) |
|
155 * |
|
156 * @var boolean |
|
157 */ |
|
158 private $_isCompound; |
|
159 |
|
160 |
|
161 /** |
|
162 * File system adapter. |
|
163 * |
|
164 * @var Zend_Search_Lucene_Storage_Directory_Filesystem |
|
165 */ |
|
166 private $_directory; |
|
167 |
|
168 /** |
|
169 * Normalization factors. |
|
170 * An array fieldName => normVector |
|
171 * normVector is a binary string. |
|
172 * Each byte corresponds to an indexed document in a segment and |
|
173 * encodes normalization factor (float value, encoded by |
|
174 * Zend_Search_Lucene_Search_Similarity::encodeNorm()) |
|
175 * |
|
176 * @var array |
|
177 */ |
|
178 private $_norms = array(); |
|
179 |
|
180 /** |
|
181 * List of deleted documents. |
|
182 * bitset if bitset extension is loaded or array otherwise. |
|
183 * |
|
184 * @var mixed |
|
185 */ |
|
186 private $_deleted = null; |
|
187 |
|
188 /** |
|
189 * $this->_deleted update flag |
|
190 * |
|
191 * @var boolean |
|
192 */ |
|
193 private $_deletedDirty = false; |
|
194 |
|
195 /** |
|
196 * True if segment uses shared doc store |
|
197 * |
|
198 * @var boolean |
|
199 */ |
|
200 private $_usesSharedDocStore; |
|
201 |
|
202 /* |
|
203 * Shared doc store options. |
|
204 * It's an associative array with the following items: |
|
205 * - 'offset' => $docStoreOffset The starting document in the shared doc store files where this segment's documents begin |
|
206 * - 'segment' => $docStoreSegment The name of the segment that has the shared doc store files. |
|
207 * - 'isCompound' => $docStoreIsCompoundFile True, if compound file format is used for the shared doc store files (.cfx file). |
|
208 */ |
|
209 private $_sharedDocStoreOptions; |
|
210 |
|
211 |
|
/**
 * Zend_Search_Lucene_Index_SegmentInfo constructor
 *
 * Stores the segment parameters, reads the file tables of the shared doc
 * store compound file (.cfx) and of the segment compound file (.cfs) when
 * they are used, loads field infos from the '.fnm' file and finally loads
 * the deletions file.
 *
 * @param Zend_Search_Lucene_Storage_Directory $directory
 * @param string     $name
 * @param integer    $docCount
 * @param integer    $delGen             -2 requests autodetection of the latest delete generation
 * @param array|null $docStoreOptions    shared doc store options ('offset', 'segment', 'isCompound') or null
 * @param boolean    $hasSingleNormFile
 * @param boolean    $isCompound         null means 'unknown': presence of the '.cfs' file is probed
 * @throws Zend_Search_Lucene_Exception
 */
public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name, $docCount, $delGen = 0, $docStoreOptions = null, $hasSingleNormFile = false, $isCompound = null)
{
    $this->_directory = $directory;
    $this->_name      = $name;
    $this->_docCount  = $docCount;

    $this->_hasSingleNormFile = $hasSingleNormFile;
    $this->_delGen            = $delGen;
    $this->_termDictionary    = null;

    if ($docStoreOptions !== null) {
        $this->_usesSharedDocStore    = true;
        $this->_sharedDocStoreOptions = $docStoreOptions;

        if ($docStoreOptions['isCompound']) {
            // Read the file table of the shared doc store compound file
            $cfxName    = $docStoreOptions['segment'] . '.cfx';
            $cfxFile    = $this->_directory->getFileObject($cfxName);
            $entryCount = $cfxFile->readVInt();

            $offsets = array();
            $sizes   = array();

            for ($entry = 0; $entry < $entryCount; $entry++) {
                $entryOffset = $cfxFile->readLong();
                if ($entry > 0) {
                    // Size of the previous entry is the distance between the two offsets
                    $sizes[$entryName] = $entryOffset - end($offsets);
                }
                $entryName           = $cfxFile->readString();
                $offsets[$entryName] = $entryOffset;
            }
            if ($entry > 0) {
                // The last entry extends up to the end of the compound file
                $sizes[$entryName] = $this->_directory->fileLength($cfxName) - $entryOffset;
            }

            $this->_sharedDocStoreOptions['files']     = $offsets;
            $this->_sharedDocStoreOptions['fileSizes'] = $sizes;
        }
    }

    if ($isCompound === null) {
        // It's a pre-2.1 segment or isCompound is set to 'unknown'.
        // Detect compound file usage by trying to open the '.cfs' file.
        require_once 'Zend/Search/Lucene/Exception.php';
        try {
            $this->_directory->getFileObject($name . '.cfs');

            // Compound file is found
            $this->_isCompound = true;
        } catch (Zend_Search_Lucene_Exception $e) {
            if (strpos($e->getMessage(), 'is not readable') === false) {
                // Any other failure is passed to the caller
                throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
            }

            // Compound file is not found or is not readable
            $this->_isCompound = false;
        }
    } else {
        $this->_isCompound = $isCompound;
    }

    $this->_segFiles = array();
    if ($this->_isCompound) {
        // Read the file table of the segment compound file
        $cfsName    = $name . '.cfs';
        $cfsFile    = $this->_directory->getFileObject($cfsName);
        $entryCount = $cfsFile->readVInt();

        for ($entry = 0; $entry < $entryCount; $entry++) {
            $entryOffset = $cfsFile->readLong();
            if ($entry > 0) {
                $this->_segFileSizes[$entryName] = $entryOffset - end($this->_segFiles);
            }
            $entryName                   = $cfsFile->readString();
            $this->_segFiles[$entryName] = $entryOffset;
        }
        if ($entry > 0) {
            $this->_segFileSizes[$entryName] = $this->_directory->fileLength($cfsName) - $entryOffset;
        }
    }

    // Load field infos
    $fnmFile       = $this->openCompoundFile('.fnm');
    $fieldTotal    = $fnmFile->readVInt();
    $sortedNames   = array();
    $sortedNumbers = array();
    $this->_fields = array();

    for ($number = 0; $number < $fieldTotal; $number++) {
        $fieldName    = $fnmFile->readString();
        $flags        = $fnmFile->readByte();
        $normsOmitted = $flags & 0x10;

        $this->_fields[$number] = new Zend_Search_Lucene_Index_FieldInfo(
            $fieldName,
            $flags & 0x01 /* field is indexed */,
            $number,
            $flags & 0x02 /* termvectors are stored */,
            $normsOmitted /* norms are omitted */,
            $flags & 0x20 /* payloads are stored */
        );

        if ($normsOmitted) {
            // Norms are omitted for the field: use the encoded norm of 1.0 for every document
            $this->_norms[$number] = str_repeat(chr(Zend_Search_Lucene_Search_Similarity::encodeNorm(1.0)), $docCount);
        }

        $sortedNumbers[$number] = $number;
        $sortedNames[$number]   = $fieldName;
    }

    // Field numbers reordered by field name give positions within the terms dictionary
    array_multisort($sortedNames, SORT_ASC, SORT_REGULAR, $sortedNumbers);
    $this->_fieldsDicPositions = array_flip($sortedNumbers);

    if ($this->_delGen == -2) {
        // SegmentInfo constructor is invoked from index writer
        // Autodetect current delete file generation number
        $this->_delGen = $this->_detectLatestDelGen();
    }

    // Load deletions
    $this->_deleted = $this->_loadDelFile();
}
|
337 |
|
/**
 * Load deletions file
 *
 * Dispatches on the delete file generation number:
 *  -1 - there is no delete file, null is returned;
 *   0 - pre-2.1 format delete file;
 *   otherwise - 2.1+ format delete file.
 *
 * Returns bitset or an array depending on bitset extension availability
 *
 * @return mixed
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDelFile()
{
    if ($this->_delGen == -1) {
        // There is no delete file for this segment
        return null;
    }

    if ($this->_delGen == 0) {
        // It's a segment with pre-2.1 format delete file
        return $this->_loadPre21DelFile();
    }

    // It's 2.1+ format deletions file
    return $this->_load21DelFile();
}
|
360 |
|
/**
 * Load pre-2.1 deletions file
 *
 * Returns bitset (binary string) if the bitset extension is loaded, or an
 * array keyed by deleted document number otherwise. If the '.del' file is
 * not readable, the delete generation is reset to -1 and null is returned.
 *
 * @return mixed
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadPre21DelFile()
{
    require_once 'Zend/Search/Lucene/Exception.php';
    try {
        // '.del' files always stored in a separate file
        // Segment compound is not used
        $delFile = $this->_directory->getFileObject($this->_name . '.del');

        // First header value is converted into the number of bytes to read
        $byteCount = ceil($delFile->readInt()/8);
        $bitCount  = $delFile->readInt();

        if ($bitCount == 0) {
            $delBytes = '';
        } else {
            $delBytes = $delFile->readBytes($byteCount);
        }

        if (extension_loaded('bitset')) {
            return $delBytes;
        }

        // Iterate over the bytes actually loaded: when $bitCount is 0 nothing is read,
        // so looping up to $byteCount would access uninitialized string offsets.
        $deletions   = array();
        $loadedBytes = strlen($delBytes);
        for ($count = 0; $count < $loadedBytes; $count++) {
            $byte = ord($delBytes[$count]);
            for ($bit = 0; $bit < 8; $bit++) {
                if ($byte & (1<<$bit)) {
                    $deletions[$count*8 + $bit] = 1;
                }
            }
        }

        return $deletions;
    } catch(Zend_Search_Lucene_Exception $e) {
        if (strpos($e->getMessage(), 'is not readable') === false) {
            throw new Zend_Search_Lucene_Exception($e->getMessage(), $e->getCode(), $e);
        }
        // There is no deletion file
        $this->_delGen = -1;

        return null;
    }
}
|
412 |
|
/**
 * Load 2.1+ format deletions file
 *
 * Two layouts are handled, selected by the first integer of the file:
 *  - 0xFFFFFFFF marker: a list of (gap, non-zero byte) pairs which is read
 *    up to the end of the file;
 *  - any other value: a plain bit vector, the value being used to compute
 *    the number of bytes to read.
 *
 * Returns bitset if the bitset extension is loaded. Otherwise returns an
 * array keyed by deleted document number, or null if nothing is deleted.
 *
 * @return mixed
 */
private function _load21DelFile()
{
    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->getFileObject($delFileName);
    $useBitset   = extension_loaded('bitset');

    $format = $delFile->readInt();

    if ($format == (int)0xFFFFFFFF) {
        $deletions = $useBitset ? bitset_empty() : array();

        // Two header integers follow the marker. Their values are not used,
        // but they have to be consumed to reach the first pair.
        $delFile->readInt();
        $delFile->readInt();

        $delFileSize = $this->_directory->fileLength($delFileName);
        $byteNum     = 0;

        // Process ALL pairs. The result must only be returned after the whole
        // file is read, otherwise every deletion beyond the first byte is lost.
        while ($delFile->tell() < $delFileSize) {
            $dgap        = $delFile->readVInt();
            $nonZeroByte = $delFile->readByte();

            $byteNum += $dgap;

            for ($bit = 0; $bit < 8; $bit++) {
                if ($nonZeroByte & (1<<$bit)) {
                    if ($useBitset) {
                        bitset_incl($deletions, $byteNum*8 + $bit);
                    } else {
                        $deletions[$byteNum*8 + $bit] = 1;
                    }
                }
            }
        }

        if ($useBitset) {
            return $deletions;
        }

        return (count($deletions) > 0) ? $deletions : null;
    }

    // $format is actually byte count
    $byteCount = ceil($format/8);
    $bitCount  = $delFile->readInt();

    if ($bitCount == 0) {
        $delBytes = '';
    } else {
        $delBytes = $delFile->readBytes($byteCount);
    }

    if ($useBitset) {
        return $delBytes;
    }

    // Iterate over the bytes actually loaded: when $bitCount is 0 nothing is read,
    // so looping up to $byteCount would access uninitialized string offsets.
    $deletions   = array();
    $loadedBytes = strlen($delBytes);
    for ($count = 0; $count < $loadedBytes; $count++) {
        $byte = ord($delBytes[$count]);
        for ($bit = 0; $bit < 8; $bit++) {
            if ($byte & (1<<$bit)) {
                $deletions[$count*8 + $bit] = 1;
            }
        }
    }

    return (count($deletions) > 0) ? $deletions : null;
}
|
491 |
|
/**
 * Opens index file stored within compound index file
 *
 * '.fdx' and '.fdt' requests are redirected to the shared doc store
 * (separate files or the '.cfx' compound file) if the segment uses one;
 * the returned file is positioned at the data of the current segment.
 * Any other file is taken from the directory or from the segment '.cfs'
 * compound file, positioned at the start of the requested file.
 *
 * @param string $extension
 * @param boolean $shareHandler  passed through to the directory's getFileObject()
 * @throws Zend_Search_Lucene_Exception
 * @return Zend_Search_Lucene_Storage_File
 */
public function openCompoundFile($extension, $shareHandler = true)
{
    if (($extension == '.fdx' || $extension == '.fdt') && $this->_usesSharedDocStore) {
        $fdxFName = $this->_sharedDocStoreOptions['segment'] . '.fdx';
        $fdtFName = $this->_sharedDocStoreOptions['segment'] . '.fdt';

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            $fdxFile = $this->_directory->getFileObject($fdxFName, $shareHandler);
            // Each '.fdx' entry is 8 bytes long; skip entries of preceding segments
            $fdxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

            if ($extension == '.fdx') {
                // '.fdx' file is requested
                return $fdxFile;
            } else {
                // '.fdt' file is requested
                $fdtStartOffset = $fdxFile->readLong();

                $fdtFile = $this->_directory->getFileObject($fdtFName, $shareHandler);
                $fdtFile->seek($fdtStartOffset, SEEK_CUR);

                return $fdtFile;
            }
        }

        if( !isset($this->_sharedDocStoreOptions['files'][$fdxFName]) ) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                       . $fdxFName . ' file.' );
        }
        if( !isset($this->_sharedDocStoreOptions['files'][$fdtFName]) ) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Shared doc storage segment compound file doesn\'t contain '
                                       . $fdtFName . ' file.' );
        }

        // Open shared docstore segment file
        $cfxFile = $this->_directory->getFileObject($this->_sharedDocStoreOptions['segment'] . '.cfx', $shareHandler);
        // Seek to the start of '.fdx' file within compound file
        $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdxFName]);
        // Seek to the start of current segment documents section
        $cfxFile->seek($this->_sharedDocStoreOptions['offset']*8, SEEK_CUR);

        if ($extension == '.fdx') {
            // '.fdx' file is requested
            return $cfxFile;
        } else {
            // '.fdt' file is requested
            $fdtStartOffset = $cfxFile->readLong();

            // Seek to the start of '.fdt' file within compound file
            $cfxFile->seek($this->_sharedDocStoreOptions['files'][$fdtFName]);
            // Seek to the start of current segment documents section
            $cfxFile->seek($fdtStartOffset, SEEK_CUR);

            // The '.fdt' data lives inside the compound file, so the positioned
            // compound file object is the result (there is no separate '.fdt' object here).
            return $cfxFile;
        }
    }

    $filename = $this->_name . $extension;

    if (!$this->_isCompound) {
        return $this->_directory->getFileObject($filename, $shareHandler);
    }

    if( !isset($this->_segFiles[$filename]) ) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Segment compound file doesn\'t contain '
                                   . $filename . ' file.' );
    }

    $file = $this->_directory->getFileObject($this->_name . '.cfs', $shareHandler);
    $file->seek($this->_segFiles[$filename]);
    return $file;
}
|
574 |
|
/**
 * Get compound file length
 *
 * Returns the length of the file with the given extension, wherever it is
 * stored: in the shared doc store ('.fdx'/'.fdt' only), as a separate file
 * of the segment, or inside the segment compound file.
 *
 * @param string $extension
 * @return integer
 * @throws Zend_Search_Lucene_Exception
 */
public function compoundFileLength($extension)
{
    $isDocStoreFile = ($extension == '.fdx' || $extension == '.fdt');

    if ($isDocStoreFile && $this->_usesSharedDocStore) {
        $sharedName = $this->_sharedDocStoreOptions['segment'] . $extension;

        if (!$this->_sharedDocStoreOptions['isCompound']) {
            // Shared doc store files are stored separately
            return $this->_directory->fileLength($sharedName);
        }

        if (isset($this->_sharedDocStoreOptions['fileSizes'][$sharedName])) {
            return $this->_sharedDocStoreOptions['fileSizes'][$sharedName];
        }

        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Shared doc store compound file doesn\'t contain '
                                   . $sharedName . ' file.' );
    }

    $segmentFile = $this->_name . $extension;

    // A separate file takes precedence over the compound file entry
    if ($this->_directory->fileExists($segmentFile)) {
        return $this->_directory->fileLength($segmentFile);
    }

    if (isset($this->_segFileSizes[$segmentFile])) {
        return $this->_segFileSizes[$segmentFile];
    }

    require_once 'Zend/Search/Lucene/Exception.php';
    throw new Zend_Search_Lucene_Exception('Index compound file doesn\'t contain '
                               . $segmentFile . ' file.' );
}
|
615 |
|
/**
 * Looks up a field number by field name
 *
 * @param string $fieldName
 * @return integer  field number, or -1 if the segment has no such field
 */
public function getFieldNum($fieldName)
{
    foreach ($this->_fields as $fieldInfo) {
        if ($fieldInfo->name != $fieldName) {
            continue;
        }

        return $fieldInfo->number;
    }

    // Field is not found
    return -1;
}
|
632 |
|
/**
 * Returns the field info object stored under the given field number
 *
 * @param integer $fieldNum
 * @return Zend_Search_Lucene_Index_FieldInfo
 */
public function getField($fieldNum)
{
    $fieldInfo = $this->_fields[$fieldNum];

    return $fieldInfo;
}
|
643 |
|
/**
 * Returns names of the segment fields (array: field name => field name).
 * If $indexed parameter is true, only indexed fields are included.
 *
 * @param boolean $indexed
 * @return array
 */
public function getFields($indexed = false)
{
    $names = array();

    foreach ($this->_fields as $fieldInfo) {
        if ($indexed && !$fieldInfo->isIndexed) {
            // Only indexed fields are requested
            continue;
        }

        $names[$fieldInfo->name] = $fieldInfo->name;
    }

    return $names;
}
|
661 |
|
/**
 * Returns all segment fields as an array of FieldInfo objects
 * (indexed by field number).
 *
 * @return array
 */
public function getFieldInfos()
{
    $fieldInfos = $this->_fields;

    return $fieldInfos;
}
|
671 |
|
/**
 * Returns the deletions file generation number currently used by the segment.
 *
 * @return integer
 */
public function getDelGen()
{
    $generation = $this->_delGen;

    return $generation;
}
|
681 |
|
/**
 * Returns the number of documents stored in this segment.
 * Deleted documents are counted too.
 *
 * @return integer
 */
public function count()
{
    $total = $this->_docCount;

    return $total;
}
|
691 |
|
/**
 * Counts documents marked as deleted.
 *
 * @return integer  0 if no deletions are loaded
 */
private function _deletedCount()
{
    if ($this->_deleted === null) {
        return 0;
    }

    // Deletions are kept as a bitset if the bitset extension is loaded, as an array otherwise
    $deletedDocs = extension_loaded('bitset')
                 ? bitset_to_array($this->_deleted)
                 : $this->_deleted;

    return count($deletedDocs);
}
|
709 |
|
/**
 * Returns the number of documents in this segment which are not deleted.
 *
 * @return integer
 */
public function numDocs()
{
    if (!$this->hasDeletions()) {
        return $this->_docCount;
    }

    return $this->_docCount - $this->_deletedCount();
}
|
723 |
|
/**
 * Translates a field number into the field position in a fields dictionary
 *
 * @param integer $fieldNum
 * @return integer
 */
private function _getFieldPosition($fieldNum) {
    if (isset($this->_fieldsDicPositions[$fieldNum])) {
        return $this->_fieldsDicPositions[$fieldNum];
    }

    // Values which are not in the translation table are treated as a 'direct value'
    return $fieldNum;
}
|
735 |
|
/**
 * Returns the name of this segment
 *
 * @return string
 */
public function getName()
{
    $segmentName = $this->_name;

    return $segmentName;
}
|
745 |
|
746 |
|
747 /** |
|
748 * TermInfo cache |
|
749 * |
|
750 * Size is 1024. |
|
751 * Numbers are used instead of class constants because of performance considerations |
|
752 * |
|
753 * @var array |
|
754 */ |
|
755 private $_termInfoCache = array(); |
|
756 |
|
/**
 * Shrinks the TermInfo cache.
 *
 * Entries are removed starting from the beginning of the cache array
 * until exactly 768 entries are left (or the cache is exhausted).
 */
private function _cleanUpTermInfoCache()
{
    // Snapshot of the keys, oldest positions first
    $cachedKeys = array_keys($this->_termInfoCache);

    foreach ($cachedKeys as $cachedKey) {
        unset($this->_termInfoCache[$cachedKey]);

        // leave 768 last used term infos
        if (count($this->_termInfoCache) == 768) {
            return;
        }
    }
}
|
769 |
|
/**
 * Load terms dictionary index
 *
 * The serialized copy of the dictionary index ('.sti' file) is used if it
 * exists and contains the expected structure. Otherwise the index is built
 * from the '.tii' file and the '.sti' file is (re)generated.
 *
 * @throws Zend_Search_Lucene_Exception
 */
private function _loadDictionaryIndex()
{
    $stiFileName = $this->_name . '.sti';

    // Check, if index is already serialized
    if ($this->_directory->fileExists($stiFileName)) {
        // Load serialized dictionary index data
        $stiFile     = $this->_directory->getFileObject($stiFileName);
        $stiFileData = $stiFile->readBytes($this->_directory->fileLength($stiFileName));

        // NOTE(review): unserialize() is applied to file content; this is only safe
        // as long as the index directory cannot be written by untrusted parties.
        $unserializedData = @unserialize($stiFileData);

        // Accept only the structure written below: array(termDictionary, termDictionaryInfos).
        // Anything else (damaged or foreign file) would silently leave the dictionary
        // unusable, so fall through and rebuild it from the '.tii' file instead.
        if (is_array($unserializedData)
            && isset($unserializedData[0], $unserializedData[1])
            && is_array($unserializedData[0])
            && is_array($unserializedData[1])) {
            list($this->_termDictionary, $this->_termDictionaryInfos) = $unserializedData;
            return;
        }
    }

    // Load data from .tii file and generate .sti file

    // Prefetch dictionary index data
    $tiiFile     = $this->openCompoundFile('.tii');
    $tiiFileData = $tiiFile->readBytes($this->compoundFileLength('.tii'));

    /** Zend_Search_Lucene_Index_DictionaryLoader */
    require_once 'Zend/Search/Lucene/Index/DictionaryLoader.php';

    // Load dictionary index data
    list($this->_termDictionary, $this->_termDictionaryInfos) =
                Zend_Search_Lucene_Index_DictionaryLoader::load($tiiFileData);

    $stiFileData = serialize(array($this->_termDictionary, $this->_termDictionaryInfos));
    $stiFile     = $this->_directory->createFile($stiFileName);
    $stiFile->writeBytes($stiFileData);
}
|
807 |
|
/**
 * Scans terms dictionary and returns term info
 *
 * Lookup order:
 *  1. TermInfo cache;
 *  2. binary search within the in-memory dictionary index;
 *  3. sequential scan of the '.tis' file starting from the nearest
 *     preceding dictionary index entry.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @return Zend_Search_Lucene_Index_TermInfo|null  null if the segment doesn't contain the term
 * @throws Zend_Search_Lucene_Exception
 */
public function getTermInfo(Zend_Search_Lucene_Index_Term $term)
{
    $termKey = $term->key();

    // array_key_exists() is used instead of isset() so that cached
    // 'term is not found' results (null values) are served from the cache too.
    if (array_key_exists($termKey, $this->_termInfoCache)) {
        $termInfo = $this->_termInfoCache[$termKey];

        // Move termInfo to the end of cache
        unset($this->_termInfoCache[$termKey]);
        $this->_termInfoCache[$termKey] = $termInfo;

        return $termInfo;
    }


    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($term->field);

    if ($searchField == -1) {
        return null;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // search for appropriate value in dictionary
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        // Terms are ordered by field dictionary position first, then by text
        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($term->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We got it!
            $a        = $this->_termDictionaryInfos[$mid];
            $termInfo = new Zend_Search_Lucene_Index_TermInfo($a[0], $a[1], $a[2], $a[3], $a[4]);

            // Put loaded termInfo into cache and keep the cache bounded
            // (the limit has to be enforced on this path as well)
            $this->_termInfoCache[$termKey] = $termInfo;
            if (count($this->_termInfoCache) >= 1024) {
                $this->_cleanUpTermInfoCache();
            }

            return $termInfo;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        return null;
    }

    // $highIndex now refers to the last index entry which precedes the term
    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    $tisFile   = $this->openCompoundFile('.tis');
    $tiVersion = $tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $termCount     = $tisFile->readLong();
    $indexInterval = $tisFile->readInt();
    $skipInterval  = $tisFile->readInt();
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // maxSkipLevels: the value is not used, but it has to be consumed
        $tisFile->readInt();
    }

    // The header is already consumed, so its size is subtracted from the relative seek
    $tisFile->seek($prevTermInfo[4] /* indexPointer */ - (($tiVersion == (int)0xFFFFFFFD)? 24 : 20) /* header size*/, SEEK_CUR);

    $termValue    = $prevTerm[1] /* text */;
    $termFieldNum = $prevTerm[0] /* field */;
    $freqPointer  = $prevTermInfo[1] /* freqPointer */;
    $proxPointer  = $prevTermInfo[2] /* proxPointer */;
    for ($count = $prevPosition*$indexInterval + 1;
         $count <= $termCount &&
         ( $this->_getFieldPosition($termFieldNum) < $searchDicField ||
          ($this->_getFieldPosition($termFieldNum) == $searchDicField &&
           strcmp($termValue, $term->text) < 0) );
         $count++) {
        $termPrefixLength = $tisFile->readVInt();
        $termSuffix       = $tisFile->readString();
        $termFieldNum     = $tisFile->readVInt();
        $termValue        = Zend_Search_Lucene_Index_Term::getPrefix($termValue, $termPrefixLength) . $termSuffix;

        $docFreq      = $tisFile->readVInt();
        $freqPointer += $tisFile->readVInt();
        $proxPointer += $tisFile->readVInt();
        if( $docFreq >= $skipInterval ) {
            $skipOffset = $tisFile->readVInt();
        } else {
            $skipOffset = 0;
        }
    }

    // Byte-wise comparison (same as used for ordering above). Loose '==' would treat
    // numeric-looking terms such as '1.0' and '1.00' as equal and return a wrong term info.
    if ($termFieldNum == $searchField && strcmp($termValue, $term->text) == 0) {
        $termInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);
    } else {
        $termInfo = null;
    }

    // Put loaded termInfo into cache
    $this->_termInfoCache[$termKey] = $termInfo;

    // '>=' (not '==') so that clean up is still triggered if the limit was overstepped
    if (count($this->_termInfoCache) >= 1024) {
        $this->_cleanUpTermInfoCache();
    }

    return $termInfo;
}
|
935 |
|
/**
 * Returns IDs of all the documents containing term.
 *
 * Document ids are decoded from the segment's '.frq' data, starting at the term's
 * freqPointer, and are returned with $shift added to each of them.
 *
 * If a documents filter is passed, it is used and updated at the same time:
 *  - if the filter already has data for this segment, only documents present
 *    in that data are returned;
 *  - afterwards the filter data for this segment is replaced with the set of
 *    (segment-local, unshifted) ids of the returned documents.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift  value added to each segment-local document id in the result
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array  list of shifted document ids (empty if the term is not in this segment)
 * @throws Zend_Search_Lucene_Exception  if $docsFilter is neither null nor a documents filter
 */
public function termDocs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        // Term is not present in this segment, so no document of the segment may pass the filter
        if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    // Validate the filter and pick up the data it already holds for this segment
    // before any file is touched
    $segmentFilter = null;
    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            $segmentFilter = $docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($segmentFilter) == 0) {
                return array();
            }
        }
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    $docId  = 0;
    $result = array();

    if ($docsFilter === null) {
        // No filter: every document containing the term is returned
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd value: no separate freq entry follows
                $docId += ($docDelta - 1) / 2;
            } else {
                $docId += $docDelta / 2;
                // Freq value follows; read it only to move to the next entry
                $frqFile->readVInt();
            }

            $result[] = $shift + $docId;
        }

        return $result;
    }

    // Filter is present: collect matched documents and store them back into the filter
    $matchedDocs = array();
    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            $docId += ($docDelta - 1) / 2;
        } else {
            $docId += $docDelta / 2;
            // Freq value follows; read it only to move to the next entry
            $frqFile->readVInt();
        }

        // If the filter has no data for this segment yet, each document matches
        if ($segmentFilter === null || isset($segmentFilter[$docId])) {
            $result[] = $shift + $docId;
            $matchedDocs[$docId] = 1; // only keys matter, 1 is just a placeholder value
        }
    }
    $docsFilter->segmentFilters[$this->_name] = $matchedDocs;

    return $result;
}
|
1055 |
|
/**
 * Returns term freqs array.
 * Result array structure: array(docId => freq, ...)
 *
 * Keys of the result are segment-local document ids with $shift added.
 *
 * If a documents filter is passed, it is used and updated at the same time:
 *  - if the filter already has data for this segment, only documents present
 *    in that data are returned;
 *  - afterwards the filter data for this segment is replaced with the set of
 *    (segment-local, unshifted) ids of the returned documents.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift  value added to each segment-local document id used as a result key
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array  array(docId => freq, ...), empty if the term is not in this segment
 * @throws Zend_Search_Lucene_Exception  if $docsFilter is neither null nor a documents filter
 */
public function termFreqs(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        // Term is not present in this segment, so no document of the segment may pass the filter
        if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    // Validate the filter and pick up the data it already holds for this segment
    // before any file is touched
    $segmentFilter = null;
    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            $segmentFilter = $docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($segmentFilter) == 0) {
                return array();
            }
        }
    }

    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    $docId  = 0;
    $result = array();

    if ($docsFilter === null) {
        // No filter: every document containing the term is returned
        for ($count = 0; $count < $termInfo->docFreq; $count++) {
            $docDelta = $frqFile->readVInt();
            if ($docDelta % 2 == 1) {
                // Odd value: freq is 1 and is not stored separately
                $docId += ($docDelta - 1) / 2;
                $result[$shift + $docId] = 1;
            } else {
                // Even value: freq is stored as the next value
                $docId += $docDelta / 2;
                $result[$shift + $docId] = $frqFile->readVInt();
            }
        }

        return $result;
    }

    // Filter is present: collect matched documents and store them back into the filter
    $matchedDocs = array();
    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd value: freq is 1 and is not stored separately
            $docId += ($docDelta - 1) / 2;
            $freq   = 1;
        } else {
            // Even value: freq is stored as the next value (has to be read even if doc is filtered out)
            $docId += $docDelta / 2;
            $freq   = $frqFile->readVInt();
        }

        // If the filter has no data for this segment yet, each document matches
        if ($segmentFilter === null || isset($segmentFilter[$docId])) {
            $result[$shift + $docId] = $freq;
            $matchedDocs[$docId]     = 1; // only keys matter, 1 is just a placeholder value
        }
    }
    $docsFilter->segmentFilters[$this->_name] = $matchedDocs;

    return $result;
}
|
1182 |
|
/**
 * Returns term positions array.
 * Result array structure: array(docId => array(pos1, pos2, ...), ...)
 *
 * Keys of the result are segment-local document ids with $shift added.
 * Positions are accumulated from the deltas stored in the '.prx' data.
 *
 * If a documents filter is passed, it is used and updated at the same time:
 *  - if the filter already has data for this segment, only documents present
 *    in that data are returned;
 *  - afterwards the filter data for this segment is replaced with the set of
 *    (segment-local, unshifted) ids of the returned documents.
 *
 * @param Zend_Search_Lucene_Index_Term $term
 * @param integer $shift  value added to each segment-local document id used as a result key
 * @param Zend_Search_Lucene_Index_DocsFilter|null $docsFilter
 * @return array  array(docId => array(pos1, pos2, ...), ...), empty if the term is not in this segment
 * @throws Zend_Search_Lucene_Exception  if $docsFilter is neither null nor a documents filter
 */
public function termPositions(Zend_Search_Lucene_Index_Term $term, $shift = 0, $docsFilter = null)
{
    $termInfo = $this->getTermInfo($term);

    if (!$termInfo instanceof Zend_Search_Lucene_Index_TermInfo) {
        // Term is not present in this segment, so no document of the segment may pass the filter
        if ($docsFilter !== null && $docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            $docsFilter->segmentFilters[$this->_name] = array();
        }
        return array();
    }

    // Validate the filter and pick up the data it already holds for this segment
    // before any file is touched
    $segmentFilter = null;
    if ($docsFilter !== null) {
        if (!$docsFilter instanceof Zend_Search_Lucene_Index_DocsFilter) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Documents filter must be an instance of Zend_Search_Lucene_Index_DocsFilter or null.');
        }

        if (isset($docsFilter->segmentFilters[$this->_name])) {
            $segmentFilter = $docsFilter->segmentFilters[$this->_name];

            // An empty filter can't match anything
            if (count($segmentFilter) == 0) {
                return array();
            }
        }
    }

    // Step 1: read freqs of all the documents containing the term.
    // NOTE(review): '.frq' data is completely consumed before '.prx' is opened, exactly as the
    // original code did - presumably both may be served by the same shared file handle;
    // confirm against openCompoundFile() before interleaving the reads.
    $frqFile = $this->openCompoundFile('.frq');
    $frqFile->seek($termInfo->freqPointer, SEEK_CUR);

    $docId = 0;
    $freqs = array();
    for ($count = 0; $count < $termInfo->docFreq; $count++) {
        $docDelta = $frqFile->readVInt();
        if ($docDelta % 2 == 1) {
            // Odd value: freq is 1 and is not stored separately
            $docId += ($docDelta - 1) / 2;
            $freqs[$docId] = 1;
        } else {
            // Even value: freq is stored as the next value
            $docId += $docDelta / 2;
            $freqs[$docId] = $frqFile->readVInt();
        }
    }

    // Step 2: read positions
    $prxFile = $this->openCompoundFile('.prx');
    $prxFile->seek($termInfo->proxPointer, SEEK_CUR);

    $result      = array();
    $matchedDocs = array();
    foreach ($freqs as $docId => $freq) {
        $termPosition = 0;
        $positions    = array();

        // We have to read .prx data to get the right file position for the next doc
        // even if the filter doesn't match current document
        for ($count = 0; $count < $freq; $count++) {
            $termPosition += $prxFile->readVInt();
            $positions[]   = $termPosition;
        }

        // If there is no filter, or it has no data for this segment yet, each document matches
        if ($segmentFilter !== null && !isset($segmentFilter[$docId])) {
            continue;
        }

        $result[$shift + $docId] = $positions;
        if ($docsFilter !== null) {
            $matchedDocs[$docId] = 1; // only keys matter, 1 is just a placeholder value
        }
    }

    if ($docsFilter !== null) {
        $docsFilter->segmentFilters[$this->_name] = $matchedDocs;
    }

    return $result;
}
|
1365 |
|
/**
 * Load normalization factors from an index file into $this->_norms
 *
 * If the segment has a single norms file ('.nrm'), norms of all indexed fields
 * are loaded at once. Otherwise only the '.f<fieldNum>' file of the requested
 * field is read.
 *
 * @param integer $fieldNum
 * @throws Zend_Search_Lucene_Exception  if the '.nrm' file header is not recognized
 */
private function _loadNorm($fieldNum)
{
    if (!$this->_hasSingleNormFile) {
        // Separate norms file for the requested field
        $fieldNormsFile = $this->openCompoundFile('.f' . $fieldNum);
        $this->_norms[$fieldNum] = $fieldNormsFile->readBytes($this->_docCount);

        return;
    }

    $normsFile = $this->openCompoundFile('.nrm');

    // File starts with 'NRM' signature followed by format version byte
    $signature     = $normsFile->readBytes(3);
    $formatVersion = $normsFile->readByte();

    if (!($signature == 'NRM' && $formatVersion == (int)0xFF)) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong norms file format.');
    }

    // One byte per document is read for each indexed field, in fields order
    foreach ($this->_fields as $number => $info) {
        if (!$info->isIndexed) {
            continue;
        }

        $this->_norms[$number] = $normsFile->readBytes($this->_docCount);
    }
}
|
1395 |
|
/**
 * Returns normalization factor for specified documents
 *
 * Norms of the field are loaded on first use.
 *
 * @param integer $id  segment-local document id
 * @param string $fieldName
 * @return float|null  null if the field is not present in this segment or is not indexed
 */
public function norm($id, $fieldName)
{
    $fieldNum = $this->getFieldNum($fieldName);

    // -1 means the field is not known in this segment (same check as in normVector());
    // it has to be tested first, otherwise $this->_fields[-1] is dereferenced
    if ($fieldNum == -1 || !($this->_fields[$fieldNum]->isIndexed)) {
        return null;
    }

    if (!isset($this->_norms[$fieldNum])) {
        $this->_loadNorm($fieldNum);
    }

    // Norms are stored as a byte string, one encoded byte per document
    return Zend_Search_Lucene_Search_Similarity::decodeNorm( ord($this->_norms[$fieldNum][$id]) );
}
|
1417 |
|
/**
 * Returns norm vector, encoded in a byte string
 *
 * The string contains one byte per document of the segment. For a field which is
 * unknown in this segment or is not indexed, the vector is filled with the default
 * similarity's encoded length norm for a zero-length field.
 *
 * @param string $fieldName
 * @return string
 */
public function normVector($fieldName)
{
    $fieldNum       = $this->getFieldNum($fieldName);
    $hasStoredNorms = ($fieldNum != -1) && $this->_fields[$fieldNum]->isIndexed;

    if ($hasStoredNorms) {
        // Norms are loaded lazily
        if (!isset($this->_norms[$fieldNum])) {
            $this->_loadNorm($fieldNum);
        }

        return $this->_norms[$fieldNum];
    }

    $similarity  = Zend_Search_Lucene_Search_Similarity::getDefault();
    $defaultNorm = chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));

    return str_repeat($defaultNorm, $this->_docCount);
}
|
1441 |
|
1442 |
|
/**
 * Checks whether this index segment has deletions information,
 * i.e. whether $this->_deleted is set to something other than null.
 *
 * @return boolean
 */
public function hasDeletions()
{
    return null !== $this->_deleted;
}
|
1452 |
|
1453 |
|
/**
 * Checks whether the segment has a single norms file.
 *
 * The internal flag is converted to a strict boolean.
 *
 * @return boolean
 */
public function hasSingleNormFile()
{
    return (bool) $this->_hasSingleNormFile;
}
|
1463 |
|
/**
 * Returns true if segment is stored using compound segment file.
 *
 * The value of the internal $_isCompound flag is returned as is.
 *
 * @return boolean
 */
public function isCompound()
{
    return $this->_isCompound;
}
|
1473 |
|
/**
 * Marks a document of the index segment as deleted.
 * $id is an internal document id
 *
 * The change is made in memory only and the segment is flagged as having
 * unsaved deletions. Deletions are kept in a bitset if the 'bitset' extension
 * is loaded and in an array keyed by document id otherwise.
 *
 * @param integer $id
 */
public function delete($id)
{
    $this->_deletedDirty = true;

    if (!extension_loaded('bitset')) {
        // Array storage: array(docId => 1, ...)
        if ($this->_deleted === null) {
            $this->_deleted = array();
        }
        $this->_deleted[$id] = 1;

        return;
    }

    // Bitset storage
    if ($this->_deleted === null) {
        $this->_deleted = bitset_empty($id);
    }
    bitset_incl($this->_deleted, $id);
}
|
1497 |
|
/**
 * Checks whether a document is marked as deleted
 *
 * Deletions storage is a bitset if the 'bitset' extension is loaded
 * and an array keyed by document id otherwise.
 *
 * @param integer $id  internal document id
 * @return boolean
 */
public function isDeleted($id)
{
    // No deletions information at all
    if ($this->_deleted === null) {
        return false;
    }

    return extension_loaded('bitset')
         ? bitset_in($this->_deleted, $id)
         : isset($this->_deleted[$id]);
}
|
1516 |
|
/**
 * Detect latest delete generation
 *
 * Is actually used from writeChanges() method or from the constructor if it's invoked from
 * Index writer. In both cases index write lock is already obtained, so we shouldn't care
 * about it
 *
 * Directory file list is scanned for '<segment_name>.del' (counted as generation 0)
 * and '<segment_name>_NNN.del' files (NNN is a base-36 generation number).
 *
 * @return integer  the highest generation found or -1 if the segment has no deletions file
 */
private function _detectLatestDelGen()
{
    $plainDelFileName = $this->_name . '.del';
    // Segment name is quoted, so it's always matched literally;
    // the pattern is built once since it doesn't depend on the file being checked
    $genDelFilePattern = '/^' . preg_quote($this->_name, '/') . '_([a-zA-Z0-9]+)\.del$/i';

    $delFileList = array();
    foreach ($this->_directory->fileList() as $file) {
        if ($file == $plainDelFileName) {
            // Matches <segment_name>.del file name
            $delFileList[] = 0;
        } else if (preg_match($genDelFilePattern, $file, $matches)) {
            // Matches <segment_name>_NNN.del file names
            $delFileList[] = (int)base_convert($matches[1], 36, 10);
        }
    }

    if (count($delFileList) == 0) {
        // There is no deletions file for current segment in the directory
        // Report it using -1 as a generation number
        return -1;
    }

    // There are some deletions files for current segment in the directory
    // Take the highest generation number
    return max($delFileList);
}
|
1549 |
|
/**
 * Write changes if it's necessary.
 *
 * This method must be invoked only from the Writer _updateSegments() method,
 * so index Write lock has to be already obtained.
 *
 * If the current process has made no deletions, the deletions data is only reloaded
 * when a newer deletions file generation is found in the directory.
 * Otherwise deletions are merged with the newest deletions file (if it's newer than
 * the loaded one) and are written into a new '<segment_name>_<generation>.del' file,
 * where generation is a base-36 number.
 *
 * @internal
 * @throws Zend_Search_Lucene_Exception  if the newest generation found is lower than the loaded one
 */
public function writeChanges()
{
    // Generation of the newest deletions file in the directory
    $newestDelGen = $this->_detectLatestDelGen();

    if (!$this->_deletedDirty) {
        // Current process has nothing to save

        if ($newestDelGen == $this->_delGen) {
            // Deletions file hasn't been updated by any concurrent process
            return;
        }

        if (!($newestDelGen > $this->_delGen)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Delete file processing workflow is corrupted for the segment \'' . $this->_name . '\'.');
        }

        // Deletions file has been updated by some concurrent process, so reload it
        $this->_delGen  = $newestDelGen;
        $this->_deleted = $this->_loadDelFile();

        return;
    }

    $useBitset = extension_loaded('bitset');

    if ($newestDelGen > $this->_delGen) {
        // Combine own deletions with deletions stored in the newest file
        $this->_delGen = $newestDelGen;

        $storedDeletions = $this->_loadDelFile();

        if ($useBitset) {
            $this->_deleted = bitset_union($this->_deleted, $storedDeletions);
        } else {
            $this->_deleted += $storedDeletions;
        }
    }

    // Prepare deletions bytes and the number of deleted documents
    if ($useBitset) {
        $delBytes = $this->_deleted;
        $bitCount = count(bitset_to_array($delBytes));
    } else {
        $byteCount = floor($this->_docCount/8)+1;
        $delBytes  = str_repeat(chr(0), $byteCount);

        for ($byteNum = 0; $byteNum < $byteCount; $byteNum++) {
            // Document N is represented by bit (N % 8) of byte (N / 8)
            $firstDocId = $byteNum*8;
            $byte       = 0;
            for ($bit = 0; $bit < 8; $bit++) {
                if (isset($this->_deleted[$firstDocId + $bit])) {
                    $byte |= (1<<$bit);
                }
            }
            $delBytes[$byteNum] = chr($byte);
        }

        $bitCount = count($this->_deleted);
    }

    // Move to the next generation (the first one is 1)
    if ($this->_delGen != -1) {
        $this->_delGen++;
    } else {
        $this->_delGen = 1;
    }

    // File layout: documents count, deleted documents count, deletions bits
    $delFileName = $this->_name . '_' . base_convert($this->_delGen, 10, 36) . '.del';
    $delFile     = $this->_directory->createFile($delFileName);
    $delFile->writeInt($this->_docCount);
    $delFile->writeInt($bitCount);
    $delFile->writeBytes($delBytes);

    $this->_deletedDirty = false;
}
|
1629 |
|
1630 |
|
/**
 * Term Dictionary ('.tis') File object used for stream-like terms reading
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_tisFile = null;

/**
 * Actual offset of the .tis file data
 * (position reported by the file object right after it's opened in resetTermsStream())
 *
 * @var integer
 */
private $_tisFileOffset;

/**
 * Frequencies ('.frq') File object used for stream-like terms reading
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_frqFile = null;

/**
 * Actual offset of the .frq file data
 * (position reported by the file object right after it's opened in resetTermsStream())
 *
 * @var integer
 */
private $_frqFileOffset;

/**
 * Positions ('.prx') File object used for stream-like terms reading
 *
 * @var Zend_Search_Lucene_Storage_File
 */
private $_prxFile = null;

/**
 * Actual offset of the .prx file in the compound file
 * (position reported by the file object right after it's opened in resetTermsStream())
 *
 * @var integer
 */
private $_prxFileOffset;


/**
 * Actual number of terms in term stream
 *
 * Is initialized together with $_termNum in resetTermsStream()
 * and is recalculated by skipTo() from the reached dictionary position.
 *
 * @var integer
 */
private $_termCount = 0;

/**
 * Overall number of terms in term stream
 * (terms count read from the '.tis' file header)
 *
 * @var integer
 */
private $_termNum = 0;

/**
 * Segment index interval (read from the '.tis' file header)
 *
 * @var integer
 */
private $_indexInterval;

/**
 * Segment skip interval (read from the '.tis' file header)
 *
 * @var integer
 */
private $_skipInterval;

/**
 * Last TermInfo in a terms stream
 *
 * @var Zend_Search_Lucene_Index_TermInfo
 */
private $_lastTermInfo = null;

/**
 * Last Term in a terms stream
 *
 * @var Zend_Search_Lucene_Index_Term
 */
private $_lastTerm = null;

/**
 * Map of the document IDs
 * Used to get new docID after removing deleted documents.
 * It's not very efficient in terms of memory usage,
 * but it's much faster than other methods
 *
 * @var array|null
 */
private $_docMap = null;

/**
 * An array of all term positions in the documents.
 * Array structure: array( docId => array( pos1, pos2, ...), ...)
 *
 * Is set to null if term positions loading has to be skipped
 *
 * @var array|null
 */
private $_lastTermPositions;


/**
 * Terms scan mode
 *
 * Values:
 *
 * self::SM_TERMS_ONLY - terms are scanned, no additional info is retrieved
 * self::SM_FULL_INFO  - terms are scanned, frequency and position info is retrieved
 * self::SM_MERGE_INFO - terms are scanned, frequency and position info is retrieved
 *                       document numbers are compacted (shifted if segment has deleted documents)
 *
 * @var integer
 */
private $_termsScanMode;

/** Scan modes */
const SM_TERMS_ONLY = 0;    // terms are scanned, no additional info is retrieved
const SM_FULL_INFO  = 1;    // terms are scanned, frequency and position info is retrieved
const SM_MERGE_INFO = 2;    // terms are scanned, frequency and position info is retrieved
                            // document numbers are compacted (shifted if segment contains deleted documents)
|
1756 |
|
/**
 * Reset terms stream
 *
 * $startId - id for the first document
 * $mode    - terms scan mode (one of the self::SM_* constants):
 *            self::SM_TERMS_ONLY - only the terms dictionary is opened
 *            self::SM_FULL_INFO  - '.frq' and '.prx' files are opened too; not deleted documents
 *                                  are mapped to $startId + <segment document id>
 *            self::SM_MERGE_INFO - same as SM_FULL_INFO, but not deleted documents are numbered
 *                                  consecutively starting from $startId (deleted ones are removed)
 *
 * The stream is positioned at its first term (nextTerm() is invoked).
 *
 * Returns start document id for the next segment:
 * $startId + number of not deleted documents for the SM_MERGE_INFO mode,
 * $startId + segment documents count otherwise
 *
 * Parameters are not declared in the method signature and are taken using func_get_args().
 *
 * @param integer $startId
 * @param integer $mode
 * @throws Zend_Search_Lucene_Exception  if more than two arguments are passed, terms dictionary file
 *                                       has an unknown format or wrong scan mode is specified
 * @return integer
 */
public function resetTermsStream(/** $startId = 0, $mode = self::SM_TERMS_ONLY */)
{
    /**
     * SegmentInfo->resetTermsStream() method actually takes two optional parameters:
     *   $startId (default value is 0)
     *   $mode (default value is self::SM_TERMS_ONLY)
     */
    $argList  = func_get_args();
    $argCount = count($argList);
    if ($argCount > 2) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong number of arguments');
    }
    $startId = ($argCount >= 1) ? $argList[0] : 0;
    $mode    = ($argCount == 2) ? $argList[1] : self::SM_TERMS_ONLY;

    // Drop previously used file object before a new one is opened
    $this->_tisFile = null;

    $this->_tisFile       = $this->openCompoundFile('.tis', false);
    $this->_tisFileOffset = $this->_tisFile->tell();

    $tiVersion = $this->_tisFile->readInt();
    if ($tiVersion != (int)0xFFFFFFFE /* pre-2.1 format */  &&
        $tiVersion != (int)0xFFFFFFFD /* 2.1+ format    */) {
        require_once 'Zend/Search/Lucene/Exception.php';
        throw new Zend_Search_Lucene_Exception('Wrong TermInfoFile file format');
    }

    $this->_termCount     =
          $this->_termNum = $this->_tisFile->readLong(); // Read terms count
    $this->_indexInterval = $this->_tisFile->readInt();  // Read Index interval
    $this->_skipInterval  = $this->_tisFile->readInt();  // Read skip interval
    if ($tiVersion == (int)0xFFFFFFFD /* 2.1+ format */) {
        // Max skip levels value isn't used here, it's read only to step over it
        $this->_tisFile->readInt();
    }

    $this->_frqFile = null;
    $this->_prxFile = null;
    $this->_docMap  = array();

    $this->_lastTerm          = new Zend_Search_Lucene_Index_Term('', -1);
    $this->_lastTermInfo      = new Zend_Search_Lucene_Index_TermInfo(0, 0, 0, 0);
    $this->_lastTermPositions = null;

    $this->_termsScanMode = $mode;

    switch ($mode) {
        case self::SM_TERMS_ONLY:
            // Do nothing
            break;

        case self::SM_FULL_INFO:
            // break intentionally omitted
        case self::SM_MERGE_INFO:
            $this->_frqFile       = $this->openCompoundFile('.frq', false);
            $this->_frqFileOffset = $this->_frqFile->tell();

            $this->_prxFile       = $this->openCompoundFile('.prx', false);
            $this->_prxFileOffset = $this->_prxFile->tell();

            // Only not deleted documents get into the map.
            // count($this->_docMap) is the number of documents mapped so far,
            // so it gives consecutive ids for the merge mode
            for ($count = 0; $count < $this->_docCount; $count++) {
                if (!$this->isDeleted($count)) {
                    $this->_docMap[$count] = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $count);
                }
            }
            break;

        default:
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong terms scaning mode specified.');
    }

    // Calculate next segment start id (since $this->_docMap structure may be cleaned by $this->nextTerm() call)
    $nextSegmentStartId = $startId + (($mode == self::SM_MERGE_INFO) ? count($this->_docMap) : $this->_docCount);
    $this->nextTerm();

    return $nextSegmentStartId;
}
|
1861 |
|
1862 |
|
1863 /** |
|
1864 * Skip terms stream up to the specified term preffix. |
|
1865 * |
|
1866 * Prefix contains fully specified field info and portion of searched term |
|
1867 * |
|
1868 * @param Zend_Search_Lucene_Index_Term $prefix |
|
1869 * @throws Zend_Search_Lucene_Exception |
|
1870 */ |
|
public function skipTo(Zend_Search_Lucene_Index_Term $prefix)
{
    // Positions the terms stream on the first term that is not less than
    // $prefix (ordered by field, then by term text). If no such term exists,
    // the stream ends up closed and currentTerm() returns null.

    // The dictionary index is loaded lazily on first use.
    if ($this->_termDictionary === null) {
        $this->_loadDictionaryIndex();
    }

    $searchField = $this->getFieldNum($prefix->field);

    if ($searchField == -1) {
        /**
         * Field is not present in this segment
         * Go to the end of dictionary
         */
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }
    $searchDicField = $this->_getFieldPosition($searchField);

    // Binary search of the dictionary index for the last entry that is
    // not greater than the requested term.
    $lowIndex  = 0;
    $highIndex = count($this->_termDictionary)-1;
    while ($highIndex >= $lowIndex) {
        $mid     = ($highIndex + $lowIndex) >> 1;
        $midTerm = $this->_termDictionary[$mid];

        $fieldNum = $this->_getFieldPosition($midTerm[0] /* field */);
        $delta    = $searchDicField - $fieldNum;
        if ($delta == 0) {
            $delta = strcmp($prefix->text, $midTerm[1] /* text */);
        }

        if ($delta < 0) {
            $highIndex = $mid-1;
        } elseif ($delta > 0) {
            $lowIndex  = $mid+1;
        } else {
            // We have reached the term we are looking for.
            // Anchor on the matching entry: without this $highIndex may still
            // point to a later index entry, and the stream would be positioned
            // past the requested term.
            $highIndex = $mid;
            break;
        }
    }

    if ($highIndex == -1) {
        // Term is out of the dictionary range
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;

        return;
    }

    $prevPosition = $highIndex;
    $prevTerm     = $this->_termDictionary[$prevPosition];
    $prevTermInfo = $this->_termDictionaryInfos[$prevPosition];

    // Positions are only materialized in these two scan modes.
    $loadPositions = ($this->_termsScanMode == self::SM_FULL_INFO || $this->_termsScanMode == self::SM_MERGE_INFO);

    if ($this->_tisFile === null) {
        // The end of terms stream is reached and terms dictionary file is closed
        // Perform mini-reset operation
        $this->_tisFile = $this->openCompoundFile('.tis', false);

        if ($loadPositions) {
            $this->_frqFile = $this->openCompoundFile('.frq', false);
            $this->_prxFile = $this->openCompoundFile('.prx', false);
        }
    }
    // Jump to the dictionary offset recorded in the index entry
    // (element 4 of the entry; presumably the index pointer - confirm against _loadDictionaryIndex()).
    $this->_tisFile->seek($this->_tisFileOffset + $prevTermInfo[4], SEEK_SET);

    // A field number of -1 marks an entry without a field; it is exposed with an empty field name.
    $this->_lastTerm     = new Zend_Search_Lucene_Index_Term($prevTerm[1] /* text */,
                                                             ($prevTerm[0] == -1) ? '' : $this->_fields[$prevTerm[0] /* field */]->name);
    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($prevTermInfo[0] /* docFreq */,
                                                                 $prevTermInfo[1] /* freqPointer */,
                                                                 $prevTermInfo[2] /* proxPointer */,
                                                                 $prevTermInfo[3] /* skipOffset */);
    // Number of terms still to be read by nextTerm() after this index entry.
    $this->_termCount = $this->_termNum - $prevPosition*$this->_indexInterval;

    if ($highIndex == 0) {
        // skip start entry
        $this->nextTerm();
    } else if (strcmp($prefix->field, $this->_lastTerm->field) === 0
               && strcmp($prefix->text, $this->_lastTerm->text) === 0) {
        // We got exact match in the dictionary index.
        // strcmp() is used instead of "==" so numeric-looking terms
        // (e.g. "100" and "0100") are never treated as equal.

        if ($loadPositions) {
            $this->_lastTermPositions = array();

            // Decode document ids and in-document frequencies.
            $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
            $freqs = array();   $docId = 0;
            for ($count = 0; $count < $this->_lastTermInfo->docFreq; $count++) {
                $docDelta = $this->_frqFile->readVInt();
                if ($docDelta % 2 == 1) {
                    // Odd value: frequency is 1 and is not stored separately
                    $docId += ($docDelta-1)/2;
                    $freqs[$docId] = 1;
                } else {
                    // Even value: frequency follows as a separate VInt
                    $docId += $docDelta/2;
                    $freqs[$docId] = $this->_frqFile->readVInt();
                }
            }

            // Decode delta-encoded positions; they must be read for every
            // document to keep the file pointer in sync, even if the document
            // is not present in the doc map.
            $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
            foreach ($freqs as $docId => $freq) {
                $termPosition = 0;  $positions = array();

                for ($count = 0; $count < $freq; $count++) {
                    $termPosition += $this->_prxFile->readVInt();
                    $positions[] = $termPosition;
                }

                if (isset($this->_docMap[$docId])) {
                    $this->_lastTermPositions[$this->_docMap[$docId]] = $positions;
                }
            }
        }

        return;
    }

    // Scan forward for the first term matching the specified prefix
    while ($this->_lastTerm !== null) {
        $fieldOrder = strcmp($this->_lastTerm->field, $prefix->field);
        if ($fieldOrder > 0 ||
            ($fieldOrder === 0 && strcmp($this->_lastTerm->text, $prefix->text) >= 0)) {
            // Current term matches or is greater than the pattern
            return;
        }

        $this->nextTerm();
    }
}
|
2008 |
|
2009 |
|
2010 /** |
|
2011 * Scans terms dictionary and returns next term |
|
2012 * |
|
2013 * @return Zend_Search_Lucene_Index_Term|null |
|
2014 */ |
|
public function nextTerm()
{
    // Nothing left to read: drop all cursor state and signal the end of the stream.
    $streamClosed = ($this->_tisFile === null);
    if ($streamClosed || $this->_termCount == 0) {
        $this->_lastTerm          = null;
        $this->_lastTermInfo      = null;
        $this->_lastTermPositions = null;
        $this->_docMap            = null;

        // may be necessary for "empty" segment
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;

        return null;
    }

    // Term text is stored as (shared prefix length, suffix) relative to the previous term.
    $sharedLength = $this->_tisFile->readVInt();
    $suffix       = $this->_tisFile->readString();
    $fieldNumber  = $this->_tisFile->readVInt();

    $text = Zend_Search_Lucene_Index_Term::getPrefix($this->_lastTerm->text, $sharedLength) . $suffix;
    $this->_lastTerm = new Zend_Search_Lucene_Index_Term($text, $this->_fields[$fieldNumber]->name);

    // File pointers are stored as deltas against the previous term info.
    $docFreq     = $this->_tisFile->readVInt();
    $freqPointer = $this->_lastTermInfo->freqPointer + $this->_tisFile->readVInt();
    $proxPointer = $this->_lastTermInfo->proxPointer + $this->_tisFile->readVInt();
    // Skip offset is only present when the term occurs in at least _skipInterval documents.
    $skipOffset  = ($docFreq >= $this->_skipInterval) ? $this->_tisFile->readVInt() : 0;

    $this->_lastTermInfo = new Zend_Search_Lucene_Index_TermInfo($docFreq, $freqPointer, $proxPointer, $skipOffset);

    $scanMode = $this->_termsScanMode;
    if ($scanMode == self::SM_FULL_INFO || $scanMode == self::SM_MERGE_INFO) {
        $this->_lastTermPositions = array();

        // Pass 1: document ids and per-document frequencies.
        $this->_frqFile->seek($this->_lastTermInfo->freqPointer + $this->_frqFileOffset, SEEK_SET);
        $frequencies = array();
        $document    = 0;
        for ($remaining = $this->_lastTermInfo->docFreq; $remaining > 0; $remaining--) {
            $encoded  = $this->_frqFile->readVInt();
            $isSingle = ($encoded % 2 == 1);

            // Odd value means a frequency of 1; otherwise the frequency follows as its own VInt.
            $document += $isSingle ? ($encoded - 1) / 2 : $encoded / 2;
            $frequencies[$document] = $isSingle ? 1 : $this->_frqFile->readVInt();
        }

        // Pass 2: delta-encoded positions, in the same document order.
        $this->_prxFile->seek($this->_lastTermInfo->proxPointer + $this->_prxFileOffset, SEEK_SET);
        foreach ($frequencies as $document => $occurrences) {
            $position = 0;
            $found    = array();

            for ($left = $occurrences; $left > 0; $left--) {
                $position += $this->_prxFile->readVInt();
                $found[]   = $position;
            }

            // Positions are always read to keep the file pointer in sync,
            // but only documents present in the doc map are reported.
            if (!isset($this->_docMap[$document])) {
                continue;
            }
            $this->_lastTermPositions[$this->_docMap[$document]] = $found;
        }
    }

    // Release the files as soon as the last term has been delivered.
    if (--$this->_termCount == 0) {
        $this->_tisFile = null;
        $this->_frqFile = null;
        $this->_prxFile = null;
    }

    return $this->_lastTerm;
}
|
2090 |
|
2091 /** |
|
2092 * Close terms stream |
|
2093 * |
|
2094 * Should be used for resources clean up if stream is not read up to the end |
|
2095 */ |
|
public function closeTermsStream()
{
    // Release, in order: the three stream files, the current-term state
    // and finally the document map.
    $released = array(
        '_tisFile',
        '_frqFile',
        '_prxFile',
        '_lastTerm',
        '_lastTermInfo',
        '_lastTermPositions',
        '_docMap',
    );

    foreach ($released as $property) {
        $this->$property = null;
    }
}
|
2108 |
|
2109 |
|
2110 /** |
|
2111 * Returns term in current position |
|
2112 * |
|
2113 * @return Zend_Search_Lucene_Index_Term|null |
|
2114 */ |
|
public function currentTerm()
{
    // The term the stream is positioned on; null once the stream is exhausted or closed.
    $current = $this->_lastTerm;

    return $current;
}
|
2119 |
|
2120 |
|
2121 /** |
|
2122 * Returns an array of all term positions in the documents. |
|
2123 * Return array structure: array( docId => array( pos1, pos2, ...), ...) |
|
2124 * |
|
2125 * @return array|null |
|
2126 */ |
|
public function currentTermPositions()
{
    // Positions collected for the current term; null when none have been loaded
    // or the stream has been exhausted or closed.
    $positions = $this->_lastTermPositions;

    return $positions;
}
|
2131 } |
|
2132 |