|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Index |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: SegmentMerger.php 20096 2010-01-06 02:05:09Z bkarwin $ |
|
21 */ |
|
22 |
|
23 /** Zend_Search_Lucene_Index_SegmentInfo */ |
|
24 require_once 'Zend/Search/Lucene/Index/SegmentInfo.php'; |
|
25 |
|
26 |
|
/**
 * Merges a set of source segments into a single new segment.
 *
 * Sources are registered with addSource(); merge() then copies field infos,
 * norms, stored fields and the term dictionary into the target segment writer.
 * Norms and stored fields of documents that a source reports as deleted are
 * left out of the new segment.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Index
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Search_Lucene_Index_SegmentMerger
{
    /**
     * Target segment writer
     *
     * @var Zend_Search_Lucene_Index_SegmentWriter_StreamWriter
     */
    private $_writer;

    /**
     * Number of docs in a new segment
     *
     * Reset and recomputed by _mergeStoredFields(); it counts only documents
     * that were not deleted in their source segment.
     *
     * @var integer
     */
    private $_docCount;

    /**
     * A set of segments to be merged, keyed by segment name
     *
     * @var array Zend_Search_Lucene_Index_SegmentInfo
     */
    private $_segmentInfos = array();

    /**
     * Flag to signal, that merge is already done
     *
     * @var boolean
     */
    private $_mergeDone = false;

    /**
     * Field map
     * [<segment_name>][<field_number>] => <target_field_number>
     *
     * @var array
     */
    private $_fieldsMap = array();


    /**
     * Object constructor.
     *
     * Creates new segment merger with $directory as target to merge segments into
     * and $name as a name of new segment
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct($directory, $name)
    {
        /** Zend_Search_Lucene_Index_SegmentWriter_StreamWriter */
        require_once 'Zend/Search/Lucene/Index/SegmentWriter/StreamWriter.php';
        $this->_writer = new Zend_Search_Lucene_Index_SegmentWriter_StreamWriter($directory, $name);
    }


    /**
     * Add segment to a collection of segments to be merged
     *
     * Segments are stored under their name, so adding a segment whose name is
     * already registered replaces the earlier entry.
     *
     * @param Zend_Search_Lucene_Index_SegmentInfo $segmentInfo
     */
    public function addSource(Zend_Search_Lucene_Index_SegmentInfo $segmentInfo)
    {
        $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo;
    }


    /**
     * Do merge.
     *
     * Runs the four merge steps in order (field infos, norms, stored fields,
     * terms), marks the merger as done and closes the target writer.
     * A merger can be used for one merge only.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo  result of closing the target
     *         segment writer, i.e. the newly created segment
     * @throws Zend_Search_Lucene_Exception  if the merge has already been done
     *         or if no source segment has been added
     */
    public function merge()
    {
        if ($this->_mergeDone) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Merge is already done.');
        }

        $sourceCount = count($this->_segmentInfos);
        if ($sourceCount < 1) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Wrong number of segments to be merged ('
                                                 . $sourceCount
                                                 . ').');
        }

        // Field infos go first: _mergeNorms() iterates the field infos held by the writer.
        $this->_mergeFields();
        $this->_mergeNorms();
        $this->_mergeStoredFields();
        $this->_mergeTerms();

        // The flag is raised only after every step has completed without an exception.
        $this->_mergeDone = true;

        return $this->_writer->close();
    }


    /**
     * Merge fields information
     *
     * Registers every field of every source segment with the target writer and
     * records the field number the writer returns in $this->_fieldsMap.
     */
    private function _mergeFields()
    {
        foreach ($this->_segmentInfos as $segName => $segmentInfo) {
            foreach ($segmentInfo->getFieldInfos() as $fieldInfo) {
                $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo);
            }
        }
    }

    /**
     * Merge field's normalization factors
     *
     * For each indexed field known to the writer, the norm vectors of all source
     * segments are passed to the writer in segment order. Entries belonging to
     * deleted documents are dropped first.
     */
    private function _mergeNorms()
    {
        foreach ($this->_writer->getFieldInfos() as $fieldInfo) {
            if (!$fieldInfo->isIndexed) {
                continue;
            }

            foreach ($this->_segmentInfos as $segmentInfo) {
                if (!$segmentInfo->hasDeletions()) {
                    // Nothing to filter out: hand the source vector over unchanged.
                    $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name));
                    continue;
                }

                // The source vector is indexed by document number (one entry per
                // document); keep only the entries of documents that are still alive.
                $srcNorm  = $segmentInfo->normVector($fieldInfo->name);
                $norm     = '';
                $docTotal = $segmentInfo->count();
                for ($docId = 0; $docId < $docTotal; $docId++) {
                    if (!$segmentInfo->isDeleted($docId)) {
                        $norm .= $srcNorm[$docId];
                    }
                }
                $this->_writer->addNorm($fieldInfo->name, $norm);
            }
        }
    }

    /**
     * Merge stored fields
     *
     * Reads the '.fdt' file of every source segment sequentially, document by
     * document, and passes the stored fields of each non-deleted document to the
     * target writer. $this->_docCount is set to the number of documents written.
     *
     * The data of deleted documents still has to be read, because that is the
     * only way to advance the file to the next document, but no field objects
     * are built for them.
     */
    private function _mergeStoredFields()
    {
        /** Zend_Search_Lucene_Field */
        // Loaded lazily, in the same way as the other classes this file depends on.
        require_once 'Zend/Search/Lucene/Field.php';

        $this->_docCount = 0;

        foreach ($this->_segmentInfos as $segmentInfo) {
            $fdtFile  = $segmentInfo->openCompoundFile('.fdt');
            $docTotal = $segmentInfo->count();

            for ($docId = 0; $docId < $docTotal; $docId++) {
                $isDeleted    = $segmentInfo->isDeleted($docId);
                $fieldCount   = $fdtFile->readVInt();
                $storedFields = array();

                for ($fieldIdx = 0; $fieldIdx < $fieldCount; $fieldIdx++) {
                    $fieldNum = $fdtFile->readVInt();
                    $bits     = $fdtFile->readByte();
                    $isBinary = ($bits & 2) != 0; // bit 1 set => binary data, otherwise text

                    // Always consume the value so that the stream stays aligned.
                    if ($isBinary) {
                        $value = $fdtFile->readBinary();
                    } else {
                        $value = $fdtFile->readString();
                    }

                    if ($isDeleted) {
                        continue;
                    }

                    $fieldInfo = $segmentInfo->getField($fieldNum);

                    // Bit 0 of $bits is forwarded as the sixth constructor argument
                    // (presumably the "tokenized" flag - confirm against
                    // Zend_Search_Lucene_Field).
                    if ($isBinary) {
                        $storedFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                                       $value,
                                                                       '',
                                                                       true,
                                                                       $fieldInfo->isIndexed,
                                                                       $bits & 1,
                                                                       true);
                    } else {
                        $storedFields[] = new Zend_Search_Lucene_Field($fieldInfo->name,
                                                                       $value,
                                                                       'UTF-8',
                                                                       true,
                                                                       $fieldInfo->isIndexed,
                                                                       $bits & 1);
                    }
                }

                if (!$isDeleted) {
                    $this->_docCount++;
                    $this->_writer->addStoredFields($storedFields);
                }
            }
        }
    }


    /**
     * Merge terms dictionaries
     *
     * Puts every source segment that has at least one term into a priority
     * queue and repeatedly pops the front segment. The positions of its current
     * term are accumulated until the next queued segment carries a different
     * term key (or the queue is empty); the accumulated term is then written to
     * the target writer.
     */
    private function _mergeTerms()
    {
        /** Zend_Search_Lucene_Index_TermsPriorityQueue */
        require_once 'Zend/Search/Lucene/Index/TermsPriorityQueue.php';

        $segmentInfoQueue = new Zend_Search_Lucene_Index_TermsPriorityQueue();

        // Each resetTermsStream() call receives the value returned by the previous
        // one, starting from 0 (presumably the first document id of the segment
        // inside the merged segment - confirm against SegmentInfo).
        $segmentStartId = 0;
        foreach ($this->_segmentInfos as $segmentInfo) {
            $segmentStartId = $segmentInfo->resetTermsStream($segmentStartId, Zend_Search_Lucene_Index_SegmentInfo::SM_MERGE_INFO);

            // Skip "empty" segments
            if ($segmentInfo->currentTerm() !== null) {
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->initializeDictionaryFiles();

        $termDocs = array();
        while (($segmentInfo = $segmentInfoQueue->pop()) !== null) {
            // Merge positions array (array union: entries already collected win on equal keys)
            $termDocs += $segmentInfo->currentTermPositions();

            // NOTE(review): keys are compared loosely, as in the original code. If key()
            // always returns a string, !== would rule out numeric-string juggling - confirm.
            $nextSegment = $segmentInfoQueue->top();
            if ($nextSegment === null
                || $nextSegment->currentTerm()->key() != $segmentInfo->currentTerm()->key()) {
                // No other segment is waiting with the same term: the term is complete.
                ksort($termDocs, SORT_NUMERIC);

                // Add term if it's contained in any document
                if (count($termDocs) > 0) {
                    $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs);
                }
                $termDocs = array();
            }

            $segmentInfo->nextTerm();
            // check, if segment dictionary is finished
            if ($segmentInfo->currentTerm() !== null) {
                // Put segment back into the priority queue
                $segmentInfoQueue->put($segmentInfo);
            }
        }

        $this->_writer->closeDictionaryFiles();
    }
}