|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Index |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: DocumentWriter.php 20096 2010-01-06 02:05:09Z bkarwin $ |
|
21 */ |
|
22 |
|
23 /** Zend_Search_Lucene_Index_SegmentWriter */ |
|
24 require_once 'Zend/Search/Lucene/Index/SegmentWriter.php'; |
|
25 |
|
26 /** |
|
27 * @category Zend |
|
28 * @package Zend_Search_Lucene |
|
29 * @subpackage Index |
|
30 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
31 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
32 */ |
|
class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
{
    /**
     * Term Dictionary
     *
     * Array of Zend_Search_Lucene_Index_Term objects, keyed by the value of
     * Zend_Search_Lucene_Index_Term::key().
     * The postings of each term are kept under the same key in $_termDocs.
     *
     * @var array
     */
    protected $_termDictionary;

    /**
     * Documents, which contain the term
     *
     * Structure: array(term key => array(document number => array(position, ...)))
     *
     * @var array
     */
    protected $_termDocs;

    /**
     * Object constructor.
     *
     * @param Zend_Search_Lucene_Storage_Directory $directory
     * @param string $name
     */
    public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
    {
        parent::__construct($directory, $name);

        $this->_termDocs       = array();
        $this->_termDictionary = array();
    }


    /**
     * Adds a document to this segment.
     *
     * For every field of the document:
     *  - indexed fields are turned into terms (one per analyzer token for
     *    tokenized fields, a single term holding the whole value otherwise)
     *    and their positions are recorded in $_termDocs;
     *  - indexed fields which produce no terms are cloned and the clone is
     *    registered as non-indexed and non-tokenized;
     *  - stored fields are collected and passed to addStoredFields().
     *
     * Afterwards one norm byte per indexed field is appended to $_norms.
     *
     * @param Zend_Search_Lucene_Document $document
     * @throws Zend_Search_Lucene_Exception if a field requests term vector storing
     */
    public function addDocument(Zend_Search_Lucene_Document $document)
    {
        /** Zend_Search_Lucene_Search_Similarity */
        require_once 'Zend/Search/Lucene/Search/Similarity.php';

        $storedFields = array();
        $docNorms     = array();
        $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

        foreach ($document->getFieldNames() as $fieldName) {
            $field = $document->getField($fieldName);

            if ($field->storeTermVector) {
                /**
                 * @todo term vector storing support
                 */
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
            }

            if ($field->isIndexed) {
                if ($field->isTokenized) {
                    $termCount = $this->_indexTokenizedField($field);
                } else {
                    $termCount = $this->_indexUntokenizedField($field);
                }

                if ($termCount == 0) {
                    // Field contains empty value. Treat it as non-indexed and non-tokenized.
                    // A clone is modified so the field object owned by the document stays untouched.
                    $field = clone $field;
                    $field->isIndexed = $field->isTokenized = false;
                } else {
                    $docNorms[$field->name] = chr($similarity->encodeNorm(
                        $similarity->lengthNorm($field->name, $termCount)
                        * $document->boost
                        * $field->boost
                    ));
                }
            }

            if ($field->isStored) {
                $storedFields[] = $field;
            }

            $this->addField($field);
        }

        foreach ($this->_fields as $fieldName => $fieldInfo) {
            if (!$fieldInfo->isIndexed) {
                continue;
            }

            if (!isset($this->_norms[$fieldName])) {
                // No norms string for this field yet: start it with $this->_docCount
                // "zero terms" norm bytes.
                // NOTE(review): $_docCount is inherited; presumably the number of documents
                // already added to the segment - confirm against the parent class.
                $this->_norms[$fieldName] = str_repeat(
                    chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0))),
                    $this->_docCount
                );
            }

            if (isset($docNorms[$fieldName])) {
                $this->_norms[$fieldName] .= $docNorms[$fieldName];
            } else {
                // The current document produced no terms for this field
                $this->_norms[$fieldName] .= chr($similarity->encodeNorm($similarity->lengthNorm($fieldName, 0)));
            }
        }

        $this->addStoredFields($storedFields);
    }


    /**
     * Runs the default analyzer over a tokenized field and records every
     * produced token as a term occurrence of the current document.
     *
     * Positions are accumulated from the tokens' position increments,
     * starting from 0 (so the first token gets position 0 + its increment).
     *
     * @param object $field Field as returned by Zend_Search_Lucene_Document::getField()
     *                      (presumably Zend_Search_Lucene_Field); its value, encoding
     *                      and name properties are read
     * @return integer Number of tokens produced by the analyzer
     */
    private function _indexTokenizedField($field)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($field->value, $field->encoding);

        $position     = 0;
        $tokenCounter = 0;
        while (($token = $analyzer->nextToken()) !== null) {
            $tokenCounter++;

            $term      = new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name);
            $position += $token->getPositionIncrement();

            $this->_addTermPosition($term, $position);
        }

        return $tokenCounter;
    }


    /**
     * Records the whole value of an un-tokenized field as a single term
     * occurrence (position 0) of the current document.
     *
     * @param object $field Field as returned by Zend_Search_Lucene_Document::getField()
     *                      (presumably Zend_Search_Lucene_Field)
     * @return integer 1 if a term was recorded, 0 if the field value is empty
     */
    private function _indexUntokenizedField($field)
    {
        $fieldUtf8Value = $field->getUtf8Value();

        // Loose comparison is intentional: it is the original emptiness check
        if ($fieldUtf8Value == '') {
            return 0;
        }

        $this->_addTermPosition(new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name), 0);

        return 1;
    }


    /**
     * Records one occurrence of a term in the document currently being added.
     *
     * Registers the term in $_termDictionary when it is seen for the first
     * time, opens a positions list for the current document ($this->_docCount)
     * when needed, and appends the position to that list.
     *
     * @param Zend_Search_Lucene_Index_Term $term
     * @param integer $position
     */
    private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
    {
        $termKey = $term->key();

        if (!isset($this->_termDictionary[$termKey])) {
            // New term
            $this->_termDictionary[$termKey] = $term;
            $this->_termDocs[$termKey]       = array();
        }

        if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
            // First occurrence of the term within the current document
            $this->_termDocs[$termKey][$this->_docCount] = array();
        }

        $this->_termDocs[$termKey][$this->_docCount][] = $position;
    }


    /**
     * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
     *
     * Terms are passed to addTerm() in ascending order of their keys
     * (string comparison), each together with its postings from $_termDocs.
     */
    protected function _dumpDictionary()
    {
        ksort($this->_termDictionary, SORT_STRING);

        $this->initializeDictionaryFiles();

        foreach ($this->_termDictionary as $termId => $term) {
            $this->addTerm($term, $this->_termDocs[$termId]);
        }

        $this->closeDictionaryFiles();
    }


    /**
     * Close segment, write it to disk and return segment info
     *
     * Nothing is written and null is returned if no documents were added
     * to the segment.
     *
     * @return Zend_Search_Lucene_Index_SegmentInfo|null
     */
    public function close()
    {
        if ($this->_docCount == 0) {
            return null;
        }

        $this->_dumpFNM();
        $this->_dumpDictionary();

        $this->_generateCFS();

        /** Zend_Search_Lucene_Index_SegmentInfo */
        require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

        // NOTE(review): the meaning of the trailing arguments (-1, null, true, true)
        // is defined by the Zend_Search_Lucene_Index_SegmentInfo constructor - confirm there.
        return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                        $this->_name,
                                                        $this->_docCount,
                                                        -1,
                                                        null,
                                                        true,
                                                        true);
    }

}
|
230 |