|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Document |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: Html.php 23392 2010-11-19 09:53:16Z ramon $ |
|
21 */ |
|
22 |
|
23 |
|
24 /** Zend_Search_Lucene_Document */ |
|
25 require_once 'Zend/Search/Lucene/Document.php'; |
|
26 |
|
27 |
|
28 /** |
|
29 * HTML document. |
|
30 * |
|
31 * @category Zend |
|
32 * @package Zend_Search_Lucene |
|
33 * @subpackage Document |
|
34 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
35 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
36 */ |
|
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
{
    /**
     * List of document links (HREF values of <a> and <area> elements)
     *
     * @var array
     */
    private $_links = array();

    /**
     * List of document header links (HREF values of <link> elements in <head>)
     *
     * @var array
     */
    private $_headerLinks = array();

    /**
     * Stored DOM representation
     *
     * @var DOMDocument
     */
    private $_doc;

    /**
     * Exclude nofollow links flag
     *
     * If true then links with a 'nofollow' token in the rel attribute are not
     * included into document links.
     *
     * @var boolean
     */
    private static $_excludeNoFollowLinks = false;

    /**
     * List of inline tags
     *
     * Text found directly inside one of these elements is NOT followed by a
     * separating space when the body text is collected.
     *
     * @var array
     */
    private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                 'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                 'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                 'q', 'sub', 'sup');

    /**
     * Object constructor
     *
     * Parses the HTML, then fills the 'title', 'body' and meta-name fields and
     * collects body links and header links.
     *
     * @param string  $data            HTML string (may be an HTML fragment) or a file name if $isFile is true
     * @param boolean $isFile          If true, $data is a path which is read with file_get_contents()
     * @param boolean $storeContent    If true the body text is stored in the index, otherwise it is only indexed
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @throws Zend_Search_Lucene_Exception If $isFile is true and the file can not be read
     */
    private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
    {
        $this->_doc = new DOMDocument();
        $this->_doc->substituteEntities = true;

        if ($isFile) {
            $htmlData = file_get_contents($data);
            if ($htmlData === false) {
                // Do not silently index an empty document when the source is unreadable
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Unable to read HTML file '$data'.");
            }
        } else {
            $htmlData = (string) $data;
        }

        // loadHTML() rejects an empty string (it throws on PHP 8, which '@' does not
        // suppress), so skip the first pass; the fragment branch below then builds
        // an empty document.
        if ($htmlData !== '') {
            @$this->_doc->loadHTML($htmlData);
        }

        if ($this->_doc->encoding === null) {
            // Document encoding is not recognized

            /** @todo improve HTML vs HTML fragment recognition */
            if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                // It's an HTML document
                // Add additional HEAD section (declaring UTF-8) right after the <html> tag and re-parse
                $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                     . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

                // Remove additional HEAD section
                $xpath = new DOMXPath($this->_doc);
                $head  = $xpath->query('/html/head')->item(0);
                if ($head !== null) {
                    $head->parentNode->removeChild($head);
                }
            } else {
                // It's an HTML fragment
                @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                                     . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                     . '</body></html>');
            }
        }
        /** @todo Add correction of wrong HTML encoding recognition processing
         * The case is:
         * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
         * even $this->_doc->encoding demonstrates another recognized encoding
         */

        // The document may have been re-parsed above, so always build a fresh XPath object
        $xpath = new DOMXPath($this->_doc);

        $docTitle = '';
        $titleNodes = $xpath->query('/html/head/title');
        foreach ($titleNodes as $titleNode) {
            // title should always have only one entry, but we process all nodeset entries
            $docTitle .= $titleNode->nodeValue . ' ';
        }
        $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

        // Every <meta name="..." content="..."> becomes a field named after the meta name
        $metaNodes = $xpath->query('/html/head/meta[@name]');
        foreach ($metaNodes as $metaNode) {
            $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                           $metaNode->getAttribute('content'),
                                                           'UTF-8'));
        }

        $docBody = '';
        $bodyNodes = $xpath->query('/html/body');
        foreach ($bodyNodes as $bodyNode) {
            // body should always have only one entry, but we process all nodeset entries
            $this->_retrieveNodeText($bodyNode, $docBody);
        }
        if ($storeContent) {
            $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
        } else {
            $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
        }

        $this->_collectLinks('a');
        $this->_collectLinks('area');
        $this->_links = array_unique($this->_links);

        $linkNodes = $xpath->query('/html/head/link');
        foreach ($linkNodes as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href !== '') {
                $this->_headerLinks[] = $href;
            }
        }
        $this->_headerLinks = array_unique($this->_headerLinks);
    }

    /**
     * Append HREF values of all elements with the given tag name to the links list
     *
     * Elements without an href are skipped, as are nofollow links when the
     * exclude-nofollow flag is set.
     *
     * @param string $tagName Element name to scan for ('a' or 'area')
     */
    private function _collectLinks($tagName)
    {
        foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
            $href = $linkNode->getAttribute('href');
            if ($href === '') {
                continue;
            }
            if (self::$_excludeNoFollowLinks && $this->_isNoFollow($linkNode)) {
                continue;
            }

            $this->_links[] = $href;
        }
    }

    /**
     * Check whether the element's rel attribute contains the 'nofollow' token
     *
     * rel is a whitespace-separated list of tokens, so values such as
     * "nofollow noopener" have to be recognized too.
     *
     * @param DOMElement $linkNode
     * @return boolean
     */
    private function _isNoFollow(DOMElement $linkNode)
    {
        $relTokens = preg_split('/\s+/', strtolower($linkNode->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);

        return in_array('nofollow', $relTokens, true);
    }

    /**
     * Set exclude nofollow links flag
     *
     * The flag is static: it applies to every document created afterwards.
     *
     * @param boolean $newValue
     */
    public static function setExcludeNoFollowLinks($newValue)
    {
        self::$_excludeNoFollowLinks = $newValue;
    }

    /**
     * Get exclude nofollow links flag
     *
     * @return boolean
     */
    public static function getExcludeNoFollowLinks()
    {
        return self::$_excludeNoFollowLinks;
    }

    /**
     * Get node text
     *
     * Recursively appends the text of $node and its descendants to $text.
     * Content of <script> elements is skipped; non-text, non-element nodes
     * (comments, CDATA sections, ...) are ignored.
     *
     * @param DOMNode $node
     * @param string  &$text Accumulator the collected text is appended to
     */
    private function _retrieveNodeText(DOMNode $node, &$text)
    {
        if ($node->nodeType == XML_TEXT_NODE) {
            $text .= $node->nodeValue;
            // Separate text of block-level elements with a space so that words do not glue together
            if (!in_array($node->parentNode->tagName, $this->_inlineTags, true)) {
                $text .= ' ';
            }
        } elseif ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') {
            foreach ($node->childNodes as $childNode) {
                $this->_retrieveNodeText($childNode, $text);
            }
        }
    }

    /**
     * Get document HREF links
     *
     * @return array
     */
    public function getLinks()
    {
        return $this->_links;
    }

    /**
     * Get document header links
     *
     * @return array
     */
    public function getHeaderLinks()
    {
        return $this->_headerLinks;
    }

    /**
     * Load HTML document from a string
     *
     * @param string  $data
     * @param boolean $storeContent
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     */
    public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
    }

    /**
     * Load HTML document from a file
     *
     * @param string  $file
     * @param boolean $storeContent
     * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
     * @return Zend_Search_Lucene_Document_Html
     * @throws Zend_Search_Lucene_Exception If the file can not be read
     */
    public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
    {
        return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
    }


    /**
     * Highlight text in text node
     *
     * Each token of the node text that is a key of $wordsToHighlight is cut out of
     * the text node and replaced with the DOM nodes produced from the callback's
     * HTML output.
     *
     * @param DOMText  $node
     * @param array    $wordsToHighlight Map whose keys are the term texts to highlight
     * @param callback $callback         Callback method, used to transform (highlighting) text.
     * @param array    $params           Array of additional callback parameters (first non-optional parameter is a text to transform)
     * @throws Zend_Search_Lucene_Exception
     */
    protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        $analyzer->setInput($node->nodeValue, 'UTF-8');

        $matchedTokens = array();

        while (($token = $analyzer->nextToken()) !== null) {
            if (isset($wordsToHighlight[$token->getTermText()])) {
                $matchedTokens[] = $token;
            }
        }

        if (count($matchedTokens) == 0) {
            return;
        }

        // Process matches from the end of the text: splitting off the tail first
        // keeps the offsets of the earlier tokens valid for $node.
        $matchedTokens = array_reverse($matchedTokens);

        foreach ($matchedTokens as $token) {
            // NOTE(review): token offsets come from the analyzer; confirm they agree with
            // the offsets splitText() expects for multi-byte text.

            // Cut text after matched token
            $node->splitText($token->getEndOffset());

            // Cut matched node
            $matchedWordNode = $node->splitText($token->getStartOffset());

            // Retrieve HTML string representation for highlighted word
            $fullCallbackparamsList = $params;
            array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
            $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);

            // Transform HTML string to a DOM representation and automatically transform retrieved string
            // into valid XHTML (It's automatically done by loadHTML() method)
            $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
            $success = @$highlightedWordNodeSetDomDocument->
                                loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                       . $highlightedWordNodeSetHtml
                                       . '</body></html>');
            if (!$success) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
            }
            $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
            $highlightedWordNodeSet      = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;

            // importNode() copies the nodes, so the source node list is not modified while iterating
            for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
                $nodeToImport = $highlightedWordNodeSet->item($count);
                $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
                                                $matchedWordNode);
            }

            $node->parentNode->removeChild($matchedWordNode);
        }
    }


    /**
     * Highlight words in content of the specified node
     *
     * Descends into all child elements except <script> and highlights every
     * text node found.
     *
     * @param DOMNode  $contextNode
     * @param array    $wordsToHighlight Map whose keys are the term texts to highlight
     * @param callback $callback         Callback method, used to transform (highlighting) text.
     * @param array    $params           Array of additional callback parameters (first non-optional parameter is a text to transform)
     */
    protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
    {
        $textNodes = array();

        if (!$contextNode->hasChildNodes()) {
            return;
        }

        foreach ($contextNode->childNodes as $childNode) {
            if ($childNode->nodeType == XML_TEXT_NODE) {
                // process node later to leave childNodes structure untouched
                $textNodes[] = $childNode;
            } else {
                // Process node if it's not a script node
                if ($childNode->nodeName != 'script') {
                    $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
                }
            }
        }

        foreach ($textNodes as $textNode) {
            $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
        }
    }

    /**
     * Standard callback method used to highlight words.
     *
     * Both arguments are inserted into the markup as given (no escaping is applied).
     *
     * @param  string $stringToHighlight
     * @param  string $colour Value used for the CSS background-color
     * @return string
     * @internal
     */
    public function applyColour($stringToHighlight, $colour)
    {
        return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>';
    }

    /**
     * Highlight text with specified color
     *
     * @param string|array $words
     * @param string $colour
     * @return string HTML of the document after highlighting
     */
    public function highlight($words, $colour = '#66ffff')
    {
        return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
    }



    /**
     * Highlight text using specified View helper or callback function.
     *
     * The stored DOM is modified in place; the resulting HTML is also returned.
     *
     * @param string|array $words  Words to highlight. Words could be organized using the array or string.
     * @param callback $callback   Callback method, used to transform (highlighting) text.
     * @param array    $params     Array of additional callback parameters passed through into it
     *                             (first non-optional parameter is an HTML fragment for highlighting)
     * @return string HTML of the whole document
     * @throws Zend_Search_Lucene_Exception If $callback is not callable
     */
    public function highlightExtended($words, $callback, $params = array())
    {
        /** Zend_Search_Lucene_Analysis_Analyzer */
        require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

        if (!is_array($words)) {
            $words = array($words);
        }

        // Merge incrementally: calling array_merge() with no arguments (empty $words)
        // is not valid on older PHP versions.
        $wordsToHighlight = array();
        $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
        foreach ($words as $wordString) {
            $wordsToHighlight = array_merge($wordsToHighlight, $analyzer->tokenize($wordString));
        }

        if (count($wordsToHighlight) == 0) {
            return $this->_doc->saveHTML();
        }

        // Index by term text for O(1) lookup while scanning text nodes
        $wordsToHighlightFlipped = array();
        foreach ($wordsToHighlight as $id => $token) {
            $wordsToHighlightFlipped[$token->getTermText()] = $id;
        }

        if (!is_callable($callback)) {
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('$callback parameter must be a valid callback.');
        }

        $xpath = new DOMXPath($this->_doc);

        $matchedNodes = $xpath->query("/html/body");
        foreach ($matchedNodes as $matchedNode) {
            $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
        }

        // Return the highlighted document, consistently with the early return above
        return $this->_doc->saveHTML();
    }


    /**
     * Get HTML
     *
     * @return string
     */
    public function getHTML()
    {
        return $this->_doc->saveHTML();
    }

    /**
     * Get HTML body
     *
     * Returns the serialized children of the <body> element, or an empty
     * string if the document has no body.
     *
     * @return string
     */
    public function getHtmlBody()
    {
        $xpath = new DOMXPath($this->_doc);
        $body  = $xpath->query('/html/body')->item(0);
        if ($body === null) {
            return '';
        }

        $outputFragments = array();
        foreach ($body->childNodes as $bodyChild) {
            $outputFragments[] = $this->_doc->saveXML($bodyChild);
        }

        return implode('', $outputFragments);
    }
}
|
481 |