| author | Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com> |
| Tue, 15 Mar 2011 16:32:22 +0100 | |
| changeset 68 | ecaf28ffe26e |
| parent 0 | web/Zend/Search/Lucene/Document/Html.php@4eba9c11703f |
| parent 64 | web/Zend/Search/Lucene/Document/Html.php@162c1de6545a |
| child 207 | 621fa6caec0c |
| permissions | -rw-r--r-- |
|
0
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
1 |
<?php |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
2 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
3 |
* Zend Framework |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
4 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
5 |
* LICENSE |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
6 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
7 |
* This source file is subject to the new BSD license that is bundled |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
8 |
* with this package in the file LICENSE.txt. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
9 |
* It is also available through the world-wide-web at this URL: |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
10 |
* http://framework.zend.com/license/new-bsd |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
11 |
* If you did not receive a copy of the license and are unable to |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
12 |
* obtain it through the world-wide-web, please send an email |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
13 |
* to license@zend.com so we can send you a copy immediately. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
14 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
15 |
* @category Zend |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
16 |
* @package Zend_Search_Lucene |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
17 |
* @subpackage Document |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
18 |
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
19 |
* @license http://framework.zend.com/license/new-bsd New BSD License |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
20 |
* @version $Id: Html.php 23392 2010-11-19 09:53:16Z ramon $ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
21 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
22 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
23 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
24 |
/** Zend_Search_Lucene_Document */ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
25 |
require_once 'Zend/Search/Lucene/Document.php'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
26 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
27 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
28 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
29 |
* HTML document. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
30 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
31 |
* @category Zend |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
32 |
* @package Zend_Search_Lucene |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
33 |
* @subpackage Document |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
34 |
* @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
35 |
* @license http://framework.zend.com/license/new-bsd New BSD License |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
36 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
37 |
class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
38 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
39 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
40 |
* List of document links |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
41 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
42 |
* @var array |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
43 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
44 |
private $_links = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
45 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
46 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
47 |
* List of document header links |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
48 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
49 |
* @var array |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
50 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
51 |
private $_headerLinks = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
52 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
53 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
54 |
* Stored DOM representation |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
55 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
56 |
* @var DOMDocument |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
57 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
58 |
private $_doc; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
59 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
60 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
61 |
* Exclud nofollow links flag |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
62 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
63 |
* If true then links with rel='nofollow' attribute are not included into |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
64 |
* document links. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
65 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
66 |
* @var boolean |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
67 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
68 |
private static $_excludeNoFollowLinks = false; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
69 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
70 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
71 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
72 |
* List of inline tags |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
73 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
74 |
* @var array |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
75 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
76 |
private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code', |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
77 |
'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike', |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
78 |
'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins', |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
79 |
'q', 'sub', 'sup'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
80 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
81 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
82 |
* Object constructor |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
83 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
84 |
* @param string $data HTML string (may be HTML fragment, ) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
85 |
* @param boolean $isFile |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
86 |
* @param boolean $storeContent |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
87 |
* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
88 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
89 |
private function __construct($data, $isFile, $storeContent, $defaultEncoding = '') |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
90 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
91 |
$this->_doc = new DOMDocument(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
92 |
$this->_doc->substituteEntities = true; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
93 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
94 |
if ($isFile) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
95 |
$htmlData = file_get_contents($data); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
96 |
} else { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
97 |
$htmlData = $data; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
98 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
99 |
@$this->_doc->loadHTML($htmlData); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
100 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
101 |
if ($this->_doc->encoding === null) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
102 |
// Document encoding is not recognized |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
103 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
104 |
/** @todo improve HTML vs HTML fragment recognition */ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
105 |
if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
106 |
// It's an HTML document |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
107 |
// Add additional HEAD section and recognize document |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
108 |
$htmlTagOffset = $matches[0][1] + strlen($matches[0][0]); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
109 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
110 |
@$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset)) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
111 |
. '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>' |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
112 |
. iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset))); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
113 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
114 |
// Remove additional HEAD section |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
115 |
$xpath = new DOMXPath($this->_doc); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
116 |
$head = $xpath->query('/html/head')->item(0); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
117 |
$head->parentNode->removeChild($head); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
118 |
} else { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
119 |
// It's an HTML fragment |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
120 |
@$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>' |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
121 |
. iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
122 |
. '</body></html>'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
123 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
124 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
125 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
126 |
/** @todo Add correction of wrong HTML encoding recognition processing |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
127 |
* The case is: |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
128 |
* Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used, |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
129 |
* even $this->_doc->encoding demonstrates another recognized encoding |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
130 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
131 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
132 |
$xpath = new DOMXPath($this->_doc); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
133 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
134 |
$docTitle = ''; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
135 |
$titleNodes = $xpath->query('/html/head/title'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
136 |
foreach ($titleNodes as $titleNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
137 |
// title should always have only one entry, but we process all nodeset entries |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
138 |
$docTitle .= $titleNode->nodeValue . ' '; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
139 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
140 |
$this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8')); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
141 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
142 |
$metaNodes = $xpath->query('/html/head/meta[@name]'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
143 |
foreach ($metaNodes as $metaNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
144 |
$this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'), |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
145 |
$metaNode->getAttribute('content'), |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
146 |
'UTF-8')); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
147 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
148 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
149 |
$docBody = ''; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
150 |
$bodyNodes = $xpath->query('/html/body'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
151 |
foreach ($bodyNodes as $bodyNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
152 |
// body should always have only one entry, but we process all nodeset entries |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
153 |
$this->_retrieveNodeText($bodyNode, $docBody); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
154 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
155 |
if ($storeContent) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
156 |
$this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8')); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
157 |
} else { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
158 |
$this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8')); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
159 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
160 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
161 |
$linkNodes = $this->_doc->getElementsByTagName('a'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
162 |
foreach ($linkNodes as $linkNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
163 |
if (($href = $linkNode->getAttribute('href')) != '' && |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
164 |
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' ) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
165 |
) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
166 |
$this->_links[] = $href; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
167 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
168 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
169 |
$linkNodes = $this->_doc->getElementsByTagName('area'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
170 |
foreach ($linkNodes as $linkNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
171 |
if (($href = $linkNode->getAttribute('href')) != '' && |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
172 |
(!self::$_excludeNoFollowLinks || strtolower($linkNode->getAttribute('rel')) != 'nofollow' ) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
173 |
) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
174 |
$this->_links[] = $href; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
175 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
176 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
177 |
$this->_links = array_unique($this->_links); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
178 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
179 |
$linkNodes = $xpath->query('/html/head/link'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
180 |
foreach ($linkNodes as $linkNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
181 |
if (($href = $linkNode->getAttribute('href')) != '') { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
182 |
$this->_headerLinks[] = $href; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
183 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
184 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
185 |
$this->_headerLinks = array_unique($this->_headerLinks); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
186 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
187 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
188 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
189 |
* Set exclude nofollow links flag |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
190 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
191 |
* @param boolean $newValue |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
192 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
193 |
public static function setExcludeNoFollowLinks($newValue) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
194 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
195 |
self::$_excludeNoFollowLinks = $newValue; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
196 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
197 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
198 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
199 |
* Get exclude nofollow links flag |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
200 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
201 |
* @return boolean |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
202 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
203 |
public static function getExcludeNoFollowLinks() |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
204 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
205 |
return self::$_excludeNoFollowLinks; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
206 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
207 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
208 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
209 |
* Get node text |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
210 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
211 |
* We should exclude scripts, which may be not included into comment tags, CDATA sections, |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
212 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
213 |
* @param DOMNode $node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
214 |
* @param string &$text |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
215 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
216 |
private function _retrieveNodeText(DOMNode $node, &$text) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
217 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
218 |
if ($node->nodeType == XML_TEXT_NODE) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
219 |
$text .= $node->nodeValue; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
220 |
if(!in_array($node->parentNode->tagName, $this->_inlineTags)) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
221 |
$text .= ' '; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
222 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
223 |
} else if ($node->nodeType == XML_ELEMENT_NODE && $node->nodeName != 'script') { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
224 |
foreach ($node->childNodes as $childNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
225 |
$this->_retrieveNodeText($childNode, $text); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
226 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
227 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
228 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
229 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
230 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
231 |
* Get document HREF links |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
232 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
233 |
* @return array |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
234 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
235 |
public function getLinks() |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
236 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
237 |
return $this->_links; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
238 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
239 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
240 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
241 |
* Get document header links |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
242 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
243 |
* @return array |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
244 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
245 |
public function getHeaderLinks() |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
246 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
247 |
return $this->_headerLinks; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
248 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
249 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
250 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
251 |
* Load HTML document from a string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
252 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
253 |
* @param string $data |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
254 |
* @param boolean $storeContent |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
255 |
* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
256 |
* @return Zend_Search_Lucene_Document_Html |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
257 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
258 |
public static function loadHTML($data, $storeContent = false, $defaultEncoding = '') |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
259 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
260 |
return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
261 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
262 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
263 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
264 |
* Load HTML document from a file |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
265 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
266 |
* @param string $file |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
267 |
* @param boolean $storeContent |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
268 |
* @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
269 |
* @return Zend_Search_Lucene_Document_Html |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
270 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
271 |
public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '') |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
272 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
273 |
return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
274 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
275 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
276 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
277 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
278 |
* Highlight text in text node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
279 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
280 |
* @param DOMText $node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
281 |
* @param array $wordsToHighlight |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
282 |
* @param callback $callback Callback method, used to transform (highlighting) text. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
283 |
* @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
284 |
* @throws Zend_Search_Lucene_Exception |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
285 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
286 |
protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
287 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
288 |
/** Zend_Search_Lucene_Analysis_Analyzer */ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
289 |
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
290 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
291 |
$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
292 |
$analyzer->setInput($node->nodeValue, 'UTF-8'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
293 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
294 |
$matchedTokens = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
295 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
296 |
while (($token = $analyzer->nextToken()) !== null) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
297 |
if (isset($wordsToHighlight[$token->getTermText()])) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
298 |
$matchedTokens[] = $token; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
299 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
300 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
301 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
302 |
if (count($matchedTokens) == 0) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
303 |
return; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
304 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
305 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
306 |
$matchedTokens = array_reverse($matchedTokens); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
307 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
308 |
foreach ($matchedTokens as $token) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
309 |
// Cut text after matched token |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
310 |
$node->splitText($token->getEndOffset()); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
311 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
312 |
// Cut matched node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
313 |
$matchedWordNode = $node->splitText($token->getStartOffset()); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
314 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
315 |
// Retrieve HTML string representation for highlihted word |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
316 |
$fullCallbackparamsList = $params; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
317 |
array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
318 |
$highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
319 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
320 |
// Transform HTML string to a DOM representation and automatically transform retrieved string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
321 |
// into valid XHTML (It's automatically done by loadHTML() method) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
322 |
$highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
323 |
$success = @$highlightedWordNodeSetDomDocument-> |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
324 |
loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>' |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
325 |
. $highlightedWordNodeSetHtml |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
326 |
. '</body></html>'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
327 |
if (!$success) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
328 |
require_once 'Zend/Search/Lucene/Exception.php'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
329 |
throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'."); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
330 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
331 |
$highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
332 |
$highlightedWordNodeSet = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
333 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
334 |
for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
335 |
$nodeToImport = $highlightedWordNodeSet->item($count); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
336 |
$node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */), |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
337 |
$matchedWordNode); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
338 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
339 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
340 |
$node->parentNode->removeChild($matchedWordNode); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
341 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
342 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
343 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
344 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
345 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
346 |
* highlight words in content of the specified node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
347 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
348 |
* @param DOMNode $contextNode |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
349 |
* @param array $wordsToHighlight |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
350 |
* @param callback $callback Callback method, used to transform (highlighting) text. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
351 |
* @param array $params Array of additionall callback parameters (first non-optional parameter is a text to transform) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
352 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
353 |
protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
354 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
355 |
$textNodes = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
356 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
357 |
if (!$contextNode->hasChildNodes()) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
358 |
return; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
359 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
360 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
361 |
foreach ($contextNode->childNodes as $childNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
362 |
if ($childNode->nodeType == XML_TEXT_NODE) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
363 |
// process node later to leave childNodes structure untouched |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
364 |
$textNodes[] = $childNode; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
365 |
} else { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
366 |
// Process node if it's not a script node |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
367 |
if ($childNode->nodeName != 'script') { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
368 |
$this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
369 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
370 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
371 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
372 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
373 |
foreach ($textNodes as $textNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
374 |
$this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
375 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
376 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
377 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
378 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
379 |
* Standard callback method used to highlight words. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
380 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
381 |
* @param string $stringToHighlight |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
382 |
* @return string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
383 |
* @internal |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
384 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
385 |
public function applyColour($stringToHighlight, $colour) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
386 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
387 |
return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
388 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
389 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
390 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
391 |
* Highlight text with specified color |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
392 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
393 |
* @param string|array $words |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
394 |
* @param string $colour |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
395 |
* @return string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
396 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
397 |
public function highlight($words, $colour = '#66ffff') |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
398 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
399 |
return $this->highlightExtended($words, array($this, 'applyColour'), array($colour)); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
400 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
401 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
402 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
403 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
404 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
405 |
* Highlight text using specified View helper or callback function. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
406 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
407 |
* @param string|array $words Words to highlight. Words could be organized using the array or string. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
408 |
* @param callback $callback Callback method, used to transform (highlighting) text. |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
409 |
* @param array $params Array of additionall callback parameters passed through into it |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
410 |
* (first non-optional parameter is an HTML fragment for highlighting) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
411 |
* @return string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
412 |
* @throws Zend_Search_Lucene_Exception |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
413 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
414 |
public function highlightExtended($words, $callback, $params = array()) |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
415 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
416 |
/** Zend_Search_Lucene_Analysis_Analyzer */ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
417 |
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
418 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
419 |
if (!is_array($words)) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
420 |
$words = array($words); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
421 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
422 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
423 |
$wordsToHighlightList = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
424 |
$analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
425 |
foreach ($words as $wordString) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
426 |
$wordsToHighlightList[] = $analyzer->tokenize($wordString); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
427 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
428 |
$wordsToHighlight = call_user_func_array('array_merge', $wordsToHighlightList); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
429 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
430 |
if (count($wordsToHighlight) == 0) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
431 |
return $this->_doc->saveHTML(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
432 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
433 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
434 |
$wordsToHighlightFlipped = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
435 |
foreach ($wordsToHighlight as $id => $token) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
436 |
$wordsToHighlightFlipped[$token->getTermText()] = $id; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
437 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
438 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
439 |
if (!is_callable($callback)) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
440 |
require_once 'Zend/Search/Lucene/Exception.php'; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
441 |
throw new Zend_Search_Lucene_Exception('$viewHelper parameter mast be a View Helper name, View Helper object or callback.'); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
442 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
443 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
444 |
$xpath = new DOMXPath($this->_doc); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
445 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
446 |
$matchedNodes = $xpath->query("/html/body"); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
447 |
foreach ($matchedNodes as $matchedNode) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
448 |
$this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
449 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
450 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
451 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
452 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
453 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
454 |
* Get HTML |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
455 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
456 |
* @return string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
457 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
458 |
public function getHTML() |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
459 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
460 |
return $this->_doc->saveHTML(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
461 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
462 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
463 |
/** |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
464 |
* Get HTML body |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
465 |
* |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
466 |
* @return string |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
467 |
*/ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
468 |
public function getHtmlBody() |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
469 |
{ |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
470 |
$xpath = new DOMXPath($this->_doc); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
471 |
$bodyNodes = $xpath->query('/html/body')->item(0)->childNodes; |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
472 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
473 |
$outputFragments = array(); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
474 |
for ($count = 0; $count < $bodyNodes->length; $count++) { |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
475 |
$outputFragments[] = $this->_doc->saveXML($bodyNodes->item($count)); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
476 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
477 |
|
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
478 |
return implode($outputFragments); |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
479 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
480 |
} |
|
4eba9c11703f
first import
Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
parents:
diff
changeset
|
481 |