web/lib/Zend/Search/Lucene/Document/Html.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Document
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Html.php 23392 2010-11-19 09:53:16Z ramon $
       
    21  */
       
    22 
       
    23 
       
    24 /** Zend_Search_Lucene_Document */
       
    25 require_once 'Zend/Search/Lucene/Document.php';
       
    26 
       
    27 
       
    28 /**
       
    29  * HTML document.
       
    30  *
       
    31  * @category   Zend
       
    32  * @package    Zend_Search_Lucene
       
    33  * @subpackage Document
       
    34  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    35  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    36  */
       
    class Zend_Search_Lucene_Document_Html extends Zend_Search_Lucene_Document
    {
        /**
         * List of document links (HREFs of A and AREA elements)
         *
         * @var array
         */
        private $_links = array();

        /**
         * List of document header links (HREFs of /html/head/link elements)
         *
         * @var array
         */
        private $_headerLinks = array();

        /**
         * Stored DOM representation
         *
         * @var DOMDocument
         */
        private $_doc;

        /**
         * Exclude nofollow links flag
         *
         * If true then links with rel='nofollow' attribute are not included into
         * document links.
         *
         * @var boolean
         */
        private static $_excludeNoFollowLinks = false;

        /**
         * List of inline tags
         *
         * Text found directly inside one of these tags is NOT followed by a
         * separating space when the body text is collected.
         *
         * @var array
         */
        private $_inlineTags = array('a', 'abbr', 'acronym', 'dfn', 'em', 'strong', 'code',
                                     'samp', 'kbd', 'var', 'b', 'i', 'big', 'small', 'strike',
                                     'tt', 'u', 'font', 'span', 'bdo', 'cite', 'del', 'ins',
                                     'q', 'sub', 'sup');

        /**
         * Object constructor
         *
         * Parses the HTML, then fills the 'title' field, one field per named
         * META tag, the 'body' field, and the link lists.
         *
         * @param string  $data            HTML string (may be an HTML fragment) or a file name if $isFile is true
         * @param boolean $isFile          Treat $data as a path and read the HTML from that file
         * @param boolean $storeContent    Store body text in the index (Text field) instead of only indexing it (UnStored field)
         * @param string  $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
         * @throws Zend_Search_Lucene_Exception if the file cannot be read
         */
        private function __construct($data, $isFile, $storeContent, $defaultEncoding = '')
        {
            $this->_doc = new DOMDocument();
            $this->_doc->substituteEntities = true;

            if ($isFile) {
                $htmlData = file_get_contents($data);
                if ($htmlData === false) {
                    // Do not silently index an empty document for an unreadable file
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception("Unable to read HTML file '$data'.");
                }
            } else {
                $htmlData = (string) $data;
            }

            // An empty string can't be parsed by loadHTML(); the document simply stays
            // empty and is handled by the "fragment" branch below.
            if ($htmlData !== '') {
                @$this->_doc->loadHTML($htmlData);
            }

            if ($this->_doc->encoding === null) {
                // Document encoding is not recognized

                /** @todo improve HTML vs HTML fragment recognition */
                if (preg_match('/<html[^>]*>/i', $htmlData, $matches, PREG_OFFSET_CAPTURE)) {
                    // It's an HTML document
                    // Add additional HEAD section (declaring UTF-8) right after the
                    // opening HTML tag and parse the converted document again
                    $htmlTagOffset = $matches[0][1] + strlen($matches[0][0]);

                    @$this->_doc->loadHTML(iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, 0, $htmlTagOffset))
                                         . '<head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head>'
                                         . iconv($defaultEncoding, 'UTF-8//IGNORE', substr($htmlData, $htmlTagOffset)));

                    // Remove additional HEAD section
                    $headXpath = new DOMXPath($this->_doc);
                    $head      = $headXpath->query('/html/head')->item(0);
                    if ($head !== null) {
                        $head->parentNode->removeChild($head);
                    }
                } else {
                    // It's an HTML fragment
                    @$this->_doc->loadHTML('<html><head><META HTTP-EQUIV="Content-type" CONTENT="text/html; charset=UTF-8"/></head><body>'
                                         . iconv($defaultEncoding, 'UTF-8//IGNORE', $htmlData)
                                         . '</body></html>');
                }
            }
            /** @todo Add correction of wrong HTML encoding recognition processing
             * The case is:
             * Content-type HTTP-EQUIV meta tag is presented, but ISO-8859-5 encoding is actually used,
             * even $this->_doc->encoding demonstrates another recognized encoding
             */

            $xpath = new DOMXPath($this->_doc);

            $docTitle = '';
            foreach ($xpath->query('/html/head/title') as $titleNode) {
                // title should always have only one entry, but we process all nodeset entries
                $docTitle .= $titleNode->nodeValue . ' ';
            }
            $this->addField(Zend_Search_Lucene_Field::Text('title', $docTitle, 'UTF-8'));

            foreach ($xpath->query('/html/head/meta[@name]') as $metaNode) {
                $this->addField(Zend_Search_Lucene_Field::Text($metaNode->getAttribute('name'),
                                                               $metaNode->getAttribute('content'),
                                                               'UTF-8'));
            }

            $docBody = '';
            foreach ($xpath->query('/html/body') as $bodyNode) {
                // body should always have only one entry, but we process all nodeset entries
                $this->_retrieveNodeText($bodyNode, $docBody);
            }
            if ($storeContent) {
                $this->addField(Zend_Search_Lucene_Field::Text('body', $docBody, 'UTF-8'));
            } else {
                $this->addField(Zend_Search_Lucene_Field::UnStored('body', $docBody, 'UTF-8'));
            }

            // A links first, AREA links second, duplicates dropped
            $this->_links = array_unique(array_merge($this->_collectLinks('a'),
                                                     $this->_collectLinks('area')));

            foreach ($xpath->query('/html/head/link') as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href !== '') {
                    $this->_headerLinks[] = $href;
                }
            }
            $this->_headerLinks = array_unique($this->_headerLinks);
        }

        /**
         * Collect non-empty HREF values of all elements with the given tag name
         *
         * Links marked as nofollow are skipped if the exclude nofollow links flag is set.
         *
         * @param string $tagName  Element name ('a' or 'area')
         * @return array
         */
        private function _collectLinks($tagName)
        {
            $links = array();

            foreach ($this->_doc->getElementsByTagName($tagName) as $linkNode) {
                $href = $linkNode->getAttribute('href');
                if ($href === '') {
                    continue;
                }
                if (self::$_excludeNoFollowLinks  &&  self::_isNoFollow($linkNode->getAttribute('rel'))) {
                    continue;
                }

                $links[] = $href;
            }

            return $links;
        }

        /**
         * Check if REL attribute value contains the 'nofollow' keyword
         *
         * REL is a whitespace separated list of keywords, so values like
         * 'nofollow noopener' have to be recognized too (case-insensitively).
         *
         * @param string $rel
         * @return boolean
         */
        private static function _isNoFollow($rel)
        {
            $keywords = preg_split('/\s+/', strtolower($rel), -1, PREG_SPLIT_NO_EMPTY);

            return in_array('nofollow', $keywords, true);
        }

        /**
         * Set exclude nofollow links flag
         *
         * The flag is static: it affects all documents loaded afterwards.
         *
         * @param boolean $newValue
         */
        public static function setExcludeNoFollowLinks($newValue)
        {
            self::$_excludeNoFollowLinks = $newValue;
        }

        /**
         * Get exclude nofollow links flag
         *
         * @return boolean
         */
        public static function getExcludeNoFollowLinks()
        {
            return self::$_excludeNoFollowLinks;
        }

        /**
         * Get node text
         *
         * Appends text of the node and of all its descendants to $text.
         * SCRIPT elements are skipped; only text and element nodes are processed,
         * so comments and other node types contribute nothing.
         *
         * @param DOMNode $node
         * @param string &$text
         */
        private function _retrieveNodeText(DOMNode $node, &$text)
        {
            if ($node->nodeType == XML_TEXT_NODE) {
                $text .= $node->nodeValue;
                // Text of block level elements is separated from the following text
                if (!in_array($node->parentNode->tagName, $this->_inlineTags)) {
                    $text .= ' ';
                }
            } else if ($node->nodeType == XML_ELEMENT_NODE  &&  $node->nodeName != 'script') {
                foreach ($node->childNodes as $childNode) {
                    $this->_retrieveNodeText($childNode, $text);
                }
            }
        }

        /**
         * Get document HREF links
         *
         * @return array
         */
        public function getLinks()
        {
            return $this->_links;
        }

        /**
         * Get document header links
         *
         * @return array
         */
        public function getHeaderLinks()
        {
            return $this->_headerLinks;
        }

        /**
         * Load HTML document from a string
         *
         * @param string  $data
         * @param boolean $storeContent
         * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
         * @return Zend_Search_Lucene_Document_Html
         */
        public static function loadHTML($data, $storeContent = false, $defaultEncoding = '')
        {
            return new Zend_Search_Lucene_Document_Html($data, false, $storeContent, $defaultEncoding);
        }

        /**
         * Load HTML document from a file
         *
         * @param string  $file
         * @param boolean $storeContent
         * @param string  $defaultEncoding   HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag.
         * @return Zend_Search_Lucene_Document_Html
         * @throws Zend_Search_Lucene_Exception if the file cannot be read
         */
        public static function loadHTMLFile($file, $storeContent = false, $defaultEncoding = '')
        {
            return new Zend_Search_Lucene_Document_Html($file, true, $storeContent, $defaultEncoding);
        }


        /**
         * Highlight text in text node
         *
         * Each matched token is cut out of the text node and replaced with the
         * nodes produced from the HTML returned by the callback.
         *
         * @param DOMText $node
         * @param array   $wordsToHighlight  Array with term texts used as keys
         * @param callback $callback   Callback method, used to transform (highlighting) text.
         * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
         * @throws Zend_Search_Lucene_Exception
         */
        protected function _highlightTextNode(DOMText $node, $wordsToHighlight, $callback, $params)
        {
            /** Zend_Search_Lucene_Analysis_Analyzer */
            require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

            $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
            $analyzer->setInput($node->nodeValue, 'UTF-8');

            $matchedTokens = array();

            while (($token = $analyzer->nextToken()) !== null) {
                if (isset($wordsToHighlight[$token->getTermText()])) {
                    $matchedTokens[] = $token;
                }
            }

            if (count($matchedTokens) == 0) {
                return;
            }

            // Process tokens from the last one to the first one, so splitting the node
            // doesn't invalidate offsets of tokens which are not processed yet
            $matchedTokens = array_reverse($matchedTokens);

            foreach ($matchedTokens as $token) {
                // Cut text after matched token
                $node->splitText($token->getEndOffset());

                // Cut matched node
                $matchedWordNode = $node->splitText($token->getStartOffset());

                // Retrieve HTML string representation for highlighted word
                $fullCallbackparamsList = $params;
                array_unshift($fullCallbackparamsList, $matchedWordNode->nodeValue);
                $highlightedWordNodeSetHtml = call_user_func_array($callback, $fullCallbackparamsList);

                // Transform HTML string to a DOM representation and automatically transform retrieved string
                // into valid XHTML (It's automatically done by loadHTML() method)
                $highlightedWordNodeSetDomDocument = new DOMDocument('1.0', 'UTF-8');
                $success = @$highlightedWordNodeSetDomDocument->
                                    loadHTML('<html><head><meta http-equiv="Content-type" content="text/html; charset=UTF-8"/></head><body>'
                                           . $highlightedWordNodeSetHtml
                                           . '</body></html>');
                if (!$success) {
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception("Error occured while loading highlighted text fragment: '$highlightedWordNodeSetHtml'.");
                }
                $highlightedWordNodeSetXpath = new DOMXPath($highlightedWordNodeSetDomDocument);
                $highlightedWordNodeSet      = $highlightedWordNodeSetXpath->query('/html/body')->item(0)->childNodes;

                for ($count = 0; $count < $highlightedWordNodeSet->length; $count++) {
                    $nodeToImport = $highlightedWordNodeSet->item($count);
                    $node->parentNode->insertBefore($this->_doc->importNode($nodeToImport, true /* deep copy */),
                                                    $matchedWordNode);
                }

                $node->parentNode->removeChild($matchedWordNode);
            }
        }


        /**
         * Highlight words in content of the specified node
         *
         * SCRIPT elements are skipped.
         *
         * @param DOMNode $contextNode
         * @param array $wordsToHighlight  Array with term texts used as keys
         * @param callback $callback   Callback method, used to transform (highlighting) text.
         * @param array    $params     Array of additional callback parameters (first non-optional parameter is a text to transform)
         */
        protected function _highlightNodeRecursive(DOMNode $contextNode, $wordsToHighlight, $callback, $params)
        {
            $textNodes = array();

            if (!$contextNode->hasChildNodes()) {
                return;
            }

            foreach ($contextNode->childNodes as $childNode) {
                if ($childNode->nodeType == XML_TEXT_NODE) {
                    // process node later to leave childNodes structure untouched
                    $textNodes[] = $childNode;
                } else {
                    // Process node if it's not a script node
                    if ($childNode->nodeName != 'script') {
                        $this->_highlightNodeRecursive($childNode, $wordsToHighlight, $callback, $params);
                    }
                }
            }

            foreach ($textNodes as $textNode) {
                $this->_highlightTextNode($textNode, $wordsToHighlight, $callback, $params);
            }
        }

        /**
         * Standard callback method used to highlight words.
         *
         * Both values are inserted into the returned HTML as they are (no escaping is performed).
         *
         * @param  string  $stringToHighlight
         * @param  string  $colour  Value used for the background-color CSS property
         * @return string
         * @internal
         */
        public function applyColour($stringToHighlight, $colour)
        {
            return '<b style="color:black;background-color:' . $colour . '">' . $stringToHighlight . '</b>';
        }

        /**
         * Highlight text with specified color
         *
         * @param string|array $words
         * @param string $colour
         * @return string  HTML of the whole document
         */
        public function highlight($words, $colour = '#66ffff')
        {
            return $this->highlightExtended($words, array($this, 'applyColour'), array($colour));
        }



        /**
         * Highlight text using specified View helper or callback function.
         *
         * The stored DOM document is modified, so getHTML() and getHtmlBody()
         * return highlighted content after this call.
         *
         * @param string|array $words  Words to highlight. Words could be organized using the array or string.
         * @param callback $callback   Callback method, used to transform (highlighting) text.
         * @param array    $params     Array of additional callback parameters passed through into it
         *                             (first non-optional parameter is an HTML fragment for highlighting)
         * @return string  HTML of the whole document
         * @throws Zend_Search_Lucene_Exception
         */
        public function highlightExtended($words, $callback, $params = array())
        {
            /** Zend_Search_Lucene_Analysis_Analyzer */
            require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

            if (!is_array($words)) {
                $words = array($words);
            }

            // Collect tokens of all word strings into one list. It's done without
            // array_merge(), which can't be called for an empty list of words.
            $wordsToHighlight = array();
            $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
            foreach ($words as $wordString) {
                foreach ($analyzer->tokenize($wordString) as $token) {
                    $wordsToHighlight[] = $token;
                }
            }

            if (count($wordsToHighlight) == 0) {
                return $this->_doc->saveHTML();
            }

            // Term texts are used as keys to check matches using isset()
            $wordsToHighlightFlipped = array();
            foreach ($wordsToHighlight as $id => $token) {
                $wordsToHighlightFlipped[$token->getTermText()] = $id;
            }

            if (!is_callable($callback)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('$callback parameter must be a View Helper name, View Helper object or callback.');
            }

            $xpath = new DOMXPath($this->_doc);

            $matchedNodes = $xpath->query("/html/body");
            foreach ($matchedNodes as $matchedNode) {
                $this->_highlightNodeRecursive($matchedNode, $wordsToHighlightFlipped, $callback, $params);
            }

            return $this->_doc->saveHTML();
        }


        /**
         * Get HTML
         *
         * @return string
         */
        public function getHTML()
        {
            return $this->_doc->saveHTML();
        }

        /**
         * Get HTML body
         *
         * Returns serialized content of the BODY element (without the BODY tag itself)
         * or an empty string if the document has no BODY element.
         *
         * @return string
         */
        public function getHtmlBody()
        {
            $xpath = new DOMXPath($this->_doc);
            $body  = $xpath->query('/html/body')->item(0);

            if ($body === null) {
                return '';
            }

            $outputFragments = array();
            foreach ($body->childNodes as $bodyChildNode) {
                $outputFragments[] = $this->_doc->saveXML($bodyChildNode);
            }

            return implode('', $outputFragments);
        }
    }
       
   481