web/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php
changeset 0 4eba9c11703f
equal deleted inserted replaced
-1:000000000000 0:4eba9c11703f
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Analysis
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Utf8.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 
       
    24 /** Zend_Search_Lucene_Analysis_Analyzer_Common */
       
    25 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
       
    26 
       
    27 
       
    28 /**
       
    29  * @category   Zend
       
    30  * @package    Zend_Search_Lucene
       
    31  * @subpackage Analysis
       
    32  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    33  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    34  */
       
    35 
       
    class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
    {
        /**
         * Number of characters of the UTF-8 input consumed so far
         *
         * Advanced by nextToken() to the character offset just past the last
         * matched word; set back to 0 by reset().
         *
         * @var integer
         */
        private $_position;

        /**
         * Number of bytes of the UTF-8 input consumed so far
         *
         * Used as the byte offset at which nextToken() resumes its regular
         * expression search; set back to 0 by reset().
         *
         * @var integer
         */
        private $_bytePosition;

        /**
         * Object constructor
         *
         * Verifies that the PCRE extension was built with Unicode support by
         * running a one-character probe that uses a Unicode property class.
         *
         * @throws Zend_Search_Lucene_Exception if the probe does not match
         */
        public function __construct()
        {
            // The "@" silences any diagnostic the probe may emit; a failed
            // probe is reported through the exception below instead.
            $unicodeProbe = @preg_match('/\pL/u', 'a');
            if ($unicodeProbe === 1) {
                return;
            }

            // PCRE unicode support is turned off
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }

        /**
         * Reset token stream
         *
         * Rewinds both stream counters to zero and, unless the declared
         * encoding is already "utf8" or "utf-8" (compared case-insensitively),
         * converts the input to UTF-8 and records the new encoding.
         *
         * $_input and $_encoding are not declared in this class; presumably
         * they are inherited from the parent analyzer - confirm there.
         *
         * NOTE(review): the iconv() result is stored without being checked,
         * so a failed conversion replaces the input with iconv()'s failure
         * value.
         */
        public function reset()
        {
            $this->_position     = 0;
            $this->_bytePosition = 0;

            $alreadyUtf8 = strcasecmp($this->_encoding, 'utf8') == 0
                        || strcasecmp($this->_encoding, 'utf-8') == 0;
            if ($alreadyUtf8) {
                // nothing to convert
                return;
            }

            // convert input into UTF-8
            $this->_input    = iconv($this->_encoding, 'UTF-8', $this->_input);
            $this->_encoding = 'UTF-8';
        }

        /**
         * Tokenization stream API
         * Get next token
         * Returns null at the end of stream
         *
         * Scans forward from the current byte offset for the next run of
         * Unicode letters, wraps it in a token carrying its character start
         * and end offsets, and passes it through normalize(). When normalize()
         * returns null the word is skipped and scanning continues.
         *
         * normalize() is not defined in this class; presumably it is inherited
         * from the parent analyzer - confirm there.
         *
         * @return Zend_Search_Lucene_Analysis_Token|null
         */
        public function nextToken()
        {
            if ($this->_input === null) {
                return null;
            }

            while (true) {
                $found = preg_match('/[\p{L}]+/u', $this->_input, $hit, PREG_OFFSET_CAPTURE, $this->_bytePosition);
                if (! $found) {
                    // Covers both outcomes: no further match (0) and a
                    // matching error (false).
                    return null;
                }

                // matched word and its byte offset within the input
                list($word, $wordByteOffset) = $hit[0];

                // Text skipped between the previous word and this one; its
                // character count turns the byte offset into a character
                // offset.
                $skipped = substr($this->_input,
                                  $this->_bytePosition,
                                  $wordByteOffset - $this->_bytePosition);

                // character offsets of the first character of the word and of
                // the position just past its last character
                $wordStart = $this->_position + iconv_strlen($skipped, 'UTF-8');
                $wordEnd   = $wordStart + iconv_strlen($word, 'UTF-8');

                // advance both counters past the word before normalizing it
                $this->_bytePosition = $wordByteOffset + strlen($word);
                $this->_position     = $wordEnd;

                $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($word, $wordStart, $wordEnd));
                if ($token !== null) {
                    return $token;
                }
                // token was skipped by normalize(); look for the next word
            }
        }
    }
       
   126