|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Analysis |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: Utf8.php 20096 2010-01-06 02:05:09Z bkarwin $ |
|
21 */ |
|
22 |
|
23 |
|
24 /** Zend_Search_Lucene_Analysis_Analyzer_Common */ |
|
25 require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php'; |
|
26 |
|
27 |
|
/**
 * UTF-8 analyzer: splits the input stream into tokens made of runs of
 * Unicode letters (PCRE class \p{L}). Any other character separates tokens.
 *
 * The input is converted to UTF-8 on reset() when it is declared in another
 * encoding, and every produced token is passed through normalize().
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */

class Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Current char position in an UTF-8 stream
     *
     * @var integer
     */
    private $_position;

    /**
     * Current binary position in an UTF-8 stream
     *
     * @var integer
     */
    private $_bytePosition;

    /**
     * Object constructor
     *
     * Verifies that PCRE can match Unicode properties in UTF-8 mode, which
     * nextToken() relies on.
     *
     * @throws Zend_Search_Lucene_Exception if the PCRE unicode probe does not match
     */
    public function __construct()
    {
        // preg_match() yields 1 on a match, 0 on no match and false on error;
        // anything other than 1 means the probe pattern is not usable.
        if (@preg_match('/\pL/u', 'a') !== 1) {
            // PCRE unicode support is turned off
            require_once 'Zend/Search/Lucene/Exception.php';
            throw new Zend_Search_Lucene_Exception('Utf8 analyzer needs PCRE unicode support to be enabled.');
        }
    }

    /**
     * Reset token stream
     *
     * Rewinds both the character and the byte cursor and, when the input is
     * not declared as UTF-8, converts it to UTF-8 with iconv().
     *
     * If there is no input, or the conversion fails, the input is left as
     * null so that nextToken() reports an empty stream.
     */
    public function reset()
    {
        $this->_position     = 0;
        $this->_bytePosition = 0;

        if (strcasecmp($this->_encoding, 'utf8') === 0
            || strcasecmp($this->_encoding, 'utf-8') === 0
        ) {
            // input is already declared as UTF-8, nothing to convert
            return;
        }

        // convert input into UTF-8
        if ($this->_input !== null) {
            $converted = iconv($this->_encoding, 'UTF-8', $this->_input);

            // iconv() returns false on failure; keep "no input" as null instead
            // of storing a boolean that nextToken() would have to coerce.
            $this->_input = ($converted === false) ? null : $converted;
        }

        $this->_encoding = 'UTF-8';
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * Token start/end offsets are expressed in characters, not bytes.
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        do {
            if (! preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) {
                // It covers both cases a) there are no matches (preg_match(...) === 0)
                //                      b) error occurred (preg_match(...) === FALSE)
                return null;
            }

            // matched string
            $matchedWord = $match[0][0];

            // binary position of the matched word in the input stream
            // (PREG_OFFSET_CAPTURE reports byte offsets)
            $binStartPos = $match[0][1];

            // character position of the matched word in the input stream:
            // previous character position plus the number of characters skipped
            // between the byte cursor and the start of the match
            $startPos = $this->_position +
                        iconv_strlen(substr($this->_input,
                                            $this->_bytePosition,
                                            $binStartPos - $this->_bytePosition),
                                     'UTF-8');
            // character position of the end of matched word in the input stream
            $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8');

            // advance both cursors past the matched word
            $this->_bytePosition = $binStartPos + strlen($matchedWord);
            $this->_position     = $endPos;

            $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos));
        } while ($token === null); // try again if token is skipped

        return $token;
    }
}
|
126 |