|
1 <?php |
|
2 /** |
|
3 * Zend Framework |
|
4 * |
|
5 * LICENSE |
|
6 * |
|
7 * This source file is subject to the new BSD license that is bundled |
|
8 * with this package in the file LICENSE.txt. |
|
9 * It is also available through the world-wide-web at this URL: |
|
10 * http://framework.zend.com/license/new-bsd |
|
11 * If you did not receive a copy of the license and are unable to |
|
12 * obtain it through the world-wide-web, please send an email |
|
13 * to license@zend.com so we can send you a copy immediately. |
|
14 * |
|
15 * @category Zend |
|
16 * @package Zend_Search_Lucene |
|
17 * @subpackage Analysis |
|
18 * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com) |
|
19 * @license http://framework.zend.com/license/new-bsd New BSD License |
|
20 * @version $Id: Analyzer.php 20096 2010-01-06 02:05:09Z bkarwin $ |
|
21 */ |
|
22 |
|
23 |
|
/**
 * User-land analyzer classes and interfaces that are made available simply by
 * including this file. The whole section is skipped when the
 * ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED constant is already defined.
 */
/** @todo Obsolete section; remove it with the ZF 2.0 release. */
if (defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED') === false) {
    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Text
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';

    // Class: Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive
    require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
}
|
51 |
|
52 |
|
/**
 * Base class for text analyzers.
 *
 * An analyzer turns text into index terms; it therefore represents the policy
 * used to extract those terms from text.
 *
 * Note:
 * The Java implementation of Lucene is stream-oriented, which lets it work
 * efficiently with huge documents (more than 20Mb). This engine itself is not
 * designed for such documents, so the Zend_Search_Lucene analysis API works
 * with data strings and sets (arrays) instead.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Search_Lucene_Analysis_Analyzer
{
    /**
     * Analyzer instance handed out by getDefault().
     *
     * Stays null until setDefault() stores an instance or getDefault()
     * creates one on demand.
     *
     * @var Zend_Search_Lucene_Analysis_Analyzer|null
     */
    private static $_defaultImpl = null;

    /**
     * Text currently being tokenized (set through setInput()).
     *
     * @var string|null
     */
    protected $_input;

    /**
     * Encoding of the current input text; empty string when none was given.
     *
     * @var string
     */
    protected $_encoding = '';

    /**
     * Convert a text into the complete list of its tokens.
     *
     * Stores the text as the current input (which also resets the token
     * stream) and then drains nextToken() until it reports the end of the
     * stream with null.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding).
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data
     * @return array Array of Zend_Search_Lucene_Analysis_Token objects
     */
    public function tokenize($data, $encoding = '')
    {
        $this->setInput($data, $encoding);

        $collected = array();
        for (;;) {
            $token = $this->nextToken();

            // Only a strict null marks the end of the stream.
            if ($token === null) {
                break;
            }

            $collected[] = $token;
        }

        return $collected;
    }


    /**
     * Tokenization stream API: set the input.
     *
     * Remembers the text and its encoding, then calls reset() so the stream
     * starts over on the new input.
     *
     * @param string $data     Text to tokenize
     * @param string $encoding Encoding of $data
     */
    public function setInput($data, $encoding = '')
    {
        $this->_encoding = $encoding;
        $this->_input    = $data;

        $this->reset();
    }

    /**
     * Tokenization stream API: reset the token stream.
     *
     * Called by setInput() after the new input and encoding are stored.
     */
    abstract public function reset();

    /**
     * Tokenization stream API: fetch the next token.
     *
     * Returns null once the end of the stream has been reached.
     *
     * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding).
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    abstract public function nextToken();




    /**
     * Set the default Analyzer implementation used by indexing code.
     *
     * @param Zend_Search_Lucene_Analysis_Analyzer $analyzer Instance to return from getDefault()
     */
    public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
    {
        self::$_defaultImpl = $analyzer;
    }


    /**
     * Return the default Analyzer implementation used by indexing code.
     *
     * When no analyzer has been stored yet, a
     * Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive
     * instance is created, remembered and returned.
     *
     * @return Zend_Search_Lucene_Analysis_Analyzer
     */
    public static function getDefault()
    {
        // The fallback class file is pulled in on every call, before the check.
        require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';

        if (self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
            return self::$_defaultImpl;
        }

        self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();

        return self::$_defaultImpl;
    }
}
|
175 |