diff -r 5b37998e522e -r 162c1de6545a web/lib/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/lib/Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php Fri Mar 11 15:05:35 2011 +0100 @@ -0,0 +1,126 @@ +_position = 0; + $this->_bytePosition = 0; + + // convert the input to UTF-8 unless the declared encoding is already utf8/utf-8 (compared case-insensitively) + if (strcasecmp($this->_encoding, 'utf8' ) != 0 && + strcasecmp($this->_encoding, 'utf-8') != 0 ) { + $this->_input = iconv($this->_encoding, 'UTF-8', $this->_input); + $this->_encoding = 'UTF-8'; + } + } + + /** + * Tokenization stream API + * Get the next token: the next run of Unicode letters and digits that normalize() does not skip + * Returns null when no input is set, at the end of the stream, or when preg_match() fails + * + * @return Zend_Search_Lucene_Analysis_Token|null + */ + public function nextToken() + { + if ($this->_input === null) { + return null; + } + + do { + if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { + // Covers both cases: a) there are no more matches (preg_match(...) === 0) + // b) an error occurred (preg_match(...) === FALSE) + return null; + } + + // matched string + $matchedWord = $match[0][0]; + + // byte offset of the matched word in the input stream (PREG_OFFSET_CAPTURE reports offsets in bytes) + $binStartPos = $match[0][1]; + + // character position of the matched word: previous character position plus the count of UTF-8 characters skipped since the previous byte offset + $startPos = $this->_position + + iconv_strlen(substr($this->_input, + $this->_bytePosition, + $binStartPos - $this->_bytePosition), + 'UTF-8'); + // character position of the end of the matched word in the input stream + $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); + + $this->_bytePosition = $binStartPos + strlen($matchedWord); + $this->_position = $endPos; + + $token = $this->normalize(new Zend_Search_Lucene_Analysis_Token($matchedWord, $startPos, $endPos)); + } while ($token === null); // try again if normalize() returned null (token skipped) + + return $token; + } +} +