web/enmi/Zend/Search/Lucene/Analysis/Analyzer.php
changeset 19 1c2f13fd785c
parent 0 4eba9c11703f
equal deleted inserted replaced
18:bd595ad770fc 19:1c2f13fd785c
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Analysis
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Analyzer.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 
       
    24 /** User land classes and interfaces turned on by Zend/Search/Analyzer.php file inclusion. */
       
    25 /** @todo Section should be removed with ZF 2.0 release as obsolete                      */
       
    26 if (!defined('ZEND_SEARCH_LUCENE_COMMON_ANALYZER_PROCESSED')) {
       
    27     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8 */
       
    28     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8.php';
       
    29 
       
    30     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive */
       
    31     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php';
       
    32 
       
    33     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num */
       
    34     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num.php';
       
    35 
       
    36     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8Num_CaseInsensitive */
       
    37     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php';
       
    38 
       
    39     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text */
       
    40     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text.php';
       
    41 
       
    42     /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
       
    43     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
       
    44 
       
    45     /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum */
       
    46     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum.php';
       
    47 
       
    48     /** Zend_Search_Lucene_Analysis_Analyzer_Common_TextNum_CaseInsensitive */
       
    49     require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php';
       
    50 }
       
    51 
       
    52 
       
    53 /**
       
    54  * An Analyzer is used to analyze text.
       
    55  * It thus represents a policy for extracting index terms from text.
       
    56  *
       
    57  * Note:
       
    58  * Lucene Java implementation is oriented to streams. It provides effective work
       
    59  * with a huge documents (more then 20Mb).
       
    60  * But engine itself is not oriented such documents.
       
    61  * Thus Zend_Search_Lucene analysis API works with data strings and sets (arrays).
       
    62  *
       
    63  * @category   Zend
       
    64  * @package    Zend_Search_Lucene
       
    65  * @subpackage Analysis
       
    66  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    67  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    68  */
       
    69 
       
    70 abstract class Zend_Search_Lucene_Analysis_Analyzer
       
    71 {
       
    72     /**
       
    73      * The Analyzer implementation used by default.
       
    74      *
       
    75      * @var Zend_Search_Lucene_Analysis_Analyzer
       
    76      */
       
    77     private static $_defaultImpl;
       
    78 
       
    79     /**
       
    80      * Input string
       
    81      *
       
    82      * @var string
       
    83      */
       
    84     protected $_input = null;
       
    85 
       
    86     /**
       
    87      * Input string encoding
       
    88      *
       
    89      * @var string
       
    90      */
       
    91     protected $_encoding = '';
       
    92 
       
    93     /**
       
    94      * Tokenize text to a terms
       
    95      * Returns array of Zend_Search_Lucene_Analysis_Token objects
       
    96      *
       
    97      * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       
    98      *
       
    99      * @param string $data
       
   100      * @return array
       
   101      */
       
   102     public function tokenize($data, $encoding = '')
       
   103     {
       
   104         $this->setInput($data, $encoding);
       
   105 
       
   106         $tokenList = array();
       
   107         while (($nextToken = $this->nextToken()) !== null) {
       
   108             $tokenList[] = $nextToken;
       
   109         }
       
   110 
       
   111         return $tokenList;
       
   112     }
       
   113 
       
   114 
       
   115     /**
       
   116      * Tokenization stream API
       
   117      * Set input
       
   118      *
       
   119      * @param string $data
       
   120      */
       
   121     public function setInput($data, $encoding = '')
       
   122     {
       
   123         $this->_input    = $data;
       
   124         $this->_encoding = $encoding;
       
   125         $this->reset();
       
   126     }
       
   127 
       
   128     /**
       
   129      * Reset token stream
       
   130      */
       
   131     abstract public function reset();
       
   132 
       
   133     /**
       
   134      * Tokenization stream API
       
   135      * Get next token
       
   136      * Returns null at the end of stream
       
   137      *
       
   138      * Tokens are returned in UTF-8 (internal Zend_Search_Lucene encoding)
       
   139      *
       
   140      * @return Zend_Search_Lucene_Analysis_Token|null
       
   141      */
       
   142     abstract public function nextToken();
       
   143 
       
   144 
       
   145 
       
   146 
       
   147     /**
       
   148      * Set the default Analyzer implementation used by indexing code.
       
   149      *
       
   150      * @param Zend_Search_Lucene_Analysis_Analyzer $similarity
       
   151      */
       
   152     public static function setDefault(Zend_Search_Lucene_Analysis_Analyzer $analyzer)
       
   153     {
       
   154         self::$_defaultImpl = $analyzer;
       
   155     }
       
   156 
       
   157 
       
   158     /**
       
   159      * Return the default Analyzer implementation used by indexing code.
       
   160      *
       
   161      * @return Zend_Search_Lucene_Analysis_Analyzer
       
   162      */
       
   163     public static function getDefault()
       
   164     {
       
   165         /** Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive */
       
   166         require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common/Text/CaseInsensitive.php';
       
   167 
       
   168         if (!self::$_defaultImpl instanceof Zend_Search_Lucene_Analysis_Analyzer) {
       
   169             self::$_defaultImpl = new Zend_Search_Lucene_Analysis_Analyzer_Common_Text_CaseInsensitive();
       
   170         }
       
   171 
       
   172         return self::$_defaultImpl;
       
   173     }
       
   174 }
       
   175