web/lib/Zend/Search/Lucene/Index/SegmentWriter/DocumentWriter.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Index
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: DocumentWriter.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 /** Zend_Search_Lucene_Index_SegmentWriter */
       
    24 require_once 'Zend/Search/Lucene/Index/SegmentWriter.php';
       
    25 
       
    /**
     * Segment writer that builds a segment from documents added one at a time.
     *
     * Term postings are accumulated in memory ($_termDictionary / $_termDocs) by
     * addDocument() and are only dumped to the dictionary files when close() is
     * called.
     *
     * @category   Zend
     * @package    Zend_Search_Lucene
     * @subpackage Index
     * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
     * @license    http://framework.zend.com/license/new-bsd     New BSD License
     */
    class Zend_Search_Lucene_Index_SegmentWriter_DocumentWriter extends Zend_Search_Lucene_Index_SegmentWriter
    {
        /**
         * Term Dictionary
         * Array of the Zend_Search_Lucene_Index_Term objects, keyed by Zend_Search_Lucene_Index_Term::key()
         *
         * @var array
         */
        protected $_termDictionary;

        /**
         * Documents, which contain the term
         * Layout: term key => array(document number => array of term positions)
         *
         * @var array
         */
        protected $_termDocs;

        /**
         * Object constructor.
         *
         * @param Zend_Search_Lucene_Storage_Directory $directory
         * @param string $name
         */
        public function __construct(Zend_Search_Lucene_Storage_Directory $directory, $name)
        {
            parent::__construct($directory, $name);

            $this->_termDocs       = array();
            $this->_termDictionary = array();
        }


        /**
         * Adds a document to this segment.
         *
         * Every indexed field contributes its terms (with positions) to the in-memory
         * postings and one norm byte to $this->_norms; stored fields are handed to
         * addStoredFields().
         *
         * All fields are validated before anything is recorded, so a rejected document
         * leaves the writer's state untouched.
         *
         * @param Zend_Search_Lucene_Document $document
         * @throws Zend_Search_Lucene_Exception if any field requests term vector storage
         */
        public function addDocument(Zend_Search_Lucene_Document $document)
        {
            /** Zend_Search_Lucene_Search_Similarity */
            require_once 'Zend/Search/Lucene/Search/Similarity.php';

            // Pass 1: resolve and validate all fields before mutating any state.
            // Throwing from inside the indexing loop would leave postings of the
            // earlier fields filed under the current $this->_docCount.
            // NOTE(review): presumably the document number is only advanced once the
            // document has been fully written (nothing in this method changes it) -
            // verify against the parent class.
            $fields = array();
            foreach ($document->getFieldNames() as $fieldName) {
                $field = $document->getField($fieldName);

                if ($field->storeTermVector) {
                    /**
                     * @todo term vector storing support
                     */
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Store term vector functionality is not supported yet.');
                }

                $fields[] = $field;
            }

            $storedFields = array();
            $docNorms     = array();
            $similarity   = Zend_Search_Lucene_Search_Similarity::getDefault();

            // Pass 2: collect postings, norms and stored fields.
            foreach ($fields as $field) {
                if ($field->isIndexed) {
                    $termCount = $field->isTokenized ? $this->_indexTokenizedField($field)
                                                     : $this->_indexUntokenizedField($field);

                    if ($termCount == 0) {
                        // Field contains empty value. Treat it as non-indexed and non-tokenized.
                        // Work on a copy so the caller's field object is not modified.
                        $field = clone($field);
                        $field->isIndexed = $field->isTokenized = false;
                    } else {
                        $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name,
                                                                                                       $termCount)*
                                                                               $document->boost*
                                                                               $field->boost ));
                    }
                }

                if ($field->isStored) {
                    $storedFields[] = $field;
                }

                $this->addField($field);
            }

            // Every indexed field known to the segment gets exactly one norm byte per
            // document, whether or not this particular document contains the field.
            foreach ($this->_fields as $fieldName => $field) {
                if (!$field->isIndexed) {
                    continue;
                }

                if (!isset($this->_norms[$fieldName])) {
                    // Field seen for the first time: back-fill the documents added so far.
                    $this->_norms[$fieldName] = str_repeat($this->_emptyFieldNorm($similarity, $fieldName),
                                                           $this->_docCount);
                }

                if (isset($docNorms[$fieldName])) {
                    $this->_norms[$fieldName] .= $docNorms[$fieldName];
                } else {
                    $this->_norms[$fieldName] .= $this->_emptyFieldNorm($similarity, $fieldName);
                }
            }

            $this->addStoredFields($storedFields);
        }


        /**
         * Runs the default analyzer over a tokenized field and records every token.
         *
         * Positions are the running sum of the tokens' position increments, starting
         * from 0, so the first token is stored at its own increment.
         *
         * @param Zend_Search_Lucene_Field $field
         * @return integer number of tokens produced by the analyzer (0 for an empty field)
         */
        private function _indexTokenizedField($field)
        {
            /** Zend_Search_Lucene_Analysis_Analyzer */
            require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';

            $analyzer = Zend_Search_Lucene_Analysis_Analyzer::getDefault();
            $analyzer->setInput($field->value, $field->encoding);

            $position     = 0;
            $tokenCounter = 0;
            while (($token = $analyzer->nextToken()) !== null) {
                $tokenCounter++;

                $position += $token->getPositionIncrement();
                $this->_addTermPosition(new Zend_Search_Lucene_Index_Term($token->getTermText(), $field->name),
                                        $position);
            }

            return $tokenCounter;
        }


        /**
         * Records the whole UTF-8 value of an untokenized field as one term at position 0.
         *
         * @param Zend_Search_Lucene_Field $field
         * @return integer 1 if a term was recorded, 0 if the field value is empty
         */
        private function _indexUntokenizedField($field)
        {
            $fieldUtf8Value = $field->getUtf8Value();

            // Loose comparison kept on purpose: it is the original emptiness test.
            if ($fieldUtf8Value == '') {
                return 0;
            }

            $this->_addTermPosition(new Zend_Search_Lucene_Index_Term($fieldUtf8Value, $field->name), 0);

            return 1;
        }


        /**
         * Registers one occurrence of a term in the document currently being added.
         *
         * @param Zend_Search_Lucene_Index_Term $term
         * @param integer $position
         */
        private function _addTermPosition(Zend_Search_Lucene_Index_Term $term, $position)
        {
            $termKey = $term->key();

            if (!isset($this->_termDictionary[$termKey])) {
                // New term
                $this->_termDictionary[$termKey] = $term;
                $this->_termDocs[$termKey]       = array();
            }

            if (!isset($this->_termDocs[$termKey][$this->_docCount])) {
                // First occurrence of the term in this document
                $this->_termDocs[$termKey][$this->_docCount] = array();
            }

            $this->_termDocs[$termKey][$this->_docCount][] = $position;
        }


        /**
         * Returns the encoded norm byte used for a document in which the field has no terms.
         *
         * @param Zend_Search_Lucene_Search_Similarity $similarity
         * @param string $fieldName
         * @return string single character
         */
        private function _emptyFieldNorm($similarity, $fieldName)
        {
            return chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) ));
        }


        /**
         * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files
         */
        protected function _dumpDictionary()
        {
            // Terms are passed to addTerm() in string order of their keys.
            ksort($this->_termDictionary, SORT_STRING);

            $this->initializeDictionaryFiles();

            foreach ($this->_termDictionary as $termId => $term) {
                $this->addTerm($term, $this->_termDocs[$termId]);
            }

            $this->closeDictionaryFiles();
        }


        /**
         * Close segment, write it to disk and return segment info
         *
         * Nothing is written for a segment without documents.
         *
         * @return Zend_Search_Lucene_Index_SegmentInfo|null  null if no documents were added
         */
        public function close()
        {
            if ($this->_docCount == 0) {
                return null;
            }

            $this->_dumpFNM();
            $this->_dumpDictionary();

            $this->_generateCFS();

            /** Zend_Search_Lucene_Index_SegmentInfo */
            require_once 'Zend/Search/Lucene/Index/SegmentInfo.php';

            // NOTE(review): the meaning of the trailing -1, null, true, true arguments
            // is defined by the SegmentInfo constructor, which is not visible here.
            return new Zend_Search_Lucene_Index_SegmentInfo($this->_directory,
                                                            $this->_name,
                                                            $this->_docCount,
                                                            -1,
                                                            null,
                                                            true,
                                                            true);
        }

    }
       
   230