web/lib/Zend/Search/Lucene/Document/Docx.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Document
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Docx.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 /** Zend_Search_Lucene_Document_OpenXml */
       
    24 require_once 'Zend/Search/Lucene/Document/OpenXml.php';
       
    25 
       
    26 /**
       
    27  * Docx document.
       
    28  *
       
    29  * @category   Zend
       
    30  * @package    Zend_Search_Lucene
       
    31  * @subpackage Document
       
    32  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    33  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    34  */
       
    35 class Zend_Search_Lucene_Document_Docx extends Zend_Search_Lucene_Document_OpenXml {
       
    36     /**
       
    37      * Xml Schema - WordprocessingML
       
    38      *
       
    39      * @var string
       
    40      */
       
    41     const SCHEMA_WORDPROCESSINGML = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main';
       
    42 
       
    43     /**
       
    44      * Object constructor
       
    45      *
       
    46      * @param string  $fileName
       
    47      * @param boolean $storeContent
       
    48      * @throws Zend_Search_Lucene_Exception
       
    49      */
       
    50     private function __construct($fileName, $storeContent) {
       
    51         if (!class_exists('ZipArchive', false)) {
       
    52             require_once 'Zend/Search/Lucene/Exception.php';
       
    53             throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
       
    54         }
       
    55 
       
    56         // Document data holders
       
    57         $documentBody = array();
       
    58         $coreProperties = array();
       
    59 
       
    60         // Open OpenXML package
       
    61         $package = new ZipArchive();
       
    62         $package->open($fileName);
       
    63 
       
    64         // Read relations and search for officeDocument
       
    65         $relationsXml = $package->getFromName('_rels/.rels');
       
    66         if ($relationsXml === false) {
       
    67             require_once 'Zend/Search/Lucene/Exception.php';
       
    68             throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.');
       
    69         }
       
    70         $relations = simplexml_load_string($relationsXml);
       
    71         foreach($relations->Relationship as $rel) {
       
    72             if ($rel ["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
       
    73                 // Found office document! Read in contents...
       
    74                 $contents = simplexml_load_string($package->getFromName(
       
    75                                                                 $this->absoluteZipPath(dirname($rel['Target'])
       
    76                                                               . '/'
       
    77                                                               . basename($rel['Target']))
       
    78                                                                        ));
       
    79 
       
    80                 $contents->registerXPathNamespace('w', Zend_Search_Lucene_Document_Docx::SCHEMA_WORDPROCESSINGML);
       
    81                 $paragraphs = $contents->xpath('//w:body/w:p');
       
    82 
       
    83                 foreach ($paragraphs as $paragraph) {
       
    84                     $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]');
       
    85 
       
    86                     if ($runs === false) {
       
    87                         // Paragraph doesn't contain any text or breaks
       
    88                         continue;
       
    89                     }
       
    90 
       
    91                     foreach ($runs as $run) {
       
    92                      if ($run->getName() == 'br') {
       
    93                          // Break element
       
    94                          $documentBody[] = ' ';
       
    95                      } else {
       
    96                          $documentBody[] = (string)$run;
       
    97                      }
       
    98                     }
       
    99 
       
   100                     // Add space after each paragraph. So they are not bound together.
       
   101                     $documentBody[] = ' ';
       
   102                 }
       
   103 
       
   104                 break;
       
   105             }
       
   106         }
       
   107 
       
   108         // Read core properties
       
   109         $coreProperties = $this->extractMetaData($package);
       
   110 
       
   111         // Close file
       
   112         $package->close();
       
   113 
       
   114         // Store filename
       
   115         $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
       
   116 
       
   117         // Store contents
       
   118         if ($storeContent) {
       
   119             $this->addField(Zend_Search_Lucene_Field::Text('body', implode('', $documentBody), 'UTF-8'));
       
   120         } else {
       
   121             $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
       
   122         }
       
   123 
       
   124         // Store meta data properties
       
   125         foreach ($coreProperties as $key => $value) {
       
   126             $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
       
   127         }
       
   128 
       
   129         // Store title (if not present in meta data)
       
   130         if (! isset($coreProperties['title'])) {
       
   131             $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
       
   132         }
       
   133     }
       
   134 
       
   135     /**
       
   136      * Load Docx document from a file
       
   137      *
       
   138      * @param string  $fileName
       
   139      * @param boolean $storeContent
       
   140      * @return Zend_Search_Lucene_Document_Docx
       
   141      * @throws Zend_Search_Lucene_Document_Exception
       
   142      */
       
   143     public static function loadDocxFile($fileName, $storeContent = false) {
       
   144         if (!is_readable($fileName)) {
       
   145             require_once 'Zend/Search/Lucene/Document/Exception.php';
       
   146             throw new Zend_Search_Lucene_Document_Exception('Provided file \'' . $fileName . '\' is not readable.');
       
   147         }
       
   148 
       
   149         return new Zend_Search_Lucene_Document_Docx($fileName, $storeContent);
       
   150     }
       
   151 }