web/enmi/Zend/Search/Lucene/Document/Xlsx.php
changeset 19 1c2f13fd785c
parent 0 4eba9c11703f
equal deleted inserted replaced
18:bd595ad770fc 19:1c2f13fd785c
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Document
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Xlsx.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 
       
    24 /** Zend_Search_Lucene_Document_OpenXml */
       
    25 require_once 'Zend/Search/Lucene/Document/OpenXml.php';
       
    26 
       
    27 /**
       
    28  * Xlsx document.
       
    29  *
       
    30  * @category   Zend
       
    31  * @package    Zend_Search_Lucene
       
    32  * @subpackage Document
       
    33  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    34  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    35  */
       
    36 class Zend_Search_Lucene_Document_Xlsx extends Zend_Search_Lucene_Document_OpenXml
       
    37 {
       
    38     /**
       
    39      * Xml Schema - SpreadsheetML
       
    40      *
       
    41      * @var string
       
    42      */
       
    43     const SCHEMA_SPREADSHEETML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
       
    44 
       
    45     /**
       
    46      * Xml Schema - DrawingML
       
    47      *
       
    48      * @var string
       
    49      */
       
    50     const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';
       
    51 
       
    52     /**
       
    53      * Xml Schema - Shared Strings
       
    54      *
       
    55      * @var string
       
    56      */
       
    57     const SCHEMA_SHAREDSTRINGS = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings';
       
    58 
       
    59     /**
       
    60      * Xml Schema - Worksheet relation
       
    61      *
       
    62      * @var string
       
    63      */
       
    64     const SCHEMA_WORKSHEETRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet';
       
    65 
       
    66     /**
       
    67      * Xml Schema - Slide notes relation
       
    68      *
       
    69      * @var string
       
    70      */
       
    71     const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';
       
    72 
       
    73     /**
       
    74      * Object constructor
       
    75      *
       
    76      * @param string  $fileName
       
    77      * @param boolean $storeContent
       
    78      * @throws Zend_Search_Lucene_Exception
       
    79      */
       
    80     private function __construct($fileName, $storeContent)
       
    81     {
       
    82         if (!class_exists('ZipArchive', false)) {
       
    83             require_once 'Zend/Search/Lucene/Exception.php';
       
    84             throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
       
    85         }
       
    86 
       
    87         // Document data holders
       
    88         $sharedStrings = array();
       
    89         $worksheets = array();
       
    90         $documentBody = array();
       
    91         $coreProperties = array();
       
    92 
       
    93         // Open OpenXML package
       
    94         $package = new ZipArchive();
       
    95         $package->open($fileName);
       
    96 
       
    97         // Read relations and search for officeDocument
       
    98         $relationsXml = $package->getFromName('_rels/.rels');
       
    99         if ($relationsXml === false) {
       
   100             require_once 'Zend/Search/Lucene/Exception.php';
       
   101             throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .xlsx file.');
       
   102         }
       
   103         $relations = simplexml_load_string($relationsXml);
       
   104         foreach ($relations->Relationship as $rel) {
       
   105             if ($rel["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
       
   106                 // Found office document! Read relations for workbook...
       
   107                 $workbookRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) );
       
   108                 $workbookRelations->registerXPathNamespace("rel", Zend_Search_Lucene_Document_OpenXml::SCHEMA_RELATIONSHIP);
       
   109 
       
   110                 // Read shared strings
       
   111                 $sharedStringsPath = $workbookRelations->xpath("rel:Relationship[@Type='" . Zend_Search_Lucene_Document_Xlsx::SCHEMA_SHAREDSTRINGS . "']");
       
   112                 $sharedStringsPath = (string)$sharedStringsPath[0]['Target'];
       
   113                 $xmlStrings = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . $sharedStringsPath)) );
       
   114                 if (isset($xmlStrings) && isset($xmlStrings->si)) {
       
   115                     foreach ($xmlStrings->si as $val) {
       
   116                         if (isset($val->t)) {
       
   117                             $sharedStrings[] = (string)$val->t;
       
   118                         } elseif (isset($val->r)) {
       
   119                             $sharedStrings[] = $this->_parseRichText($val);
       
   120                         }
       
   121                     }
       
   122                 }
       
   123 
       
   124                 // Loop relations for workbook and extract worksheets...
       
   125                 foreach ($workbookRelations->Relationship as $workbookRelation) {
       
   126                     if ($workbookRelation["Type"] == Zend_Search_Lucene_Document_Xlsx::SCHEMA_WORKSHEETRELATION) {
       
   127                         $worksheets[ str_replace( 'rId', '', (string)$workbookRelation["Id"]) ] = simplexml_load_string(
       
   128                             $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($workbookRelation["Target"]) . "/" . basename($workbookRelation["Target"])) )
       
   129                         );
       
   130                     }
       
   131                 }
       
   132 
       
   133                 break;
       
   134             }
       
   135         }
       
   136 
       
   137         // Sort worksheets
       
   138         ksort($worksheets);
       
   139 
       
   140         // Extract contents from worksheets
       
   141         foreach ($worksheets as $sheetKey => $worksheet) {
       
   142             foreach ($worksheet->sheetData->row as $row) {
       
   143                 foreach ($row->c as $c) {
       
   144                     // Determine data type
       
   145                     $dataType = (string)$c["t"];
       
   146                     switch ($dataType) {
       
   147                         case "s":
       
   148                             // Value is a shared string
       
   149                             if ((string)$c->v != '') {
       
   150                                 $value = $sharedStrings[intval($c->v)];
       
   151                             } else {
       
   152                                 $value = '';
       
   153                             }
       
   154 
       
   155                             break;
       
   156 
       
   157                         case "b":
       
   158                             // Value is boolean
       
   159                             $value = (string)$c->v;
       
   160                             if ($value == '0') {
       
   161                                 $value = false;
       
   162                             } else if ($value == '1') {
       
   163                                 $value = true;
       
   164                             } else {
       
   165                                 $value = (bool)$c->v;
       
   166                             }
       
   167 
       
   168                             break;
       
   169 
       
   170                         case "inlineStr":
       
   171                             // Value is rich text inline
       
   172                             $value = $this->_parseRichText($c->is);
       
   173 
       
   174                             break;
       
   175 
       
   176                         case "e":
       
   177                             // Value is an error message
       
   178                             if ((string)$c->v != '') {
       
   179                                 $value = (string)$c->v;
       
   180                             } else {
       
   181                                 $value = '';
       
   182                             }
       
   183 
       
   184                             break;
       
   185 
       
   186                         default:
       
   187                             // Value is a string
       
   188                             $value = (string)$c->v;
       
   189 
       
   190                             // Check for numeric values
       
   191                             if (is_numeric($value) && $dataType != 's') {
       
   192                                 if ($value == (int)$value) $value = (int)$value;
       
   193                                 elseif ($value == (float)$value) $value = (float)$value;
       
   194                                 elseif ($value == (double)$value) $value = (double)$value;
       
   195                             }
       
   196                     }
       
   197 
       
   198                     $documentBody[] = $value;
       
   199                 }
       
   200             }
       
   201         }
       
   202 
       
   203         // Read core properties
       
   204         $coreProperties = $this->extractMetaData($package);
       
   205 
       
   206         // Close file
       
   207         $package->close();
       
   208 
       
   209         // Store filename
       
   210         $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
       
   211 
       
   212         // Store contents
       
   213         if ($storeContent) {
       
   214             $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
       
   215         } else {
       
   216             $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
       
   217         }
       
   218 
       
   219         // Store meta data properties
       
   220         foreach ($coreProperties as $key => $value)
       
   221         {
       
   222             $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
       
   223         }
       
   224 
       
   225         // Store title (if not present in meta data)
       
   226         if (!isset($coreProperties['title']))
       
   227         {
       
   228             $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
       
   229         }
       
   230     }
       
   231 
       
   232     /**
       
   233      * Parse rich text XML
       
   234      *
       
   235      * @param SimpleXMLElement $is
       
   236      * @return string
       
   237      */
       
   238     private function _parseRichText($is = null) {
       
   239         $value = array();
       
   240 
       
   241         if (isset($is->t)) {
       
   242             $value[] = (string)$is->t;
       
   243         } else {
       
   244             foreach ($is->r as $run) {
       
   245                 $value[] = (string)$run->t;
       
   246             }
       
   247         }
       
   248 
       
   249         return implode('', $value);
       
   250     }
       
   251 
       
   252     /**
       
   253      * Load Xlsx document from a file
       
   254      *
       
   255      * @param string  $fileName
       
   256      * @param boolean $storeContent
       
   257      * @return Zend_Search_Lucene_Document_Xlsx
       
   258      */
       
   259     public static function loadXlsxFile($fileName, $storeContent = false)
       
   260     {
       
   261         return new Zend_Search_Lucene_Document_Xlsx($fileName, $storeContent);
       
   262     }
       
   263 }