web/enmi/Zend/Search/Lucene/Document/Pptx.php
changeset 19 1c2f13fd785c
parent 0 4eba9c11703f
equal deleted inserted replaced
18:bd595ad770fc 19:1c2f13fd785c
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Search_Lucene
       
    17  * @subpackage Document
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: Pptx.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 
       
    24 /** Zend_Search_Lucene_Document_OpenXml */
       
    25 require_once 'Zend/Search/Lucene/Document/OpenXml.php';
       
    26 
       
    /**
     * Pptx document.
     *
     * Lucene document built from a PresentationML (.pptx) package. The text runs
     * of every slide, each followed by the text runs of that slide's notes page
     * (when one is linked), are concatenated into the 'body' field. The file name
     * is stored in the 'filename' field, and every property returned by
     * extractMetaData() becomes a field of its own.
     *
     * Instances are created through {@link loadPptxFile()}; the constructor is
     * private.
     *
     * @category   Zend
     * @package    Zend_Search_Lucene
     * @subpackage Document
     * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
     * @license    http://framework.zend.com/license/new-bsd     New BSD License
     */
    class Zend_Search_Lucene_Document_Pptx extends Zend_Search_Lucene_Document_OpenXml
    {
        /**
         * Xml Schema - PresentationML
         *
         * @var string
         */
        const SCHEMA_PRESENTATIONML = 'http://schemas.openxmlformats.org/presentationml/2006/main';

        /**
         * Xml Schema - DrawingML
         *
         * @var string
         */
        const SCHEMA_DRAWINGML = 'http://schemas.openxmlformats.org/drawingml/2006/main';

        /**
         * Xml Schema - Slide relation
         *
         * @var string
         */
        const SCHEMA_SLIDERELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide';

        /**
         * Xml Schema - Slide notes relation
         *
         * @var string
         */
        const SCHEMA_SLIDENOTESRELATION = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide';

        /**
         * Object constructor
         *
         * Opens the package, collects the text of all slides and their notes,
         * reads the package meta data and registers the resulting fields.
         *
         * @param string  $fileName     Path of the .pptx file to index
         * @param boolean $storeContent Whether the 'body' text is stored in the
         *                              index (Text field) or only indexed
         *                              (UnStored field)
         * @throws Zend_Search_Lucene_Exception If the Zip extension is not loaded,
         *                                      the archive cannot be opened, or its
         *                                      root relations part is missing or
         *                                      unparseable
         */
        private function __construct($fileName, $storeContent)
        {
            if (!class_exists('ZipArchive', false)) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('MS Office documents processing functionality requires Zip extension to be loaded');
            }

            // Open OpenXML package. ZipArchive::open() returns true on success and
            // an error code otherwise, so the comparison has to be strict.
            $package = new ZipArchive();
            if ($package->open($fileName) !== true) {
                require_once 'Zend/Search/Lucene/Exception.php';
                throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .pptx file.');
            }

            // Document data holders
            $documentBody   = array();
            $coreProperties = array();

            try {
                // Read relations and search for officeDocument
                $relationsXml = $package->getFromName('_rels/.rels');
                $relations    = ($relationsXml === false) ? false : simplexml_load_string($relationsXml);
                if ($relations === false) {
                    require_once 'Zend/Search/Lucene/Exception.php';
                    throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .pptx file.');
                }

                list($slides, $slideNotes) = $this->_readSlides($package, $relations);

                // Sort slides by their relationship number. Notes are looked up
                // by the same key, so they need no sorting of their own.
                ksort($slides);

                // Extract contents from slides, each followed by its notes
                foreach ($slides as $slideKey => $slide) {
                    $documentBody = array_merge($documentBody, $this->_extractText($slide));

                    if (isset($slideNotes[$slideKey])) {
                        $documentBody = array_merge($documentBody, $this->_extractText($slideNotes[$slideKey]));
                    }
                }

                // Read core properties
                $coreProperties = $this->extractMetaData($package);
            } catch (Exception $e) {
                // Release the archive handle before propagating the failure
                $package->close();
                throw $e;
            }

            // Close file
            $package->close();

            // Store filename
            $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));

            // Store contents
            if ($storeContent) {
                $this->addField(Zend_Search_Lucene_Field::Text('body', implode(' ', $documentBody), 'UTF-8'));
            } else {
                $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode(' ', $documentBody), 'UTF-8'));
            }

            // Store meta data properties
            foreach ($coreProperties as $key => $value) {
                $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
            }

            // Store title (if not present in meta data)
            if (!isset($coreProperties['title'])) {
                $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
            }
        }

        /**
         * Read one XML part of the package.
         *
         * Callers must compare the result with "=== false": a SimpleXMLElement
         * without children is falsy in a boolean context.
         *
         * @param ZipArchive $package  Opened package
         * @param string     $partName Part path; it is passed through
         *                             absoluteZipPath() before the lookup
         * @return SimpleXMLElement|false False if the part is missing or cannot
         *                                be parsed
         */
        private function _loadXmlPart(ZipArchive $package, $partName)
        {
            $contents = $package->getFromName($this->absoluteZipPath($partName));
            if ($contents === false) {
                return false;
            }

            return simplexml_load_string($contents);
        }

        /**
         * Locate the office document in the root relations and load its slides
         * together with the notes page linked to each slide.
         *
         * Both returned arrays are keyed by the slide relationship id with the
         * 'rId' prefix removed. Parts that are missing or unparseable are skipped
         * so that one damaged slide does not abort indexing of the whole file.
         *
         * @param ZipArchive       $package   Opened package
         * @param SimpleXMLElement $relations Parsed '_rels/.rels' part
         * @return array Two elements: array of slides, array of slide notes
         */
        private function _readSlides(ZipArchive $package, SimpleXMLElement $relations)
        {
            $slides     = array();
            $slideNotes = array();

            foreach ($relations->Relationship as $rel) {
                if ((string) $rel['Type'] !== Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) {
                    continue;
                }

                // Found office document! Search for slides...
                $documentTarget = (string) $rel['Target'];
                $documentDir    = dirname($documentTarget);

                $slideRelations = $this->_loadXmlPart(
                    $package,
                    $documentDir . '/_rels/' . basename($documentTarget) . '.rels'
                );
                if ($slideRelations === false) {
                    // No readable relations for the presentation: no slides
                    break;
                }

                foreach ($slideRelations->Relationship as $slideRel) {
                    if ((string) $slideRel['Type'] !== self::SCHEMA_SLIDERELATION) {
                        continue;
                    }

                    // Found slide!
                    $slideTarget = (string) $slideRel['Target'];
                    $slideDir    = $documentDir . '/' . dirname($slideTarget);
                    $slideKey    = str_replace('rId', '', (string) $slideRel['Id']);

                    $slide = $this->_loadXmlPart($package, $slideDir . '/' . basename($slideTarget));
                    if ($slide === false) {
                        continue;
                    }
                    $slides[$slideKey] = $slide;

                    // Search for slide notes
                    $slideNotesRelations = $this->_loadXmlPart(
                        $package,
                        $slideDir . '/_rels/' . basename($slideTarget) . '.rels'
                    );
                    if ($slideNotesRelations === false) {
                        continue;
                    }

                    foreach ($slideNotesRelations->Relationship as $slideNoteRel) {
                        if ((string) $slideNoteRel['Type'] !== self::SCHEMA_SLIDENOTESRELATION) {
                            continue;
                        }

                        // Found slide notes!
                        $noteTarget = (string) $slideNoteRel['Target'];
                        $slideNote  = $this->_loadXmlPart(
                            $package,
                            $slideDir . '/' . dirname($noteTarget) . '/' . basename($noteTarget)
                        );
                        if ($slideNote !== false) {
                            $slideNotes[$slideKey] = $slideNote;
                        }

                        // Only the first notes relation of a slide is used
                        break;
                    }
                }

                // Only the first office document relation is processed
                break;
            }

            return array($slides, $slideNotes);
        }

        /**
         * Collect the contents of all DrawingML text elements (a:t) of a part.
         *
         * @param SimpleXMLElement $part Parsed slide or slide notes part
         * @return array List of strings in document order
         */
        private function _extractText(SimpleXMLElement $part)
        {
            // Register namespaces
            $part->registerXPathNamespace('p', self::SCHEMA_PRESENTATIONML);
            $part->registerXPathNamespace('a', self::SCHEMA_DRAWINGML);

            // Fetch all text
            $texts        = array();
            $textElements = $part->xpath('//a:t');
            if (is_array($textElements)) {
                foreach ($textElements as $textElement) {
                    $texts[] = (string) $textElement;
                }
            }

            return $texts;
        }

        /**
         * Load Pptx document from a file
         *
         * @param string  $fileName     Path of the .pptx file to index
         * @param boolean $storeContent Whether the 'body' text is stored in the
         *                              index or only indexed
         * @return Zend_Search_Lucene_Document_Pptx
         * @throws Zend_Search_Lucene_Exception See the constructor
         */
        public static function loadPptxFile($fileName, $storeContent = false)
        {
            return new Zend_Search_Lucene_Document_Pptx($fileName, $storeContent);
        }
    }