diff -r 5b37998e522e -r 162c1de6545a web/lib/Zend/Search/Lucene/Document/Docx.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/lib/Zend/Search/Lucene/Document/Docx.php Fri Mar 11 15:05:35 2011 +0100 @@ -0,0 +1,151 @@ +open($fileName); + + // Read relations and search for officeDocument + $relationsXml = $package->getFromName('_rels/.rels'); + if ($relationsXml === false) { + require_once 'Zend/Search/Lucene/Exception.php'; + throw new Zend_Search_Lucene_Exception('Invalid archive or corrupted .docx file.'); + } + $relations = simplexml_load_string($relationsXml); + foreach($relations->Relationship as $rel) { + if ($rel ["Type"] == Zend_Search_Lucene_Document_OpenXml::SCHEMA_OFFICEDOCUMENT) { + // Found office document! Read in contents... + $contents = simplexml_load_string($package->getFromName( + $this->absoluteZipPath(dirname($rel['Target']) + . '/' + . basename($rel['Target'])) + )); + + $contents->registerXPathNamespace('w', Zend_Search_Lucene_Document_Docx::SCHEMA_WORDPROCESSINGML); + $paragraphs = $contents->xpath('//w:body/w:p'); + + foreach ($paragraphs as $paragraph) { + $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]'); + + if ($runs === false) { + // Paragraph doesn't contain any text or breaks + continue; + } + + foreach ($runs as $run) { + if ($run->getName() == 'br') { + // Break element + $documentBody[] = ' '; + } else { + $documentBody[] = (string)$run; + } + } + + // Add space after each paragraph. So they are not bound together. 
+                    $documentBody[] = ' ';
+                }
+
+                break;
+            }
+        }
+
+        // Read core properties
+        $coreProperties = $this->extractMetaData($package);
+
+        // Close file
+        $package->close();
+
+        // Store filename
+        $this->addField(Zend_Search_Lucene_Field::Text('filename', $fileName, 'UTF-8'));
+
+        // Store contents
+        if ($storeContent) {
+            $this->addField(Zend_Search_Lucene_Field::Text('body', implode('', $documentBody), 'UTF-8'));
+        } else {
+            $this->addField(Zend_Search_Lucene_Field::UnStored('body', implode('', $documentBody), 'UTF-8'));
+        }
+
+        // Store meta data properties
+        foreach ($coreProperties as $key => $value) {
+            $this->addField(Zend_Search_Lucene_Field::Text($key, $value, 'UTF-8'));
+        }
+
+        // Store title (if not present in meta data)
+        if (! isset($coreProperties['title'])) {
+            $this->addField(Zend_Search_Lucene_Field::Text('title', $fileName, 'UTF-8'));
+        }
+    }
+
+    /**
+     * Build a Docx search document from a file on disk.
+     *
+     * The path is checked with is_readable() first; only a readable path is
+     * handed to the Zend_Search_Lucene_Document_Docx constructor.
+     *
+     * @param string  $fileName     Path of the .docx file to index.
+     * @param boolean $storeContent Forwarded to the constructor, which adds the
+     *                              body as a Text field when true and as an
+     *                              UnStored field otherwise (default: false).
+     * @return Zend_Search_Lucene_Document_Docx
+     * @throws Zend_Search_Lucene_Document_Exception When $fileName is not readable.
+     */
+    public static function loadDocxFile($fileName, $storeContent = false) {
+        // Happy path first: a readable file goes straight to the constructor.
+        if (is_readable($fileName)) {
+            return new Zend_Search_Lucene_Document_Docx($fileName, $storeContent);
+        }
+
+        // The exception class is only loaded when it is actually needed.
+        require_once 'Zend/Search/Lucene/Document/Exception.php';
+        $message = 'Provided file \'' . $fileName . '\' is not readable.';
+
+        throw new Zend_Search_Lucene_Document_Exception($message);
+    }
+}