web/lib/Zend/Pdf/FileParser.php
changeset 64 162c1de6545a
parent 19 1c2f13fd785c
child 68 ecaf28ffe26e
equal deleted inserted replaced
63:5b37998e522e 64:162c1de6545a
       
     1 <?php
       
     2 /**
       
     3  * Zend Framework
       
     4  *
       
     5  * LICENSE
       
     6  *
       
     7  * This source file is subject to the new BSD license that is bundled
       
     8  * with this package in the file LICENSE.txt.
       
     9  * It is also available through the world-wide-web at this URL:
       
    10  * http://framework.zend.com/license/new-bsd
       
    11  * If you did not receive a copy of the license and are unable to
       
    12  * obtain it through the world-wide-web, please send an email
       
    13  * to license@zend.com so we can send you a copy immediately.
       
    14  *
       
    15  * @category   Zend
       
    16  * @package    Zend_Pdf
       
    17  * @subpackage FileParser
       
    18  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    19  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    20  * @version    $Id: FileParser.php 20096 2010-01-06 02:05:09Z bkarwin $
       
    21  */
       
    22 
       
    23 /**
       
    24  * Abstract utility class for parsing binary files.
       
    25  *
       
    26  * Provides a library of methods to quickly navigate and extract various data
       
    27  * types (signed and unsigned integers, floating- and fixed-point numbers,
       
    28  * strings, etc.) from the file.
       
    29  *
       
    30  * File access is managed via a {@link Zend_Pdf_FileParserDataSource} object.
       
    31  * This allows the same parser code to work with many different data sources:
       
    32  * in-memory objects, filesystem files, etc.
       
    33  *
       
    34  * @package    Zend_Pdf
       
    35  * @subpackage FileParser
       
    36  * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
       
    37  * @license    http://framework.zend.com/license/new-bsd     New BSD License
       
    38  */
       
    39 abstract class Zend_Pdf_FileParser
       
    40 {
       
    41   /**** Class Constants ****/
       
    42 
       
    43     /**
       
    44      * Little-endian byte order (0x04 0x03 0x02 0x01).
       
    45      */
       
    46     const BYTE_ORDER_LITTLE_ENDIAN = 0;
       
    47 
       
    48     /**
       
    49      * Big-endian byte order (0x01 0x02 0x03 0x04).
       
    50      */
       
    51     const BYTE_ORDER_BIG_ENDIAN    = 1;
       
    52 
       
    53 
       
    54 
       
    55   /**** Instance Variables ****/
       
    56 
       
    57 
       
    58     /**
       
    59      * Flag indicating that the file has passed a cursory validation check.
       
    60      * @var boolean
       
    61      */
       
    62     protected $_isScreened = false;
       
    63 
       
    64     /**
       
     65      * Flag indicating that the file has been successfully parsed.
       
    66      * @var boolean
       
    67      */
       
    68     protected $_isParsed = false;
       
    69 
       
    70     /**
       
    71      * Object representing the data source to be parsed.
       
    72      * @var Zend_Pdf_FileParserDataSource
       
    73      */
       
    74     protected $_dataSource = null;
       
    75 
       
    76 
       
    77 
       
    78   /**** Public Interface ****/
       
    79 
       
    80 
       
    81   /* Abstract Methods */
       
    82 
       
    83     /**
       
    84      * Performs a cursory check to verify that the binary file is in the expected
       
    85      * format. Intended to quickly weed out obviously bogus files.
       
    86      *
       
    87      * Must set $this->_isScreened to true if successful.
       
    88      *
       
    89      * @throws Zend_Pdf_Exception
       
    90      */
       
    91     abstract public function screen();
       
    92 
       
    93     /**
       
    94      * Reads and parses the complete binary file.
       
    95      *
       
    96      * Must set $this->_isParsed to true if successful.
       
    97      *
       
    98      * @throws Zend_Pdf_Exception
       
    99      */
       
   100     abstract public function parse();
       
   101 
       
   102 
       
   103   /* Object Lifecycle */
       
   104 
       
   105     /**
       
   106      * Object constructor.
       
   107      *
       
   108      * Verifies that the data source has been properly initialized.
       
   109      *
       
   110      * @param Zend_Pdf_FileParserDataSource $dataSource
       
   111      * @throws Zend_Pdf_Exception
       
   112      */
       
   113     public function __construct(Zend_Pdf_FileParserDataSource $dataSource)
       
   114     {
       
   115         if ($dataSource->getSize() == 0) {
       
   116             require_once 'Zend/Pdf/Exception.php';
       
   117             throw new Zend_Pdf_Exception('The data source has not been properly initialized',
       
   118                                          Zend_Pdf_Exception::BAD_DATA_SOURCE);
       
   119         }
       
   120         $this->_dataSource = $dataSource;
       
   121     }
       
   122 
       
   123     /**
       
   124      * Object destructor.
       
   125      *
       
   126      * Discards the data source object.
       
   127      */
       
   128     public function __destruct()
       
   129     {
       
   130         $this->_dataSource = null;
       
   131     }
       
   132 
       
   133 
       
   134   /* Accessors */
       
   135 
       
   136     /**
       
   137      * Returns true if the file has passed a cursory validation check.
       
   138      *
       
   139      * @return boolean
       
   140      */
       
   141     public function isScreened()
       
   142     {
       
   143         return $this->_isScreened;
       
   144     }
       
   145 
       
   146     /**
       
   147      * Returns true if the file has been successfully parsed.
       
   148      *
       
   149      * @return boolean
       
   150      */
       
   151     public function isParsed()
       
   152     {
       
   153         return $this->_isParsed;
       
   154     }
       
   155 
       
   156     /**
       
   157      * Returns the data source object representing the file being parsed.
       
   158      *
       
   159      * @return Zend_Pdf_FileParserDataSource
       
   160      */
       
   161     public function getDataSource()
       
   162     {
       
   163         return $this->_dataSource;
       
   164     }
       
   165 
       
   166 
       
   167   /* Primitive Methods */
       
   168 
       
   169     /**
       
   170      * Convenience wrapper for the data source object's moveToOffset() method.
       
   171      *
       
   172      * @param integer $offset Destination byte offset.
       
   173      * @throws Zend_Pdf_Exception
       
   174      */
       
   175     public function moveToOffset($offset)
       
   176     {
       
   177         $this->_dataSource->moveToOffset($offset);
       
   178     }
       
   179 
       
   180     public function getOffset() {
       
   181        return $this->_dataSource->getOffset();
       
   182     }
       
   183 
       
   184     public function getSize() {
       
   185        return $this->_dataSource->getSize();
       
   186     }
       
   187 
       
   188     /**
       
   189      * Convenience wrapper for the data source object's readBytes() method.
       
   190      *
       
   191      * @param integer $byteCount Number of bytes to read.
       
   192      * @return string
       
   193      * @throws Zend_Pdf_Exception
       
   194      */
       
   195     public function readBytes($byteCount)
       
   196     {
       
   197         return $this->_dataSource->readBytes($byteCount);
       
   198     }
       
   199 
       
   200     /**
       
   201      * Convenience wrapper for the data source object's skipBytes() method.
       
   202      *
       
   203      * @param integer $byteCount Number of bytes to skip.
       
   204      * @throws Zend_Pdf_Exception
       
   205      */
       
   206     public function skipBytes($byteCount)
       
   207     {
       
   208         $this->_dataSource->skipBytes($byteCount);
       
   209     }
       
   210 
       
   211 
       
   212   /* Parser Methods */
       
   213 
       
   214     /**
       
   215      * Reads the signed integer value from the binary file at the current byte
       
   216      * offset.
       
   217      *
       
   218      * Advances the offset by the number of bytes read. Throws an exception if
       
   219      * an error occurs.
       
   220      *
       
   221      * @param integer $size Size of integer in bytes: 1-4
       
   222      * @param integer $byteOrder (optional) Big- or little-endian byte order.
       
   223      *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       
   224      *   If omitted, uses big-endian.
       
   225      * @return integer
       
   226      * @throws Zend_Pdf_Exception
       
   227      */
       
   228     public function readInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
       
   229     {
       
   230         if (($size < 1) || ($size > 4)) {
       
   231             require_once 'Zend/Pdf/Exception.php';
       
   232             throw new Zend_Pdf_Exception("Invalid signed integer size: $size",
       
   233                                          Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
       
   234         }
       
   235         $bytes = $this->_dataSource->readBytes($size);
       
   236         /* unpack() will not work for this method because it always works in
       
   237          * the host byte order for signed integers. It also does not allow for
       
   238          * variable integer sizes.
       
   239          */
       
   240         if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
       
   241             $number = ord($bytes[0]);
       
   242             if (($number & 0x80) == 0x80) {
       
   243                 /* This number is negative. Extract the positive equivalent.
       
   244                  */
       
   245                 $number = (~ $number) & 0xff;
       
   246                 for ($i = 1; $i < $size; $i++) {
       
   247                     $number = ($number << 8) | ((~ ord($bytes[$i])) & 0xff);
       
   248                 }
       
   249                 /* Now turn this back into a negative number by taking the
       
   250                  * two's complement (we didn't add one above so won't
       
   251                  * subtract it below). This works reliably on both 32- and
       
   252                  * 64-bit systems.
       
   253                  */
       
   254                 $number = ~$number;
       
   255             } else {
       
   256                 for ($i = 1; $i < $size; $i++) {
       
   257                     $number = ($number << 8) | ord($bytes[$i]);
       
   258                 }
       
   259             }
       
   260         } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
       
   261             $number = ord($bytes[$size - 1]);
       
   262             if (($number & 0x80) == 0x80) {
       
   263                 /* Negative number. See discussion above.
       
   264                  */
       
   265                 $number = 0;
       
   266                 for ($i = --$size; $i >= 0; $i--) {
       
   267                     $number |= ((~ ord($bytes[$i])) & 0xff) << ($i * 8);
       
   268                 }
       
   269                 $number = ~$number;
       
   270             } else {
       
   271                 $number = 0;
       
   272                 for ($i = --$size; $i >= 0; $i--) {
       
   273                     $number |= ord($bytes[$i]) << ($i * 8);
       
   274                 }
       
   275             }
       
   276         } else {
       
   277             require_once 'Zend/Pdf/Exception.php';
       
   278             throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
       
   279                                          Zend_Pdf_Exception::INVALID_BYTE_ORDER);
       
   280         }
       
   281         return $number;
       
   282     }
       
   283 
       
   284     /**
       
   285      * Reads the unsigned integer value from the binary file at the current byte
       
   286      * offset.
       
   287      *
       
   288      * Advances the offset by the number of bytes read. Throws an exception if
       
   289      * an error occurs.
       
   290      *
       
   291      * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the
       
   292      * resulting value WILL BE SIGNED because PHP uses signed integers internally
       
    293      * for everything. To guarantee portability, be sure to use bitwise
       
   294      * operators on large unsigned integers!
       
   295      *
       
   296      * @param integer $size Size of integer in bytes: 1-4
       
   297      * @param integer $byteOrder (optional) Big- or little-endian byte order.
       
   298      *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       
   299      *   If omitted, uses big-endian.
       
   300      * @return integer
       
   301      * @throws Zend_Pdf_Exception
       
   302      */
       
   303     public function readUInt($size, $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
       
   304     {
       
   305         if (($size < 1) || ($size > 4)) {
       
   306             require_once 'Zend/Pdf/Exception.php';
       
   307             throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size",
       
   308                                          Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
       
   309         }
       
   310         $bytes = $this->_dataSource->readBytes($size);
       
   311         /* unpack() is a bit heavyweight for this simple conversion. Just
       
   312          * work the bytes directly.
       
   313          */
       
   314         if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
       
   315             $number = ord($bytes[0]);
       
   316             for ($i = 1; $i < $size; $i++) {
       
   317                 $number = ($number << 8) | ord($bytes[$i]);
       
   318             }
       
   319         } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
       
   320             $number = 0;
       
   321             for ($i = --$size; $i >= 0; $i--) {
       
   322                 $number |= ord($bytes[$i]) << ($i * 8);
       
   323             }
       
   324         } else {
       
   325             require_once 'Zend/Pdf/Exception.php';
       
   326             throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
       
   327                                          Zend_Pdf_Exception::INVALID_BYTE_ORDER);
       
   328         }
       
   329         return $number;
       
   330     }
       
   331 
       
   332     /**
       
   333      * Returns true if the specified bit is set in the integer bitfield.
       
   334      *
       
   335      * @param integer $bit Bit number to test (i.e. - 0-31)
       
   336      * @param integer $bitField
       
   337      * @return boolean
       
   338      */
       
   339     public function isBitSet($bit, $bitField)
       
   340     {
       
   341         $bitMask = 1 << $bit;
       
   342         $isSet = (($bitField & $bitMask) == $bitMask);
       
   343         return $isSet;
       
   344     }
       
   345 
       
   346     /**
       
   347      * Reads the signed fixed-point number from the binary file at the current
       
   348      * byte offset.
       
   349      *
       
   350      * Common fixed-point sizes are 2.14 and 16.16.
       
   351      *
       
   352      * Advances the offset by the number of bytes read. Throws an exception if
       
   353      * an error occurs.
       
   354      *
       
   355      * @param integer $mantissaBits Number of bits in the mantissa
       
   356      * @param integer $fractionBits Number of bits in the fraction
       
   357      * @param integer $byteOrder (optional) Big- or little-endian byte order.
       
   358      *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       
   359      *   If omitted, uses big-endian.
       
   360      * @return float
       
   361      * @throws Zend_Pdf_Exception
       
   362      */
       
   363     public function readFixed($mantissaBits, $fractionBits,
       
   364                               $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN)
       
   365     {
       
   366         $bitsToRead = $mantissaBits + $fractionBits;
       
   367         if (($bitsToRead % 8) !== 0) {
       
   368             require_once 'Zend/Pdf/Exception.php';
       
   369             throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes',
       
   370                                          Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE);
       
   371         }
       
   372         $number = $this->readInt(($bitsToRead >> 3), $byteOrder) / (1 << $fractionBits);
       
   373         return $number;
       
   374     }
       
   375 
       
   376     /**
       
   377      * Reads the Unicode UTF-16-encoded string from the binary file at the
       
   378      * current byte offset.
       
   379      *
       
   380      * The byte order of the UTF-16 string must be specified. You must also
       
   381      * supply the desired resulting character set.
       
   382      *
       
   383      * Advances the offset by the number of bytes read. Throws an exception if
       
   384      * an error occurs.
       
   385      *
       
   386      * @todo Consider changing $byteCount to a character count. They are not
       
   387      *   always equivalent (in the case of surrogates).
       
   388      * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the
       
   389      *   string being extracted.
       
   390      *
       
   391      * @param integer $byteCount Number of bytes (characters * 2) to return.
       
   392      * @param integer $byteOrder (optional) Big- or little-endian byte order.
       
   393      *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
       
   394      *   If omitted, uses big-endian.
       
   395      * @param string $characterSet (optional) Desired resulting character set.
       
   396      *   You may use any character set supported by {@link iconv()}. If omitted,
       
   397      *   uses 'current locale'.
       
   398      * @return string
       
   399      * @throws Zend_Pdf_Exception
       
   400      */
       
   401     public function readStringUTF16($byteCount,
       
   402                                     $byteOrder = Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN,
       
   403                                     $characterSet = '')
       
   404     {
       
   405         if ($byteCount == 0) {
       
   406             return '';
       
   407         }
       
   408         $bytes = $this->_dataSource->readBytes($byteCount);
       
   409         if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_BIG_ENDIAN) {
       
   410             if ($characterSet == 'UTF-16BE') {
       
   411                 return $bytes;
       
   412             }
       
   413             return iconv('UTF-16BE', $characterSet, $bytes);
       
   414         } else if ($byteOrder == Zend_Pdf_FileParser::BYTE_ORDER_LITTLE_ENDIAN) {
       
   415             if ($characterSet == 'UTF-16LE') {
       
   416                 return $bytes;
       
   417             }
       
   418             return iconv('UTF-16LE', $characterSet, $bytes);
       
   419         } else {
       
   420             require_once 'Zend/Pdf/Exception.php';
       
   421             throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
       
   422                                          Zend_Pdf_Exception::INVALID_BYTE_ORDER);
       
   423         }
       
   424     }
       
   425 
       
   426     /**
       
   427      * Reads the Mac Roman-encoded string from the binary file at the current
       
   428      * byte offset.
       
   429      *
       
   430      * You must supply the desired resulting character set.
       
   431      *
       
   432      * Advances the offset by the number of bytes read. Throws an exception if
       
   433      * an error occurs.
       
   434      *
       
   435      * @param integer $byteCount Number of bytes (characters) to return.
       
   436      * @param string $characterSet (optional) Desired resulting character set.
       
   437      *   You may use any character set supported by {@link iconv()}. If omitted,
       
   438      *   uses 'current locale'.
       
   439      * @return string
       
   440      * @throws Zend_Pdf_Exception
       
   441      */
       
   442     public function readStringMacRoman($byteCount, $characterSet = '')
       
   443     {
       
   444         if ($byteCount == 0) {
       
   445             return '';
       
   446         }
       
   447         $bytes = $this->_dataSource->readBytes($byteCount);
       
   448         if ($characterSet == 'MacRoman') {
       
   449             return $bytes;
       
   450         }
       
   451         return iconv('MacRoman', $characterSet, $bytes);
       
   452     }
       
   453 
       
   454     /**
       
   455      * Reads the Pascal string from the binary file at the current byte offset.
       
   456      *
       
   457      * The length of the Pascal string is determined by reading the length bytes
       
    458      * which precede the character data. You must supply the desired resulting
       
   459      * character set.
       
   460      *
       
   461      * Advances the offset by the number of bytes read. Throws an exception if
       
   462      * an error occurs.
       
   463      *
       
   464      * @param string $characterSet (optional) Desired resulting character set.
       
   465      *   You may use any character set supported by {@link iconv()}. If omitted,
       
   466      *   uses 'current locale'.
       
   467      * @param integer $lengthBytes (optional) Number of bytes that make up the
       
   468      *   length. Default is 1.
       
   469      * @return string
       
   470      * @throws Zend_Pdf_Exception
       
   471      */
       
   472     public function readStringPascal($characterSet = '', $lengthBytes = 1)
       
   473     {
       
   474         $byteCount = $this->readUInt($lengthBytes);
       
   475         if ($byteCount == 0) {
       
   476             return '';
       
   477         }
       
   478         $bytes = $this->_dataSource->readBytes($byteCount);
       
   479         if ($characterSet == 'ASCII') {
       
   480             return $bytes;
       
   481         }
       
   482         return iconv('ASCII', $characterSet, $bytes);
       
   483     }
       
   484 
       
   485 }