<?php
/**
 * Zend Framework
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.txt.
 * It is also available through the world-wide-web at this URL:
 * http://framework.zend.com/license/new-bsd
 * If you did not receive a copy of the license and are unable to
 * obtain it through the world-wide-web, please send an email
 * to license@zend.com so we can send you a copy immediately.
 *
 * @category   Zend
 * @package    Zend_Pdf
 * @subpackage FileParser
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 * @version    $Id: FileParser.php 20096 2010-01-06 02:05:09Z bkarwin $
 */

/**
 * Abstract utility class for parsing binary files.
 *
 * Provides a library of methods to quickly navigate and extract various data
 * types (signed and unsigned integers, floating- and fixed-point numbers,
 * strings, etc.) from the file.
 *
 * File access is managed via a {@link Zend_Pdf_FileParserDataSource} object.
 * This allows the same parser code to work with many different data sources:
 * in-memory objects, filesystem files, etc.
 *
 * @package    Zend_Pdf
 * @subpackage FileParser
 * @copyright  Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
abstract class Zend_Pdf_FileParser
{
    /**** Class Constants ****/

    /**
     * Little-endian byte order (0x04 0x03 0x02 0x01).
     */
    const BYTE_ORDER_LITTLE_ENDIAN = 0;

    /**
     * Big-endian byte order (0x01 0x02 0x03 0x04).
     */
    const BYTE_ORDER_BIG_ENDIAN    = 1;



    /**** Instance Variables ****/


    /**
     * Flag indicating that the file has passed a cursory validation check.
     * @var boolean
     */
    protected $_isScreened = false;

    /**
     * Flag indicating that the file has been successfully parsed.
     * @var boolean
     */
    protected $_isParsed = false;

    /**
     * Object representing the data source to be parsed.
     * @var Zend_Pdf_FileParserDataSource
     */
    protected $_dataSource = null;



    /**** Public Interface ****/


    /* Abstract Methods */

    /**
     * Performs a cursory check to verify that the binary file is in the expected
     * format. Intended to quickly weed out obviously bogus files.
     *
     * Must set $this->_isScreened to true if successful.
     *
     * @throws Zend_Pdf_Exception
     */
    abstract public function screen();

    /**
     * Reads and parses the complete binary file.
     *
     * Must set $this->_isParsed to true if successful.
     *
     * @throws Zend_Pdf_Exception
     */
    abstract public function parse();


    /* Object Lifecycle */

    /**
     * Object constructor.
     *
     * Verifies that the data source has been properly initialized: a data
     * source reporting a size of zero is rejected.
     *
     * @param Zend_Pdf_FileParserDataSource $dataSource
     * @throws Zend_Pdf_Exception
     */
    public function __construct(Zend_Pdf_FileParserDataSource $dataSource)
    {
        if ($dataSource->getSize() == 0) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception('The data source has not been properly initialized',
                                         Zend_Pdf_Exception::BAD_DATA_SOURCE);
        }
        $this->_dataSource = $dataSource;
    }

    /**
     * Object destructor.
     *
     * Discards the data source object.
     */
    public function __destruct()
    {
        $this->_dataSource = null;
    }


    /* Accessors */

    /**
     * Returns true if the file has passed a cursory validation check.
     *
     * @return boolean
     */
    public function isScreened()
    {
        return $this->_isScreened;
    }

    /**
     * Returns true if the file has been successfully parsed.
     *
     * @return boolean
     */
    public function isParsed()
    {
        return $this->_isParsed;
    }

    /**
     * Returns the data source object representing the file being parsed.
     *
     * @return Zend_Pdf_FileParserDataSource
     */
    public function getDataSource()
    {
        return $this->_dataSource;
    }


    /* Primitive Methods */

    /**
     * Convenience wrapper for the data source object's moveToOffset() method.
     *
     * @param integer $offset Destination byte offset.
     * @throws Zend_Pdf_Exception
     */
    public function moveToOffset($offset)
    {
        $this->_dataSource->moveToOffset($offset);
    }

    /**
     * Convenience wrapper for the data source object's getOffset() method.
     *
     * @return integer Whatever the data source reports as its current offset.
     */
    public function getOffset()
    {
        return $this->_dataSource->getOffset();
    }

    /**
     * Convenience wrapper for the data source object's getSize() method.
     *
     * @return integer Whatever the data source reports as its size.
     */
    public function getSize()
    {
        return $this->_dataSource->getSize();
    }

    /**
     * Convenience wrapper for the data source object's readBytes() method.
     *
     * @param integer $byteCount Number of bytes to read.
     * @return string
     * @throws Zend_Pdf_Exception
     */
    public function readBytes($byteCount)
    {
        return $this->_dataSource->readBytes($byteCount);
    }

    /**
     * Convenience wrapper for the data source object's skipBytes() method.
     *
     * @param integer $byteCount Number of bytes to skip.
     * @throws Zend_Pdf_Exception
     */
    public function skipBytes($byteCount)
    {
        $this->_dataSource->skipBytes($byteCount);
    }


    /* Parser Methods */

    /**
     * Reads the signed integer value from the binary file at the current byte
     * offset.
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs. Both arguments are validated before any bytes are
     * requested from the data source, so an invalid argument does not consume
     * data.
     *
     * @param integer $size Size of integer in bytes: 1-4
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return integer
     * @throws Zend_Pdf_Exception
     */
    public function readInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        if (($size < 1) || ($size > 4)) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception("Invalid signed integer size: $size",
                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
        }
        $this->_assertValidByteOrder($byteOrder);

        $bytes = $this->_dataSource->readBytes($size);

        /* unpack() will not work for this method because it always works in
         * the host byte order for signed integers. It also does not allow for
         * variable integer sizes. Assemble the value by hand instead, walking
         * the bytes from most- to least-significant in either byte order.
         */
        $isBigEndian = ($byteOrder == self::BYTE_ORDER_BIG_ENDIAN);
        $lastIndex   = $size - 1;

        /* The sign lives in the top bit of the most-significant byte.
         */
        $signByte   = ord($bytes[$isBigEndian ? 0 : $lastIndex]);
        $isNegative = (($signByte & 0x80) == 0x80);

        $number = 0;
        for ($i = 0; $i <= $lastIndex; $i++) {
            $byte = ord($bytes[$isBigEndian ? $i : ($lastIndex - $i)]);
            if ($isNegative) {
                /* Accumulate the bitwise complement so that the intermediate
                 * value stays positive and never touches the host sign bit.
                 */
                $byte = (~ $byte) & 0xff;
            }
            $number = ($number << 8) | $byte;
        }

        if ($isNegative) {
            /* Turn the complemented magnitude back into the negative number
             * (we didn't add one above so won't subtract it here). This works
             * reliably on both 32- and 64-bit systems.
             */
            $number = ~$number;
        }
        return $number;
    }

    /**
     * Reads the unsigned integer value from the binary file at the current byte
     * offset.
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs. Both arguments are validated before any bytes are
     * requested from the data source, so an invalid argument does not consume
     * data.
     *
     * NOTE: If you ask for a 4-byte unsigned integer on a 32-bit machine, the
     * resulting value WILL BE SIGNED because PHP uses signed integers internally
     * for everything. To guarantee portability, be sure to use bitwise
     * operators on large unsigned integers!
     *
     * @param integer $size Size of integer in bytes: 1-4
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return integer
     * @throws Zend_Pdf_Exception
     */
    public function readUInt($size, $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        if (($size < 1) || ($size > 4)) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception("Invalid unsigned integer size: $size",
                                         Zend_Pdf_Exception::INVALID_INTEGER_SIZE);
        }
        $this->_assertValidByteOrder($byteOrder);

        $bytes = $this->_dataSource->readBytes($size);

        /* unpack() is a bit heavyweight for this simple conversion. Just
         * work the bytes directly, from most- to least-significant.
         */
        $isBigEndian = ($byteOrder == self::BYTE_ORDER_BIG_ENDIAN);
        $lastIndex   = $size - 1;

        $number = 0;
        for ($i = 0; $i <= $lastIndex; $i++) {
            $number = ($number << 8) | ord($bytes[$isBigEndian ? $i : ($lastIndex - $i)]);
        }
        return $number;
    }

    /**
     * Returns true if the specified bit is set in the integer bitfield.
     *
     * @param integer $bit Bit number to test (i.e. - 0-31)
     * @param integer $bitField
     * @return boolean
     */
    public function isBitSet($bit, $bitField)
    {
        $bitMask = 1 << $bit;
        return (($bitField & $bitMask) == $bitMask);
    }

    /**
     * Reads the signed fixed-point number from the binary file at the current
     * byte offset.
     *
     * Common fixed-point sizes are 2.14 and 16.16.
     *
     * The value is read as a signed integer of ($mantissaBits + $fractionBits)
     * bits via {@link readInt()} and then divided by 2^$fractionBits, so the
     * total width must be a whole number of bytes accepted by readInt().
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs.
     *
     * @param integer $mantissaBits Number of bits in the mantissa
     * @param integer $fractionBits Number of bits in the fraction
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @return float
     * @throws Zend_Pdf_Exception
     */
    public function readFixed($mantissaBits, $fractionBits,
                              $byteOrder = self::BYTE_ORDER_BIG_ENDIAN)
    {
        $bitsToRead = $mantissaBits + $fractionBits;
        if (($bitsToRead % 8) !== 0) {
            require_once 'Zend/Pdf/Exception.php';
            throw new Zend_Pdf_Exception('Fixed-point numbers are whole bytes',
                                         Zend_Pdf_Exception::BAD_FIXED_POINT_SIZE);
        }
        $rawValue = $this->readInt(($bitsToRead >> 3), $byteOrder);
        return $rawValue / (1 << $fractionBits);
    }

    /**
     * Reads the Unicode UTF-16-encoded string from the binary file at the
     * current byte offset.
     *
     * The byte order of the UTF-16 string must be specified. You must also
     * supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs. A zero $byteCount returns an empty string without
     * touching the data source; otherwise the byte order is validated before
     * any bytes are read.
     *
     * @todo Consider changing $byteCount to a character count. They are not
     *   always equivalent (in the case of surrogates).
     * @todo Make $byteOrder optional if there is a byte-order mark (BOM) in the
     *   string being extracted.
     *
     * @param integer $byteCount Number of bytes (characters * 2) to return.
     * @param integer $byteOrder (optional) Big- or little-endian byte order.
     *   Use the BYTE_ORDER_ constants defined in {@link Zend_Pdf_FileParser}.
     *   If omitted, uses big-endian.
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   uses 'current locale'.
     * @return string The raw bytes when $characterSet already names the source
     *   encoding; otherwise the result of iconv() is passed through unchanged
     *   (iconv() returns FALSE when the conversion fails).
     * @throws Zend_Pdf_Exception
     */
    public function readStringUTF16($byteCount,
                                    $byteOrder = self::BYTE_ORDER_BIG_ENDIAN,
                                    $characterSet = '')
    {
        if ($byteCount == 0) {
            return '';
        }
        $this->_assertValidByteOrder($byteOrder);

        $bytes = $this->_dataSource->readBytes($byteCount);

        if ($byteOrder == self::BYTE_ORDER_BIG_ENDIAN) {
            $sourceCharacterSet = 'UTF-16BE';
        } else {
            $sourceCharacterSet = 'UTF-16LE';
        }

        /* No conversion needed if the caller wants the encoding as stored.
         */
        if ($characterSet == $sourceCharacterSet) {
            return $bytes;
        }
        return iconv($sourceCharacterSet, $characterSet, $bytes);
    }

    /**
     * Reads the Mac Roman-encoded string from the binary file at the current
     * byte offset.
     *
     * You must supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs.
     *
     * @param integer $byteCount Number of bytes (characters) to return.
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   uses 'current locale'.
     * @return string The raw bytes when $characterSet is 'MacRoman'; otherwise
     *   the result of iconv() is passed through unchanged (iconv() returns
     *   FALSE when the conversion fails).
     * @throws Zend_Pdf_Exception
     */
    public function readStringMacRoman($byteCount, $characterSet = '')
    {
        if ($byteCount == 0) {
            return '';
        }
        $bytes = $this->_dataSource->readBytes($byteCount);
        if ($characterSet == 'MacRoman') {
            return $bytes;
        }
        return iconv('MacRoman', $characterSet, $bytes);
    }

    /**
     * Reads the Pascal string from the binary file at the current byte offset.
     *
     * The length of the Pascal string is determined by reading the length bytes
     * which precede the character data (as a big-endian unsigned integer via
     * {@link readUInt()}). You must supply the desired resulting character set.
     *
     * Advances the offset by the number of bytes read. Throws an exception if
     * an error occurs.
     *
     * @param string $characterSet (optional) Desired resulting character set.
     *   You may use any character set supported by {@link iconv()}. If omitted,
     *   uses 'current locale'.
     * @param integer $lengthBytes (optional) Number of bytes that make up the
     *   length. Default is 1.
     * @return string The raw bytes when $characterSet is 'ASCII'; otherwise
     *   the result of iconv() is passed through unchanged (iconv() returns
     *   FALSE when the conversion fails).
     * @throws Zend_Pdf_Exception
     */
    public function readStringPascal($characterSet = '', $lengthBytes = 1)
    {
        $byteCount = $this->readUInt($lengthBytes);
        if ($byteCount == 0) {
            return '';
        }
        $bytes = $this->_dataSource->readBytes($byteCount);
        if ($characterSet == 'ASCII') {
            return $bytes;
        }
        return iconv('ASCII', $characterSet, $bytes);
    }



    /**** Internal Methods ****/


    /**
     * Verifies that the supplied byte order matches one of the BYTE_ORDER_
     * constants, using the same loose comparison the readers use to select
     * their decoding branch.
     *
     * @param integer $byteOrder
     * @throws Zend_Pdf_Exception
     */
    private function _assertValidByteOrder($byteOrder)
    {
        if (($byteOrder == self::BYTE_ORDER_BIG_ENDIAN) ||
            ($byteOrder == self::BYTE_ORDER_LITTLE_ENDIAN)) {
            return;
        }
        require_once 'Zend/Pdf/Exception.php';
        throw new Zend_Pdf_Exception("Invalid byte order: $byteOrder",
                                     Zend_Pdf_Exception::INVALID_BYTE_ORDER);
    }

}