[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php
/**
 * This file is part of FPDI
 *
 * @package   FPDI
 * @copyright Copyright (c) 2015 Setasign - Jan Slabon (http://www.setasign.com)
 * @license   http://opensource.org/licenses/mit-license The MIT License
 * @version   1.6.1
 */

/**
 * Class pdf_parser
 *
 * Reads the cross-reference table, the trailer and individual objects of a PDF file.
 *
 * Every parsed value is returned as a plain array whose first element is one of the
 * TYPE_* constants, followed by the payload, e.g. array(self::TYPE_NUMERIC, 12) or
 * array(self::TYPE_OBJREF, $objectNumber, $generationNumber).
 */
class pdf_parser
{
    /**
     * Type constant: the "null" value (also used for a dictionary key without a value)
     *
     * @var integer
     */
    const TYPE_NULL = 0;

    /**
     * Type constant: integer number
     *
     * @var integer
     */
    const TYPE_NUMERIC = 1;

    /**
     * Type constant: any token that is not recognised as another type (names, keywords, ...)
     *
     * @var integer
     */
    const TYPE_TOKEN = 2;

    /**
     * Type constant: hex string (<...>)
     *
     * @var integer
     */
    const TYPE_HEX = 3;

    /**
     * Type constant: literal string ((...))
     *
     * @var integer
     */
    const TYPE_STRING = 4;

    /**
     * Type constant: dictionary (<<...>>)
     *
     * @var integer
     */
    const TYPE_DICTIONARY = 5;

    /**
     * Type constant: array ([...])
     *
     * @var integer
     */
    const TYPE_ARRAY = 6;

    /**
     * Type constant: object declaration ("n g obj")
     *
     * @var integer
     */
    const TYPE_OBJDEC = 7;

    /**
     * Type constant: indirect object reference ("n g R")
     *
     * @var integer
     */
    const TYPE_OBJREF = 8;

    /**
     * Type constant: resolved object as returned by resolveObject()
     *
     * @var integer
     */
    const TYPE_OBJECT = 9;

    /**
     * Type constant: stream
     *
     * @var integer
     */
    const TYPE_STREAM = 10;

    /**
     * Type constant: boolean
     *
     * @var integer
     */
    const TYPE_BOOLEAN = 11;

    /**
     * Type constant: real (non-integer) number
     *
     * @var integer
     */
    const TYPE_REAL = 12;

    /**
     * Number of bytes at the end of the file in which the "startxref" keyword is searched.
     *
     * @var int
     */
    static public $searchForStartxrefLength = 5500;

    /**
     * Filename
     *
     * @var string
     */
    public $filename;

    /**
     * File resource
     *
     * @var resource
     */
    protected $_f;

    /**
     * PDF Context
     *
     * @var pdf_context
     */
    protected $_c;

    /**
     * xref-Data
     *
     * Keys written by this class: "xrefLocation", "maxObject", "xref" and "trailer".
     *
     * @var array
     */
    protected $_xref;

    /**
     * Data of the Root object
     *
     * @var array
     */
    protected $_root;

    /**
     * PDF version of the loaded document
     *
     * @var string
     */
    protected $_pdfVersion;

    /**
     * Flag related to reading encrypted documents and xref/object streams.
     *
     * It is not read anywhere in this class (presumably it is used by subclasses).
     *
     * @var boolean
     */
    protected $_readPlain = true;

    /**
     * The object that is currently read by resolveObject()
     *
     * The stream branch of _readValue() looks up the /Length entry through this property.
     *
     * @var array
     */
    protected $_currentObj;

    /**
     * Constructor
     *
     * Opens the file, reads the PDF version and the cross-reference data, rejects
     * encrypted documents and finally resolves the /Root object.
     *
     * @param string $filename Source filename
     * @throws InvalidArgumentException If the file cannot be opened
     * @throws Exception If the document cannot be parsed or is encrypted
     */
    public function __construct($filename)
    {
        $this->filename = $filename;

        $this->_f = @fopen($this->filename, 'rb');

        if (!$this->_f) {
            throw new InvalidArgumentException(sprintf('Cannot open %s !', $filename));
        }

        $this->getPdfVersion();

        if (!class_exists('pdf_context')) {
            require_once ('pdf_context.php');
        }
        $this->_c = new pdf_context($this->_f);

        // Read xref-Data
        $this->_xref = array();
        $this->_readXref($this->_xref, $this->_findXref());

        // Check for Encryption
        $this->getEncryption();

        // Read root
        $this->_readRoot();
    }

    /**
     * Destructor
     */
    public function __destruct()
    {
        $this->closeFile();
    }

    /**
     * Close the opened file
     */
    public function closeFile()
    {
        if (isset($this->_f) && is_resource($this->_f)) {
            fclose($this->_f);
            unset($this->_f);
        }
    }

    /**
     * Check Trailer for Encryption
     *
     * @throws Exception If the trailer contains an /Encrypt entry
     */
    public function getEncryption()
    {
        if (isset($this->_xref['trailer'][1]['/Encrypt'])) {
            throw new Exception('File is encrypted!');
        }
    }

    /**
     * Get PDF-Version
     *
     * The version is taken from the first "digit.digit" sequence found in the first
     * 16 bytes of the file and cached afterwards.
     *
     * @return string|null The version or null if none was found
     */
    public function getPdfVersion()
    {
        if ($this->_pdfVersion === null) {
            fseek($this->_f, 0);
            preg_match('/\d\.\d/', fread($this->_f, 16), $m);
            if (isset($m[0])) {
                $this->_pdfVersion = $m[0];
            }
        }

        return $this->_pdfVersion;
    }

    /**
     * Read the /Root dictionary
     *
     * @throws Exception If the trailer has no /Root entry or it is not an indirect reference
     */
    protected function _readRoot()
    {
        // isset() also covers a trailer without any /Root entry, so a broken document ends
        // in the exception below instead of PHP warnings about undefined array keys.
        if (
            !isset($this->_xref['trailer'][1]['/Root'][0])
            || $this->_xref['trailer'][1]['/Root'][0] != self::TYPE_OBJREF
        ) {
            throw new Exception('Wrong Type of Root-Element! Must be an indirect reference');
        }

        $this->_root = $this->resolveObject($this->_xref['trailer'][1]['/Root']);
    }

    /**
     * Find the xref table
     *
     * Searches the last self::$searchForStartxrefLength bytes of the file for the last
     * "startxref" keyword and returns the number that follows it.
     *
     * @return integer Byte offset of the xref table
     * @throws Exception If the keyword or the offset cannot be found
     */
    protected function _findXref()
    {
        $toRead = self::$searchForStartxrefLength;

        // If the file is shorter than the search window simply read it from the start.
        $stat = fseek($this->_f, -$toRead, SEEK_END);
        if ($stat === -1) {
            fseek($this->_f, 0);
        }

        $data = fread($this->_f, $toRead);

        $keyword = 'startxref';
        $keywordPos = strrpos($data, $keyword);
        if (false === $keywordPos) {
            // Fall back to the variant without the "x"
            // (presumably to tolerate documents written with a misspelled keyword).
            $keyword = 'startref';
            $keywordPos = strrpos($data, $keyword);
        }

        if (false === $keywordPos) {
            throw new Exception('Unable to find "startxref" keyword.');
        }

        // Keep only what follows the keyword
        $data = substr($data, $keywordPos + strlen($keyword));

        if (!preg_match('/\s*(\d+).*$/s', $data, $matches)) {
            throw new Exception('Unable to find pointer to xref table.');
        }

        return (int) $matches[1];
    }

    /**
     * Read the xref table
     *
     * Parses the cross-reference section at the given offset and its trailer into $result.
     * If the trailer has a /Prev entry the method calls itself for that offset; entries
     * and the trailer that were stored first are never overwritten by later calls.
     *
     * @param array $result Array of xref table entries
     * @param integer $offset of xref table
     * @return boolean Always true
     * @throws Exception If no xref table or trailer is found or the table is malformed
     */
    protected function _readXref(&$result, $offset)
    {
        $tempPos = $offset - min(20, $offset);
        fseek($this->_f, $tempPos); // set some bytes backwards to fetch corrupted docs

        $data = fread($this->_f, 100);

        $xrefPos = strrpos($data, 'xref');

        if ($xrefPos === false) {
            // No "xref" keyword: check whether an object declaration starts at the offset
            // to give a more helpful error message.
            $this->_c->reset($offset);
            $xrefStreamObjDec = $this->_readValue($this->_c);

            if (
                is_array($xrefStreamObjDec)
                && isset($xrefStreamObjDec[0])
                && $xrefStreamObjDec[0] == self::TYPE_OBJDEC
            ) {
                throw new Exception(
                    sprintf(
                        'This document (%s) probably uses a compression technique which is not supported by the ' .
                        'free parser shipped with FPDI. (See https://www.setasign.com/fpdi-pdf-parser for more details)',
                        $this->filename
                    )
                );
            } else {
                throw new Exception('Unable to find xref table.');
            }
        }

        if (!isset($result['xrefLocation'])) {
            $result['xrefLocation'] = $tempPos + $xrefPos;
            $result['maxObject'] = 0;
        }

        $cycles = -1;
        $bytesPerCycle = 100;

        fseek($this->_f, $tempPos = $tempPos + $xrefPos + 4); // set the handle directly after the "xref"-keyword
        $data = fread($this->_f, $bytesPerCycle);

        // Read on in chunks until the "trailer" keyword shows up. The search offset lags
        // behind the data read so far, so a keyword split across two chunks is still found.
        while (
            ($trailerPos = strpos($data, 'trailer', max($bytesPerCycle * $cycles++, 0))) === false
            && !feof($this->_f)
        ) {
            $data .= fread($this->_f, $bytesPerCycle);
        }

        if ($trailerPos === false) {
            throw new Exception('Trailer keyword not found after xref table');
        }

        $data = ltrim(substr($data, 0, $trailerPos));

        // get Line-Ending
        $found = preg_match_all("/(\r\n|\n|\r)/", substr($data, 0, 100), $m); // check the first 100 bytes for line breaks
        if ($found === 0) {
            throw new Exception('Xref table seems to be corrupted.');
        }
        $differentLineEndings = count(array_unique($m[0]));
        if ($differentLineEndings > 1) {
            $lines = preg_split("/(\r\n|\n|\r)/", $data, -1, PREG_SPLIT_NO_EMPTY);
        } else {
            $lines = explode($m[0][0], $data);
        }

        unset($data, $differentLineEndings, $m);

        $start = 1;

        foreach ($lines as $rawLine) {
            $line = trim($rawLine);
            if (!$line) {
                continue;
            }

            $pieces = explode(' ', $line);
            switch (count($pieces)) {
                case 2:
                    // Subsection header: "<first object number> <number of entries>"
                    $start = (int) $pieces[0];
                    $end = $start + (int) $pieces[1];
                    if ($end > $result['maxObject']) {
                        $result['maxObject'] = $end;
                    }
                    break;
                case 3:
                    // Entry: "<offset> <generation> <n|f>"; only entries flagged "n" keep an offset
                    if (!isset($result['xref'][$start])) {
                        $result['xref'][$start] = array();
                    }

                    if (!array_key_exists($gen = (int) $pieces[1], $result['xref'][$start])) {
                        $result['xref'][$start][$gen] = $pieces[2] == 'n' ? (int) $pieces[0] : null;
                    }
                    $start++;
                    break;
                default:
                    throw new Exception('Unexpected data in xref table');
            }
        }

        // Free the table data before a possible recursion
        unset($lines, $rawLine, $pieces, $line, $start, $end, $gen);

        $this->_c->reset($tempPos + $trailerPos + 7); // 7 = strlen("trailer")
        $trailer = $this->_readValue($this->_c);

        if (!isset($result['trailer'])) {
            $result['trailer'] = $trailer;
        }

        if (isset($trailer[1]['/Prev'])) {
            $this->_readXref($result, $trailer[1]['/Prev'][1]);
        }

        unset($trailer);

        return true;
    }

    /**
     * Reads a PDF value
     *
     * @param pdf_context $c
     * @param string $token A token that was already read; if null the next token is read from $c
     * @return array|boolean The value as array(TYPE_*, ...) or false if no (complete) value could be read
     * @throws Exception If no line break follows a "stream" keyword
     */
    protected function _readValue(&$c, $token = null)
    {
        if (is_null($token)) {
            $token = $this->_readToken($c);
        }

        if ($token === false) {
            return false;
        }

        switch ($token) {
            case '<':
                // This is a hex string.
                // Read the value, then the terminator

                $pos = $c->offset;

                // If the terminator can't be found, try
                // reading more data from the stream
                while (($match = strpos($c->buffer, '>', $pos)) === false) {
                    if (!$c->increaseLength()) {
                        return false;
                    }
                }

                $result = substr($c->buffer, $c->offset, $match - $c->offset);
                $c->offset = $match + 1;

                return array (self::TYPE_HEX, $result);

            case '<<':
                // This is a dictionary.

                $result = array();

                // Recurse into this function until we reach
                // the end of the dictionary.
                while (($key = $this->_readToken($c)) !== '>>') {
                    if ($key === false) {
                        return false;
                    }

                    if (($value = $this->_readValue($c)) === false) {
                        return false;
                    }

                    // Catch missing value
                    if ($value[0] == self::TYPE_TOKEN && $value[1] == '>>') {
                        $result[$key] = array(self::TYPE_NULL);
                        break;
                    }

                    $result[$key] = $value;
                }

                return array (self::TYPE_DICTIONARY, $result);

            case '[':
                // This is an array.

                $result = array();

                // Recurse into this function until we reach
                // the end of the array.
                while (($token = $this->_readToken($c)) !== ']') {
                    if ($token === false) {
                        return false;
                    }

                    if (($value = $this->_readValue($c, $token)) === false) {
                        return false;
                    }

                    $result[] = $value;
                }

                return array (self::TYPE_ARRAY, $result);

            case '(':
                // This is a string
                $pos = $c->offset;

                // Count nested brackets until the opening one is balanced
                $openBrackets = 1;
                do {
                    for (; $openBrackets != 0 && $pos < $c->length; $pos++) {
                        switch (ord($c->buffer[$pos])) {
                            case 0x28: // '('
                                $openBrackets++;
                                break;
                            case 0x29: // ')'
                                $openBrackets--;
                                break;
                            case 0x5C: // backslash: skip the escaped character
                                $pos++;
                        }
                    }
                } while ($openBrackets != 0 && $c->increaseLength());

                $result = substr($c->buffer, $c->offset, $pos - $c->offset - 1);
                $c->offset = $pos;

                return array (self::TYPE_STRING, $result);

            case 'stream':
                $tempPos = $c->getPos() - strlen($c->buffer);
                $tempOffset = $c->offset;

                $c->reset($startPos = $tempPos + $tempOffset);

                // Find the first "newline"
                while ($c->buffer[0] !== chr(10) && $c->buffer[0] !== chr(13)) {
                    $c->reset(++$startPos);
                    if ($c->ensureContent() === false) {
                        throw new Exception(
                            'Unable to parse stream data. No newline followed the stream keyword.'
                        );
                    }
                }

                $e = 0; // ensure line breaks in front of the stream
                if ($c->buffer[0] == chr(10) || $c->buffer[0] == chr(13)) {
                    $e++;
                }
                if ($c->buffer[1] == chr(10) && $c->buffer[0] != chr(10)) {
                    $e++;
                }

                // The stream length comes from the /Length entry of the object that is
                // currently resolved; it may itself be an indirect reference.
                if ($this->_currentObj[1][1]['/Length'][0] == self::TYPE_OBJREF) {
                    $tmpLength = $this->resolveObject($this->_currentObj[1][1]['/Length']);
                    $length = $tmpLength[1][1];
                } else {
                    $length = $this->_currentObj[1][1]['/Length'][1];
                }

                if ($length > 0) {
                    $c->reset($startPos + $e, $length);
                    $v = $c->buffer;
                } else {
                    $v = '';
                }

                $c->reset($startPos + $e + $length);
                $endstream = $this->_readToken($c);

                if ($endstream != 'endstream') {
                    $c->reset($startPos + $e + $length + 9); // 9 = strlen("endstream")
                    // We don't throw an error here because the next
                    // round trip will start at a new offset
                }

                return array(self::TYPE_STREAM, $v);

            default:
                if (is_numeric($token)) {
                    // A numeric token. Make sure that
                    // it is not part of something else.
                    if (($tok2 = $this->_readToken($c)) !== false) {
                        if (is_numeric($tok2)) {
                            // Two numeric tokens in a row.
                            // In this case, we're probably in
                            // front of either an object reference
                            // or an object specification.
                            // Determine the case and return the data
                            if (($tok3 = $this->_readToken($c)) !== false) {
                                switch ($tok3) {
                                    case 'obj':
                                        return array(self::TYPE_OBJDEC, (int) $token, (int) $tok2);
                                    case 'R':
                                        return array(self::TYPE_OBJREF, (int) $token, (int) $tok2);
                                }
                                // If we get to this point, that numeric value up
                                // there was just a numeric value. Push the extra
                                // tokens back into the stack and return the value.
                                array_push($c->stack, $tok3);
                            }
                        }

                        array_push($c->stack, $tok2);
                    }

                    if ($token === (string) ((int) $token)) {
                        return array(self::TYPE_NUMERIC, (int) $token);
                    }

                    return array(self::TYPE_REAL, (float) $token);
                } else if ($token == 'true' || $token == 'false') {
                    return array(self::TYPE_BOOLEAN, $token == 'true');
                } else if ($token == 'null') {
                    return array(self::TYPE_NULL);
                } else {
                    // Just a token. Return it.
                    return array(self::TYPE_TOKEN, $token);
                }
        }
    }

    /**
     * Resolve an object
     *
     * An indirect reference is looked up in the xref data and the referenced object is
     * read from the file. Any other array is returned unchanged.
     *
     * @param array $objSpec The object-data
     * @return array|boolean The resolved object, $objSpec itself if it is no reference,
     *                       or false if $objSpec is not an array
     * @throws Exception If the referenced object cannot be found
     */
    public function resolveObject($objSpec)
    {
        $c = $this->_c;

        // Exit if we get invalid data
        if (!is_array($objSpec)) {
            return false;
        }

        // Anything but a reference is already resolved
        if ($objSpec[0] != self::TYPE_OBJREF) {
            return $objSpec;
        }

        if (!isset($this->_xref['xref'][$objSpec[1]][$objSpec[2]])) {
            throw new Exception(
                sprintf("Unable to find object (%s, %s) at expected location.", $objSpec[1], $objSpec[2])
            );
        }

        // Save current file position
        // This is needed if you want to resolve
        // references while you're reading another object
        // (e.g.: if you need to determine the length
        // of a stream)
        $oldPos = $c->getPos();

        // Reposition the file pointer and
        // load the object header.
        $c->reset($this->_xref['xref'][$objSpec[1]][$objSpec[2]]);

        $header = $this->_readValue($c);

        // _readValue() returns false if nothing could be read, so check for an array
        // before indexing the header.
        if (
            !is_array($header)
            || $header[0] != self::TYPE_OBJDEC
            || $header[1] != $objSpec[1]
            || $header[2] != $objSpec[2]
        ) {
            // The header is not located exactly at the offset:
            // look for it in the data that is currently buffered.
            $toSearchFor = $objSpec[1] . ' ' . $objSpec[2] . ' obj';
            $headerPos = strpos($c->buffer, $toSearchFor);
            if ($headerPos === false) {
                throw new Exception(
                    sprintf("Unable to find object (%s, %s) at expected location.", $objSpec[1], $objSpec[2])
                );
            }

            $c->offset = $headerPos + strlen($toSearchFor);
            // reset stack
            $c->stack = array();
        }

        // If we're being asked to store all the information
        // about the object, we add the object ID and generation
        // number for later use
        $result = array (
            self::TYPE_OBJECT,
            'obj' => $objSpec[1],
            'gen' => $objSpec[2]
        );

        $this->_currentObj =& $result;

        // Now simply read the object data until
        // we encounter an end-of-object marker
        while (true) {
            $value = $this->_readValue($c);
            if ($value === false || count($result) > 4) {
                // in this case the parser couldn't find an "endobj" so we break here
                break;
            }

            if ($value[0] == self::TYPE_TOKEN && $value[1] === 'endobj') {
                break;
            }

            $result[] = $value;
        }

        $c->reset($oldPos);

        if (isset($result[2][0]) && $result[2][0] == self::TYPE_STREAM) {
            $result[0] = self::TYPE_STREAM;
        }

        return $result;
    }

    /**
     * Reads a token from the context
     *
     * @param pdf_context $c
     * @return string|boolean The token or false if no more data is available
     */
    protected function _readToken($c)
    {
        // If there is a token available
        // on the stack, pop it out and
        // return it.
        if (count($c->stack)) {
            return array_pop($c->stack);
        }

        // Strip away any whitespace
        do {
            if (!$c->ensureContent()) {
                return false;
            }
            $c->offset += strspn($c->buffer, "\x20\x0A\x0C\x0D\x09\x00", $c->offset);
        } while ($c->offset >= $c->length - 1);

        // Get the first character in the stream
        $char = $c->buffer[$c->offset++];

        switch ($char) {
            case '[':
            case ']':
            case '(':
            case ')':
                // This is either an array or literal string
                // delimiter, Return it
                return $char;

            case '<':
            case '>':
                // This could either be a hex string or
                // dictionary delimiter. Determine the
                // appropriate case and return the token
                if ($c->buffer[$c->offset] == $char) {
                    if (!$c->ensureContent()) {
                        return false;
                    }
                    $c->offset++;
                    return $char . $char;
                }

                return $char;

            case '%':
                // This is a comment - jump over it!
                $pos = $c->offset;

                // Read more data until the line break that ends the comment is buffered
                while (preg_match("/(\r\n|\r|\n)/", $c->buffer, $m, PREG_OFFSET_CAPTURE, $pos) === 0) {
                    if (!$c->increaseLength()) {
                        return false;
                    }
                }

                $c->offset = $m[0][1] + strlen($m[0][0]);

                return $this->_readToken($c);

            default:
                // This is "another" type of token (probably
                // a dictionary entry or a numeric value)
                // Find the end and return it.
                if (!$c->ensureContent()) {
                    return false;
                }

                while (true) {
                    // Determine the length of the token
                    $pos = strcspn($c->buffer, "\x20%[]<>()/\x0A\x0C\x0D\x09\x00", $c->offset);

                    if ($c->offset + $pos <= $c->length - 1) {
                        break;
                    }

                    // If the script reaches this point,
                    // the token may span beyond the end
                    // of the current buffer. Therefore,
                    // we increase the size of the buffer
                    // and try again--just to be safe.
                    if (!$c->increaseLength()) {
                        // No more data can be read: the token ends with the buffer.
                        // Without this exit the loop would never terminate on
                        // truncated data.
                        break;
                    }
                }

                $result = substr($c->buffer, $c->offset - 1, $pos + 1);

                $c->offset += $pos;

                return $result;
        }
    }

    /**
     * Un-filter a stream object
     *
     * Applies the filters named in the /Filter entry of the stream dictionary, in the
     * given order, to the stream data.
     *
     * @param array $obj A stream object as returned by resolveObject()
     * @return string The decoded stream data
     * @throws Exception If a filter is not supported or decoding fails
     */
    protected function _unFilterStream($obj)
    {
        $filters = array();

        if (isset($obj[1][1]['/Filter'])) {
            $filter = $obj[1][1]['/Filter'];

            if ($filter[0] == self::TYPE_OBJREF) {
                $tmpFilter = $this->resolveObject($filter);
                $filter = $tmpFilter[1];
            }

            // /Filter is either a single name or an array of names
            if ($filter[0] == self::TYPE_TOKEN) {
                $filters[] = $filter;
            } else if ($filter[0] == self::TYPE_ARRAY) {
                $filters = $filter[1];
            }
        }

        $stream = $obj[2][1];

        foreach ($filters as $filter) {
            switch ($filter[1]) {
                case '/FlateDecode':
                case '/Fl':
                    if (function_exists('gzuncompress')) {
                        $oStream = $stream;
                        $stream = (strlen($stream) > 0) ? @gzuncompress($stream) : '';
                    } else {
                        throw new Exception(
                            sprintf('To handle %s filter, please compile php with zlib support.', $filter[1])
                        );
                    }

                    if ($stream === false) {
                        // Decompression failed: retry with raw inflate while
                        // dropping up to 8 leading bytes, one at a time.
                        $tries = 0;
                        while ($tries < 8 && ($stream === false || strlen($stream) < strlen($oStream))) {
                            $oStream = substr($oStream, 1);
                            $stream = @gzinflate($oStream);
                            $tries++;
                        }

                        if ($stream === false) {
                            throw new Exception('Error while decompressing stream.');
                        }
                    }
                    break;
                case '/LZWDecode':
                    if (!class_exists('FilterLZW')) {
                        require_once ('filters/FilterLZW.php');
                    }
                    $decoder = new FilterLZW();
                    $stream = $decoder->decode($stream);
                    break;
                case '/ASCII85Decode':
                    if (!class_exists('FilterASCII85')) {
                        require_once ('filters/FilterASCII85.php');
                    }
                    $decoder = new FilterASCII85();
                    $stream = $decoder->decode($stream);
                    break;
                case '/ASCIIHexDecode':
                    if (!class_exists('FilterASCIIHexDecode')) {
                        require_once ('filters/FilterASCIIHexDecode.php');
                    }
                    $decoder = new FilterASCIIHexDecode();
                    $stream = $decoder->decode($stream);
                    break;
                case null:
                    break;
                default:
                    throw new Exception(sprintf('Unsupported Filter: %s', $filter[1]));
            }
        }

        return $stream;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |