.
//
// See LICENSE file for more information.
// -------------------------------------------------------------------
//
// Description : This is a PHP class for parsing PDF documents.
//
//============================================================+
/**
* @file
* This is a PHP class for parsing PDF documents.
* @author Paul Nicholls
* @author Nicola Asuni
* @version 1.0
*/
// include class for decoding filters
require_once(dirname(__FILE__).'/../tcpdf/include/tcpdf_filters.php');
// PDF object type identifiers used as the first element of every parsed
// object array. Each constant is only defined when the including
// application has not declared it already.
defined('PDF_TYPE_NULL') || define('PDF_TYPE_NULL', 0);
defined('PDF_TYPE_NUMERIC') || define('PDF_TYPE_NUMERIC', 1);
defined('PDF_TYPE_TOKEN') || define('PDF_TYPE_TOKEN', 2);
defined('PDF_TYPE_HEX') || define('PDF_TYPE_HEX', 3);
defined('PDF_TYPE_STRING') || define('PDF_TYPE_STRING', 4);
defined('PDF_TYPE_DICTIONARY') || define('PDF_TYPE_DICTIONARY', 5);
defined('PDF_TYPE_ARRAY') || define('PDF_TYPE_ARRAY', 6);
defined('PDF_TYPE_OBJDEC') || define('PDF_TYPE_OBJDEC', 7);
defined('PDF_TYPE_OBJREF') || define('PDF_TYPE_OBJREF', 8);
defined('PDF_TYPE_OBJECT') || define('PDF_TYPE_OBJECT', 9);
defined('PDF_TYPE_STREAM') || define('PDF_TYPE_STREAM', 10);
defined('PDF_TYPE_BOOLEAN') || define('PDF_TYPE_BOOLEAN', 11);
defined('PDF_TYPE_REAL') || define('PDF_TYPE_REAL', 12);
/**
* @class tcpdi_parser
* This is a PHP class for parsing PDF documents.
* Based on TCPDF_PARSER, part of the TCPDF project by Nicola Asuni.
* @brief This is a PHP class for parsing PDF documents.
* @version 1.0
* @author Paul Nicholls - github.com/pauln
* @author Nicola Asuni - info@tecnick.com
*/
class tcpdi_parser {
/**
* Unique parser ID, copied from the second constructor argument.
* @public
*/
public $uniqueid = '';
/**
* Raw content of the PDF document. Reset to an empty string by cleanUp().
* @private
*/
private $pdfdata = '';
/**
* XREF data as returned by getXrefData(): the decoders fill the keys
* 'xref' (offsets indexed by [object number][generation number]),
* 'trailer', 'xref_location' and 'max_object'.
* @protected
*/
protected $xref = array();
/**
* Decoded object stream data, indexed by [object number][generation number]
* of the stream object. Filled lazily by getObjectVal().
* @protected
*/
protected $objstreams = array();
/**
* Objects in objstreams, indexed by object number. getObjectVal() reads
* element 0 of each entry as the (object, generation) key of the containing
* stream and element 1 as the offset of the object in the decoded data.
* @protected
*/
protected $objstreamobjs = array();
/**
* List of seen XREF data locations; consulted before following a /Prev
* link so that the same classic xref section is not decoded twice.
* @protected
*/
protected $xref_seen_offsets = array();
/**
* Array of PDF objects already parsed by getObjectVal(), indexed by
* [object number][generation number].
* @protected
*/
protected $objects = array();
/**
* Array of object offsets found by findObjectOffsets(), keyed by the
* trimmed text of the object header as it appears in the file
* (looked up as "N G obj").
* @private
*/
private $objoffsets = array();
/**
* Class object for decoding filters (a TCPDF_FILTERS instance).
* @private
*/
private $FilterDecoders;
/**
* Pages: flat list of the page objects collected by readPages(), i.e. the
* nodes of the page tree that have no /Kids entry.
*
* @private array
*/
private $pages;
/**
* Page count: number of entries in $pages, set by readPages().
* @private integer
*/
private $page_count;
/**
* Actual page number as a zero-based index into $pages; setPageno()
* stores the requested page number minus one.
* @private integer
*/
private $pageno;
/**
* PDF version of the loaded document: the first "digit.digit" sequence
* found in the first 16 bytes of the data.
* @private string
*/
private $pdfVersion;
/**
* Available BoxTypes: the box names looked up for each page by
* _getPageBoxes().
*
* @public array
*/
public $availableBoxes = array('/MediaBox', '/CropBox', '/BleedBox', '/TrimBox', '/ArtBox');
// -----------------------------------------------------------------------------
/**
 * Load a PDF document: read its cross-reference data, index the object
 * offsets, detect the PDF version and collect the page objects.
 * @param $data (string) PDF data to parse.
 * @param $uniqueid (string) Identifier for this parser instance; stored in $uniqueid.
 * @public
 * @since 1.0.000 (2011-05-24)
 */
public function __construct($data, $uniqueid) {
    if (empty($data)) {
        $this->Error('Empty PDF data.');
    }
    $this->uniqueid = $uniqueid;
    $this->pdfdata = $data;
    // initialize class for decoding filters
    $this->FilterDecoders = new TCPDF_FILTERS();
    // get xref and trailer data
    $this->xref = $this->getXrefData();
    // index every "N G obj" header found in the file
    $this->findObjectOffsets();
    // Objects are parsed on demand by getObjectVal(). The cache is reset
    // here, which also drops anything cached while scanning for offsets.
    $this->objects = array();
    $this->getPDFVersion();
    $this->readPages();
}
/**
 * Release the document data and every parsed structure so that the memory
 * can be reclaimed. Each property is unset and then restored to its empty
 * value.
 */
public function cleanUp() {
    $blanks = array(
        'pdfdata' => '',
        'objstreams' => array(),
        'objects' => array(),
        'objstreamobjs' => array(),
        'xref' => array(),
        'objoffsets' => array(),
        'pages' => array(),
    );
    foreach ($blanks as $property => $blank) {
        unset($this->$property);
        $this->$property = $blank;
    }
}
/**
 * Return the parsed document data.
 * @return (array) Three elements, in this order: the xref data, the cache of parsed objects and the list of pages.
 * @public
 * @since 1.0.000 (2011-06-26)
 */
public function getParsedData() {
    $parsed = array();
    $parsed[] = $this->xref;
    $parsed[] = $this->objects;
    $parsed[] = $this->pages;
    return $parsed;
}
/**
 * Get the PDF version.
 *
 * Looks for the first "digit.digit" sequence in the first 16 bytes of the
 * document and, when one is found, stores it as the document version.
 * @return (string) The stored version (unchanged when nothing was found).
 * @public
 */
public function getPDFVersion() {
    $header = substr($this->pdfdata, 0, 16);
    if (preg_match('/\d\.\d/', $header, $found) === 1) {
        $this->pdfVersion = $found[0];
    }
    return $this->pdfVersion;
}
/**
 * Read all /Page(s).
 *
 * Resolves the trailer's /Root object, follows its /Pages reference and
 * walks the /Kids array, collecting every page in $this->pages (nested
 * page tree nodes are flattened by readPage()). $this->pages and
 * $this->page_count are always left in a defined state (empty / 0) when
 * the page tree cannot be located.
 */
public function readPages() {
    // start from a defined state so that every early return below leaves
    // an empty page list instead of uninitialised properties
    $this->pages = array();
    $this->page_count = 0;
    if (!isset($this->xref['trailer'][1]['/Root'])) {
        // no document catalog reference in the trailer
        return;
    }
    $params = $this->getObjectVal($this->xref['trailer'][1]['/Root']);
    if (!isset($params[1][1]['/Pages'])) {
        return;
    }
    $objref = $params[1][1]['/Pages'];
    if (!is_array($objref) || !isset($objref[0]) || $objref[0] !== PDF_TYPE_OBJREF) {
        // Offset not found.
        return;
    }
    $dict = $this->getObjectVal($objref);
    if (isset($dict[0]) && $dict[0] == PDF_TYPE_OBJECT && isset($dict[1][0]) && $dict[1][0] == PDF_TYPE_DICTIONARY) {
        // Dict wrapped in an object
        $dict = $dict[1];
    }
    if (!isset($dict[0]) || $dict[0] !== PDF_TYPE_DICTIONARY) {
        return;
    }
    if (isset($dict[1]['/Kids'])) {
        $kids = $dict[1]['/Kids'];
        if ($kids[0] == PDF_TYPE_ARRAY) {
            foreach ($kids[1] as $ref) {
                $this->readPage($this->getObjectVal($ref));
            }
        }
    }
    $this->page_count = count($this->pages);
}
/**
 * Read a single page tree node.
 *
 * A node without a /Kids entry is appended to $this->pages; a node with
 * /Kids is descended into, resolving each kid and processing it the same
 * way.
 * @param $page (array) Resolved page tree node, as returned by getObjectVal().
 */
private function readPage($page) {
    if (!isset($page[1][1]['/Kids'])) {
        $this->pages[] = $page;
        return;
    }
    // Nested pages!
    foreach ($page[1][1]['/Kids'][1] as $childref) {
        $child = $this->getObjectVal($childref);
        $this->readPage($child);
    }
}
/**
 * Get the number of pages collected by readPages().
 *
 * @return int
 */
function getPageCount() {
    $total = $this->page_count;
    return $total;
}
/**
 * Get Cross-Reference (xref) table and trailer data from PDF document data.
 *
 * With $offset == 0 the last "startxref" of the file is used to locate the
 * xref section; otherwise the section is looked up starting at $offset.
 * The section is then decoded either as a classic xref table (when the
 * "xref" keyword sits exactly at the computed position) or as a
 * cross-reference stream.
 * @param $offset (int) xref offset (if know).
 * @param $xref (array) previous xref array (if any).
 * @return Array containing xref and trailer data.
 * @protected
 * @since 1.0.000 (2011-05-24)
 */
protected function getXrefData($offset=0, $xref=array()) {
    if ($offset == 0) {
        // find last startxref
        if (preg_match('/.*[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/is', $this->pdfdata, $matches) !== 1) {
            $this->Error('Unable to find startxref');
        }
        $startxref = $matches[1];
    } else {
        if (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            // Cross-Reference Stream object
            $startxref = $offset;
        } elseif (preg_match('/[\r\n]startxref[\s]*[\r\n]+([0-9]+)[\s]*[\r\n]+%%EOF/i', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE, $offset)) {
            // startxref found
            $startxref = $matches[1][0];
        } else {
            $this->Error('Unable to find startxref');
        }
    }
    unset($matches);
    // the regex captures are strings: normalise so the strict comparison
    // with the strpos() result below is reliable
    $startxref = intval($startxref);
    if (($startxref < 0) || ($startxref > strlen($this->pdfdata))) {
        // offset points outside the document; strpos() would raise an
        // error for it on PHP 8
        $this->Error('Unable to find xref');
        return $xref;
    }
    // DOMPDF gets the startxref wrong, giving us the linebreak before the xref starts.
    $startxref += strspn($this->pdfdata, "\r\n", $startxref);
    // check xref position (strict: a "not found" result must not be
    // mistaken for a hit at offset 0)
    if (strpos($this->pdfdata, 'xref', $startxref) === $startxref) {
        // Cross-Reference
        $xref = $this->decodeXref($startxref, $xref);
    } else {
        // Cross-Reference Stream
        $xref = $this->decodeXrefStream($startxref, $xref);
    }
    if (empty($xref)) {
        $this->Error('Unable to find xref');
    }
    return $xref;
}
/**
 * Decode a classic Cross-Reference section and its trailer.
 *
 * Entries already present in $xref are never overwritten, and the trailer
 * keys are only filled when $xref has no trailer yet. A /Prev offset that
 * has not been visited before is followed through getXrefData().
 * @param $startxref (int) Offset at which the xref section starts.
 * @param $xref (array) Previous xref array (if any).
 * @return Array containing xref and trailer data.
 * @protected
 * @since 1.0.000 (2011-06-20)
 */
protected function decodeXref($startxref, $xref=array()) {
    $this->xref_seen_offsets[] = $startxref;
    if (!isset($xref['xref_location'])) {
        $xref['xref_location'] = $startxref;
        $xref['max_object'] = 0;
    }
    // the table entries are searched from 5 bytes past the section start
    $tablestart = $startxref + 5;
    $objnum = 0;
    $cursor = $tablestart;
    while (preg_match('/^([0-9]+)[\s]([0-9]+)[\s]?([nf]?)/im', $this->pdfdata, $hit, PREG_OFFSET_CAPTURE, $cursor) > 0) {
        $cursor = $hit[0][1] + strlen($hit[0][0]);
        switch ($hit[3][0]) {
            case 'n':
                // in-use entry: "offset generation n"
                $generation = intval($hit[2][0]);
                if (!isset($xref['xref'][$objnum][$generation])) {
                    // keep the first offset recorded for this object
                    $xref['xref'][$objnum][$generation] = intval($hit[1][0]);
                }
                ++$objnum;
                $cursor += 2;
                break;
            case 'f':
                // free entry: only advances the object counter
                ++$objnum;
                $cursor += 2;
                break;
            default:
                // subsection header: first number is the starting object number
                $objnum = intval($hit[1][0]);
                break;
        }
    }
    unset($hit);
    $xref['max_object'] = max($xref['max_object'], $objnum);
    // get trailer data
    $has_trailer = preg_match('/trailer[\s]*<<(.*)>>[\s]*[\r\n]+startxref[\s]*[\r\n]+/isU', $this->pdfdata, $hit, PREG_OFFSET_CAPTURE, $tablestart) > 0;
    if (!$has_trailer) {
        $this->Error('Unable to find trailer');
        return $xref;
    }
    $trailer_data = $hit[1][0];
    if (empty($xref['trailer'])) {
        // get only the last updated version
        $xref['trailer'] = array();
        $xref['trailer'][0] = PDF_TYPE_DICTIONARY;
        $xref['trailer'][1] = array();
        // parse trailer_data
        if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $hit) > 0) {
            $xref['trailer'][1]['/Size'] = array(PDF_TYPE_NUMERIC, intval($hit[1]));
        }
        // indirect references, stored in this order
        foreach (array('Root', 'Encrypt', 'Info') as $name) {
            if (preg_match('/'.$name.'[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $hit) > 0) {
                $xref['trailer'][1]['/'.$name] = array(PDF_TYPE_OBJREF, intval($hit[1]), intval($hit[2]));
            }
        }
        if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $hit) > 0) {
            $xref['trailer'][1]['/ID'] = array(PDF_TYPE_ARRAY, array());
            $xref['trailer'][1]['/ID'][1][0] = array(PDF_TYPE_HEX, $hit[1]);
            $xref['trailer'][1]['/ID'][1][1] = array(PDF_TYPE_HEX, $hit[2]);
        }
    }
    if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $hit) > 0) {
        // get previous xref
        $previous = intval($hit[1]);
        if (!in_array($previous, $this->xref_seen_offsets)) {
            $this->xref_seen_offsets[] = $previous;
            $xref = $this->getXrefData($previous, $xref);
        }
    }
    unset($hit);
    return $xref;
}
/**
 * Decode the Cross-Reference Stream section.
 *
 * Reads the stream object at $startxref, interprets its dictionary
 * (/Type, /Index, /Prev, /W, /DecodeParms plus the trailer keys /Size,
 * /Root, /Info and /ID), reverses the PNG predictor on the decoded data
 * and records the offsets of the uncompressed (type 1) entries. Entries
 * already present in $xref are never overwritten. A /Prev offset is
 * followed through getXrefData().
 *
 * NOTE(review): only the first (first, count) pair of /Index is honoured,
 * and every row is assumed to start with one predictor byte.
 * @param $startxref (int) Offset at which the xref section starts.
 * @param $xref (array) Previous xref array (if any).
 * @return Array containing xref and trailer data.
 * @protected
 * @since 1.0.003 (2013-03-16)
 */
protected function decodeXrefStream($startxref, $xref=array()) {
    // try to read Cross-Reference Stream
    list($xrefobj, $unused) = $this->getRawObject($startxref);
    $xrefcrs = $this->getIndirectObject($xrefobj[1], $startxref, true);
    if (!isset($xref['xref_location'])) {
        $xref['xref_location'] = $startxref;
        $xref['max_object'] = 0;
    }
    if (!isset($xref['xref'])) {
        $xref['xref'] = array();
    }
    if (empty($xref['trailer'])) {
        // get only the last updated version
        $xref['trailer'] = array();
        $xref['trailer'][0] = PDF_TYPE_DICTIONARY;
        $xref['trailer'][1] = array();
        $filltrailer = true;
    } else {
        $filltrailer = false;
    }
    $valid_crs = false;
    // stream dictionary; empty when the object could not be read
    $sarr = (isset($xrefcrs[0][1]) && is_array($xrefcrs[0][1])) ? $xrefcrs[0][1] : array();
    $columns = 1; // Default as per PDF 32000-1:2008.
    $predictor = 1; // Default as per PDF 32000-1:2008.
    // field widths; all zero when /W is missing
    $wb = array(0, 0, 0);
    foreach ($sarr as $key => $v) {
        if (($key == '/Type') AND ($v[0] == PDF_TYPE_TOKEN AND ($v[1] == 'XRef'))) {
            $valid_crs = true;
        } elseif (($key == '/Index') AND ($v[0] == PDF_TYPE_ARRAY AND (count($v[1]) >= 2))) {
            // first object number in the subsection
            $index_first = intval($v[1][0][1]);
            // number of entries in the subsection
            $index_entries = intval($v[1][1][1]);
        } elseif (($key == '/Prev') AND ($v[0] == PDF_TYPE_NUMERIC)) {
            // get previous xref offset
            $prevxref = intval($v[1]);
        } elseif (($key == '/W') AND ($v[0] == PDF_TYPE_ARRAY)) {
            // number of bytes (in the decoded stream) of the corresponding field
            for ($c = 0; $c < 3; ++$c) {
                $wb[$c] = isset($v[1][$c][1]) ? intval($v[1][$c][1]) : 0;
            }
        } elseif (($key == '/DecodeParms') AND ($v[0] == PDF_TYPE_DICTIONARY)) {
            foreach ($v[1] as $kdc => $vdc) {
                if (($kdc == '/Columns') AND ($vdc[0] == PDF_TYPE_NUMERIC)) {
                    $columns = intval($vdc[1]);
                } elseif (($kdc == '/Predictor') AND ($vdc[0] == PDF_TYPE_NUMERIC)) {
                    $predictor = intval($vdc[1]);
                }
            }
        } elseif ($filltrailer) {
            switch($key) {
                case '/Size':
                case '/Root':
                case '/Info':
                case '/ID':
                    $xref['trailer'][1][$key] = $v;
                    break;
                default:
                    break;
            }
        }
    }
    // decode data
    $obj_num = 0;
    if ($valid_crs AND isset($xrefcrs[1][3][0])) {
        // number of bytes in a row
        $rowlen = ($columns + 1);
        // convert the stream into an array of integers
        $sdata = unpack('C*', $xrefcrs[1][3][0]);
        // split the rows
        $sdata = array_chunk($sdata, $rowlen);
        // initialize decoded array
        $ddata = array();
        // initialize first row with zeros
        $prev_row = array_fill(0, $rowlen, 0);
        // for each row apply PNG unpredictor
        foreach ($sdata as $k => $row) {
            // initialize new row
            $ddata[$k] = array();
            // get PNG predictor value
            if (empty($predictor)) {
                $predictor = (10 + $row[0]);
            }
            // for each byte on the row
            for ($i = 1; $i <= $columns; ++$i) {
                // new index
                $j = ($i - 1);
                // a truncated last row is padded with zeros
                $cur = isset($row[$i]) ? $row[$i] : 0;
                $row_up = $prev_row[$j];
                if ($i == 1) {
                    $row_left = 0;
                    $row_upleft = 0;
                } else {
                    $row_left = $row[($i - 1)];
                    $row_upleft = $prev_row[($j - 1)];
                }
                switch ($predictor) {
                    case 1: // No prediction (equivalent to PNG None)
                    case 10: { // PNG prediction (on encoding, PNG None on all rows)
                        $ddata[$k][$j] = $cur;
                        break;
                    }
                    case 11: { // PNG prediction (on encoding, PNG Sub on all rows)
                        $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                        break;
                    }
                    case 12: { // PNG prediction (on encoding, PNG Up on all rows)
                        $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                        break;
                    }
                    case 13: { // PNG prediction (on encoding, PNG Average on all rows)
                        // integer average: the bitwise AND must not be fed a fractional float
                        $average = (int) floor(($row_left + $row_up) / 2);
                        $ddata[$k][$j] = (($cur + $average) & 0xff);
                        break;
                    }
                    case 14: { // PNG prediction (on encoding, PNG Paeth on all rows)
                        // initial estimate
                        $p = ($row_left + $row_up - $row_upleft);
                        // distances
                        $pa = abs($p - $row_left);
                        $pb = abs($p - $row_up);
                        $pc = abs($p - $row_upleft);
                        $pmin = min($pa, $pb, $pc);
                        // pick the neighbour with the minimum distance (left, then up, then upper-left)
                        if ($pmin == $pa) {
                            $ddata[$k][$j] = (($cur + $row_left) & 0xff);
                        } elseif ($pmin == $pb) {
                            $ddata[$k][$j] = (($cur + $row_up) & 0xff);
                        } else {
                            $ddata[$k][$j] = (($cur + $row_upleft) & 0xff);
                        }
                        break;
                    }
                    default: { // PNG prediction (on encoding, PNG optimum)
                        $this->Error("Unknown PNG predictor $predictor");
                        break;
                    }
                }
            }
            $prev_row = $ddata[$k];
        } // end for each row
        // complete decoding
        unset($sdata);
        $sdata = array();
        // for every row
        foreach ($ddata as $k => $row) {
            // initialize new row
            $sdata[$k] = array(0, 0, 0);
            if ($wb[0] == 0) {
                // default type field
                $sdata[$k][0] = 1;
            }
            $i = 0; // count bytes on the row
            // for every column
            for ($c = 0; $c < 3; ++$c) {
                // for every byte on the column (big-endian)
                for ($b = 0; $b < $wb[$c]; ++$b) {
                    $byte = isset($row[$i]) ? $row[$i] : 0;
                    $sdata[$k][$c] += ($byte << (($wb[$c] - 1 - $b) * 8));
                    ++$i;
                }
            }
        }
        unset($ddata);
        // fill xref
        if (isset($index_first)) {
            $obj_num = $index_first;
        } else {
            $obj_num = 0;
        }
        foreach ($sdata as $k => $row) {
            // (n) objects that are in use but are not compressed
            if ($row[0] == 1) {
                // check if object already exist
                if (!isset($xref['xref'][$obj_num][$row[2]])) {
                    // store object offset position
                    $xref['xref'][$obj_num][$row[2]] = $row[1];
                }
            }
            // Type 0 (free), type 2 (compressed: $row[1] is the object
            // stream number, $row[2] the index within it) and unknown types
            // record nothing, but every entry still stands for one object
            // number, so the counter always advances.
            ++$obj_num;
        }
    } // end decoding data
    $xref['max_object'] = max($xref['max_object'], $obj_num);
    if (isset($prevxref)) {
        // get previous xref
        $xref = $this->getXrefData($prevxref, $xref);
    }
    return $xref;
}
/**
 * Get raw stream data.
 *
 * Skips leading white space, the 6 bytes of the "stream" keyword and the
 * line break characters that follow, then takes $length bytes.
 * @param $offset (int) Stream offset.
 * @param $length (int) Stream length.
 * @return array containing the stream object (PDF_TYPE_STREAM, content) and the offset just past the content
 * @protected
 */
protected function getRawStream($offset, $length) {
    $start = $offset + strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
    $start += 6; // "stream"
    $start += strspn($this->pdfdata, "\r\n", $start);
    $stream = array(PDF_TYPE_STREAM, substr($this->pdfdata, $start, $length));
    return array($stream, $start + $length);
}
/**
 * Get object type, raw value and offset to next object.
 *
 * Parses one token or object starting at $offset (after skipping white
 * space). Closing delimiters and keywords are reported with a string
 * type: ']', '>>', 'endobj' or 'endstream'. When nothing can be
 * recognised the type is '' and the offset is not advanced past the
 * unrecognised data.
 * @param $offset (int) Object offset.
 * @param $data (string) Data to parse; the whole document when null.
 * @return array containing the object (type followed by its value; an object reference holds type, object number, generation number) and the offset to the next object
 * @protected
 * @since 1.0.000 (2011-06-20)
 */
protected function getRawObject($offset=0, $data=null) {
    if ($data === null) {
        $data =& $this->pdfdata;
    }
    $objtype = ''; // object type to be returned
    $objval = ''; // object value to be returned
    $datalen = strlen($data);
    // skip initial white space chars: \x00 null (NUL), \x09 horizontal tab (HT), \x0A line feed (LF), \x0C form feed (FF), \x0D carriage return (CR), \x20 space (SP)
    if ($offset < $datalen) {
        $offset += strspn($data, "\x00\x09\x0a\x0c\x0d\x20", $offset);
    }
    if ($offset >= $datalen) {
        // nothing left to parse
        return array(array($objtype, $objval), $offset);
    }
    // get first char
    $char = $data[$offset];
    // get object type
    switch ($char) {
        case '%': { // \x25 PERCENT SIGN
            // skip comment and search for next token
            $next = strcspn($data, "\r\n", $offset);
            if ($next > 0) {
                $offset += $next;
                // return the (object, offset) pair like every other branch
                return $this->getRawObject($offset, $data);
            }
            break;
        }
        case '/': { // \x2F SOLIDUS
            // name object
            $objtype = PDF_TYPE_TOKEN;
            ++$offset;
            $length = strcspn($data, "\x00\x09\x0a\x0c\x0d\x20\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25", $offset);
            $objval = substr($data, $offset, $length);
            $offset += $length;
            break;
        }
        case '(': // \x28 LEFT PARENTHESIS
        case ')': { // \x29 RIGHT PARENTHESIS
            // literal string object
            $objtype = PDF_TYPE_STRING;
            ++$offset;
            $strpos = $offset;
            if ($char == '(') {
                $open_bracket = 1;
                while ($open_bracket > 0) {
                    if (!isset($data[$strpos])) {
                        break;
                    }
                    $ch = $data[$strpos];
                    switch ($ch) {
                        case '\\': { // REVERSE SOLIDUS (5Ch) (Backslash)
                            // skip next character
                            ++$strpos;
                            break;
                        }
                        case '(': { // LEFT PARENHESIS (28h)
                            ++$open_bracket;
                            break;
                        }
                        case ')': { // RIGHT PARENTHESIS (29h)
                            --$open_bracket;
                            break;
                        }
                    }
                    ++$strpos;
                }
                $objval = substr($data, $offset, ($strpos - $offset - 1));
                $offset = $strpos;
            }
            break;
        }
        case '[': // \x5B LEFT SQUARE BRACKET
        case ']': { // \x5D RIGHT SQUARE BRACKET
            // array object
            $objtype = PDF_TYPE_ARRAY;
            ++$offset;
            if ($char == '[') {
                // get array content
                $objval = array();
                do {
                    $before = $offset;
                    // get element
                    list($element, $offset) = $this->getRawObject($offset, $data);
                    $objval[] = $element;
                    // stop at the closing bracket, or when the parser made
                    // no progress (unparsable or truncated data) so that the
                    // loop cannot run forever
                } while (($element[0] !== ']') && ($offset > $before));
                // remove closing delimiter
                array_pop($objval);
            } else {
                $objtype = ']';
            }
            break;
        }
        case '<': // \x3C LESS-THAN SIGN
        case '>': { // \x3E GREATER-THAN SIGN
            if (isset($data[($offset + 1)]) AND ($data[($offset + 1)] == $char)) {
                // dictionary object
                $objtype = PDF_TYPE_DICTIONARY;
                if ($char == '<') {
                    list ($objval, $offset) = $this->getDictValue($offset, $data);
                } else {
                    $objtype = '>>';
                    $offset += 2;
                }
            } else {
                // hexadecimal string object
                $objtype = PDF_TYPE_HEX;
                ++$offset;
                // The "Panose" entry in the FontDescriptor Style dict seems to have hex bytes separated by spaces.
                if ($char == '<') {
                    // a run of hex digits/spaces immediately followed by '>'
                    $hexlen = strspn($data, '0123456789ABCDEFabcdef ', $offset);
                    if (($hexlen > 0) && isset($data[$offset + $hexlen]) && ($data[$offset + $hexlen] == '>')) {
                        $objval = substr($data, $offset, $hexlen);
                        $offset += $hexlen + 1;
                    }
                }
            }
            break;
        }
        default: {
            $frag = substr($data, $offset, 4);
            switch ($frag) {
                case 'endo':
                    // indirect object
                    $objtype = 'endobj';
                    $offset += 6;
                    break;
                case 'stre':
                    // Streams should always be indirect objects, and thus processed by getRawStream().
                    // If we get here, treat it as a null object as something has gone wrong.
                case 'null':
                    // null object
                    $objtype = PDF_TYPE_NULL;
                    $offset += 4;
                    $objval = 'null';
                    break;
                case 'true':
                    // boolean true object
                    $objtype = PDF_TYPE_BOOLEAN;
                    $offset += 4;
                    $objval = true;
                    break;
                case 'fals':
                    // boolean false object
                    $objtype = PDF_TYPE_BOOLEAN;
                    $offset += 5;
                    $objval = false;
                    break;
                case 'ends':
                    // end stream object
                    $objtype = 'endstream';
                    $offset += 9;
                    break;
                default:
                    $recognised = false;
                    if (preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+([Robj]{1,3})/i', substr($data, $offset, 33), $matches) == 1) {
                        if ($matches[3] == 'R') {
                            // indirect object reference
                            $objtype = PDF_TYPE_OBJREF;
                            $offset += strlen($matches[0]);
                            $objval = array(intval($matches[1]), intval($matches[2]));
                            $recognised = true;
                        } elseif ($matches[3] == 'obj') {
                            // object start
                            $objtype = PDF_TYPE_OBJECT;
                            $objval = intval($matches[1]).'_'.intval($matches[2]);
                            $offset += strlen($matches[0]);
                            $recognised = true;
                        }
                    }
                    // anything that is neither a reference nor an object
                    // header may still be a plain number
                    if (!$recognised && (($numlen = strspn($data, '+-.0123456789', $offset)) > 0)) {
                        // numeric object
                        $objval = substr($data, $offset, $numlen);
                        $objtype = (intval($objval) != $objval) ? PDF_TYPE_REAL : PDF_TYPE_NUMERIC;
                        $offset += $numlen;
                    }
                    unset($matches);
                    break;
            }
            break;
        }
    }
    $obj = array();
    $obj[] = $objtype;
    if ($objtype == PDF_TYPE_OBJREF && is_array($objval)) {
        // references are stored flat: (type, object number, generation number)
        foreach ($objval as $val) {
            $obj[] = $val;
        }
    } else {
        $obj[] = $objval;
    }
    return array($obj, $offset);
}
/**
 * Parse a dictionary that starts with "<<" at $offset.
 *
 * First isolates the text of the dictionary by balancing "<<" and ">>"
 * pairs, then parses that text as a sequence of key / value objects.
 * Both steps stop at the end of the data, and the second also stops when
 * no progress is made, so truncated or unparsable input cannot loop
 * forever.
 * @param $offset (int) Offset of the opening "<<".
 * @param $data (string) Data being parsed (passed by reference).
 * @return array containing the dictionary entries (keyed by name with a leading "/") and the offset just past the dictionary
 * @private
 */
private function getDictValue($offset, &$data) {
    $objval = array();
    $datalen = strlen($data);
    // Extract dict from data.
    $depth = 1;
    $offset += 2; // skip the opening "<<"
    $start = $offset;
    while (($depth > 0) && ($offset < $datalen)) {
        $cur = $data[$offset];
        $nxt = (($offset + 1) < $datalen) ? $data[$offset + 1] : '';
        if (($cur == '>') && ($nxt == '>')) {
            --$depth;
            $offset += 2;
        } elseif (($cur == '<') && ($nxt == '<')) {
            ++$depth;
            $offset += 2;
        } else {
            ++$offset;
        }
    }
    // everything consumed so far, including the closing ">>" when present
    $dict = substr($data, $start, $offset - $start);
    // Now that we have just the dict, parse it.
    $dictlen = strlen($dict);
    $dictoffset = 0;
    while ($dictoffset < $dictlen) {
        // Get dict element.
        list($key, $eloffset) = $this->getRawObject($dictoffset, $dict);
        if ($key[0] === '>>') {
            break;
        }
        list($element, $nextoffset) = $this->getRawObject($eloffset, $dict);
        if ($nextoffset <= $dictoffset) {
            // neither key nor value could be parsed: give up
            break;
        }
        $objval['/'.$key[1]] = $element;
        $dictoffset = $nextoffset;
    }
    return array($objval, $offset);
}
/**
 * Get content of indirect object.
 *
 * Checks that "N G obj" is present exactly at $offset and then reads the
 * elements of the object up to "endobj". A dictionary that has a /Length
 * entry and is followed by the "stream" keyword is read together with its
 * stream data; with $decoding the decoded stream is stored in element 3
 * of the stream entry.
 * @param $obj_ref (string) Object number and generation number separated by underscore character.
 * @param $offset (int) Object offset.
 * @param $decoding (boolean) If true decode streams.
 * @return array containing object data; array('null', 'null', $offset) when the object is not found at $offset.
 * @protected
 * @since 1.0.000 (2011-05-24)
 */
protected function getIndirectObject($obj_ref, $offset=0, $decoding=true) {
    $obj = explode('_', $obj_ref);
    if (($obj === false) OR (count($obj) != 2)) {
        $this->Error('Invalid object reference: '.$obj_ref);
        return;
    }
    $offset = intval($offset);
    $objref = $obj[0].' '.$obj[1].' obj';
    // compare only the bytes at $offset: cheaper than searching the rest
    // of the file and safe for offsets outside the document
    if (($offset < 0) || (substr($this->pdfdata, $offset, strlen($objref)) !== $objref)) {
        // an indirect reference to an undefined object shall be considered a reference to the null object
        return array('null', 'null', $offset);
    }
    // starting position of object content
    $offset += strlen($objref);
    // get array of object content
    $objdata = array();
    $i = 0; // object main index
    do {
        $before = $offset;
        $prev = ($i > 0) ? $objdata[($i - 1)] : null;
        $prev_is_dict = (is_array($prev) && isset($prev[0]) && ($prev[0] === PDF_TYPE_DICTIONARY) && is_array($prev[1]));
        $is_stream = false;
        if ($prev_is_dict && array_key_exists('/Length', $prev[1])) {
            // a /Length entry alone does not make a stream (plain
            // dictionaries may carry one too): require the keyword
            $probe = $offset + strspn($this->pdfdata, "\x00\x09\x0a\x0c\x0d\x20", $offset);
            $is_stream = (substr($this->pdfdata, $probe, 6) === 'stream');
        }
        if ($is_stream) {
            // Stream - get using /Length in stream's dict
            $lengthobj = $prev[1]['/Length'];
            if ($lengthobj[0] === PDF_TYPE_OBJREF) {
                $lengthobj = $this->getObjectVal($lengthobj);
                if ($lengthobj[0] === PDF_TYPE_OBJECT) {
                    $lengthobj = $lengthobj[1];
                }
            }
            $streamlength = $lengthobj[1];
            list($element, $offset) = $this->getRawStream($offset, $streamlength);
        } else {
            // get element
            list($element, $offset) = $this->getRawObject($offset);
        }
        // decode stream using stream's dictionary information
        if ($decoding && ($element[0] === PDF_TYPE_STREAM) && $prev_is_dict) {
            $element[3] = $this->decodeStream($prev[1], $element[1]);
        }
        $objdata[$i] = $element;
        ++$i;
        // strict comparison: type 0 (null object) must not be taken for
        // 'endobj'; also stop when the parser made no progress
    } while (($element[0] !== 'endobj') && ($offset > $before));
    // remove closing delimiter
    array_pop($objdata);
    // return raw object content
    return $objdata;
}
/**
 * Get the content of object, resolving indirect object reference if necessary.
 *
 * Anything that is not an object reference is returned unchanged. A
 * reference is looked up in the cache, then at its file offset, then in
 * the object streams. The result is an array with 0 => PDF_TYPE_OBJECT
 * (or PDF_TYPE_STREAM), 'obj' and 'gen' => the object key, 1 => the first
 * element of the object and, for streams, 2 => the stream element. When
 * the reference cannot be resolved it is returned unchanged.
 * @param $obj (array) Object value.
 * @return array containing object data.
 * @public
 * @since 1.0.000 (2011-06-26)
 */
public function getObjectVal($obj) {
    if (!is_array($obj) || !isset($obj[0]) || ($obj[0] != PDF_TYPE_OBJREF)) {
        return $obj;
    }
    if (is_string($obj[1]) && (strpos($obj[1], '_') !== false)) {
        $key = explode('_', $obj[1]);
    } else {
        $key = array($obj[1], $obj[2]);
    }
    $ret = array(0=>PDF_TYPE_OBJECT, 'obj'=>$key[0], 'gen'=>$key[1]);
    // reference to indirect object
    $object = null;
    if (isset($this->objects[$key[0]][$key[1]])) {
        // this object has been already parsed
        $object = $this->objects[$key[0]][$key[1]];
    } elseif (($offset = $this->findObjectOffset($key)) !== false) {
        // parse new object
        $this->objects[$key[0]][$key[1]] = $this->getIndirectObject($key[0].'_'.$key[1], $offset, false);
        $object = $this->objects[$key[0]][$key[1]];
    } elseif (($key[1] == 0) && isset($this->objstreamobjs[$key[0]])) {
        // Object is in an object stream
        $streaminfo = $this->objstreamobjs[$key[0]];
        $objs = $streaminfo[0];
        if (!isset($this->objstreams[$objs[0]][$objs[1]])) {
            // Fetch and decode object stream
            $objstream = $this->getObjectVal(array(PDF_TYPE_OBJREF, $objs[0], $objs[1]));
            if (!isset($objstream[0]) || ($objstream[0] !== PDF_TYPE_STREAM) || !isset($objstream[2][1]) || !isset($objstream[1][1])) {
                // the container could not be read as a stream
                return $obj;
            }
            $decoded = $this->decodeStream($objstream[1][1], $objstream[2][1]);
            $this->objstreams[$objs[0]][$objs[1]] = $decoded[0]; // Store just the data, in case we need more from this objstream
            // Free memory
            unset($objstream);
            unset($decoded);
        }
        $this->objects[$key[0]][$key[1]] = $this->getRawObject($streaminfo[1], $this->objstreams[$objs[0]][$objs[1]]);
        $object = $this->objects[$key[0]][$key[1]];
    }
    if (!is_null($object)) {
        $ret[1] = $object[0];
        if (isset($object[1][0]) && $object[1][0] == PDF_TYPE_STREAM) {
            $ret[0] = PDF_TYPE_STREAM;
            $ret[2] = $object[1];
        }
        return $ret;
    }
    return $obj;
}
/**
 * Extract object stream to find out what it contains.
 *
 * Decodes the stream and reads the header that precedes /First: a list
 * of "object number, offset" integer pairs. For each pair the object
 * number is registered in $this->objstreamobjs together with the key of
 * this stream and the offset of the object in the decoded data
 * (pair offset + /First).
 * @param $key (array) Key of the object stream (object number, generation number).
 */
public function extractObjectStream($key) {
    $objref = array(PDF_TYPE_OBJREF, $key[0], $key[1]);
    $obj = $this->getObjectVal($objref);
    if ($obj[0] !== PDF_TYPE_STREAM || !isset($obj[1][1]['/First'][1])) {
        // Not a valid object stream dictionary - skip it.
        return;
    }
    $stream = $this->decodeStream($obj[1][1], $obj[2][1]);// Decode object stream, as we need the first bit
    $first = intval($obj[1][1]['/First'][1]);
    // Get list of object / offset pairs; split on any white space run so
    // that line breaks or repeated blanks cannot misalign the pairs
    $ints = preg_split('/\s+/', substr($stream[0], 0, $first), -1, PREG_SPLIT_NO_EMPTY);
    if ($ints === false) {
        $ints = array();
    }
    $total = count($ints);
    for ($j = 1; $j < $total; $j += 2) {
        // $ints[$j - 1] = object number, $ints[$j] = offset relative to /First
        $this->objstreamobjs[$ints[$j - 1]] = array($key, intval($ints[$j]) + $first);
    }
    // Free memory - we may not need this at all.
    unset($obj);
    unset($stream);
}
/**
 * Find all object offsets. Saves having to scour the file multiple times.
 *
 * Every "N G obj" header found at the start of a line is recorded in
 * $this->objoffsets under its trimmed text, with the offset of its first
 * non-blank character (a later header with the same text replaces an
 * earlier one). When the following 256 bytes look like an object stream
 * dictionary, the stream is indexed through extractObjectStream().
 * @private
 */
private function findObjectOffsets() {
    $this->objoffsets = array();
    $found = preg_match_all('/(*ANYCRLF)^[\s]*([0-9]+)[\s]+([0-9]+)[\s]+obj/im', $this->pdfdata, $matches, PREG_OFFSET_CAPTURE);
    if ($found >= 1) {
        foreach ($matches[0] as $i => $match) {
            $offset = $match[1] + strspn($match[0], "\x00\x09\x0a\x0c\x0d\x20");
            $this->objoffsets[trim($match[0])] = $offset;
            $dictoffset = $match[1] + strlen($match[0]);
            // white space between "obj" and "<<" is optional
            if (preg_match('|^\s*<<[^>]+/ObjStm|', substr($this->pdfdata, $dictoffset, 256)) == 1) {
                $this->extractObjectStream(array($matches[1][$i][0], $matches[2][$i][0]));
            }
        }
    }
    unset($matches);
}
/**
 * Get offset of an object. Checks xref first, then offsets found by scouring the file.
 *
 * An xref offset is only trusted when "N G obj" is actually found at that
 * position in the file.
 * @param $key (array) Object key to find (obj, gen).
 * @return int Offset of the object in $this->pdfdata, or false when it is not known.
 * @private
 */
private function findObjectOffset($key) {
    $objref = $key[0].' '.$key[1].' obj';
    if (isset($this->xref['xref'][$key[0]][$key[1]])) {
        $offset = $this->xref['xref'][$key[0]][$key[1]];
        // compare only the bytes at $offset: avoids searching the rest of
        // the file and is safe for offsets outside the document
        if (is_int($offset) && ($offset >= 0) && (substr($this->pdfdata, $offset, strlen($objref)) === $objref)) {
            // Offset is in xref table and matches actual position in file
            return $offset;
        }
    }
    if (array_key_exists($objref, $this->objoffsets)) {
        // Offset found in internal reftable
        return $this->objoffsets[$objref];
    }
    return false;
}
/**
 * Decode the specified stream.
 *
 * Honours a numeric /Length that is shorter than the data, collects the
 * filter names from /Filter (a single name or an array of names) and
 * applies, in order, every filter the decoder class supports.
 * @param $sdic (array) Stream's dictionary array.
 * @param $stream (string) Stream to decode.
 * @return array containing decoded stream data and remaining filters.
 * @protected
 * @since 1.0.000 (2011-06-22)
 */
protected function decodeStream($sdic, $stream) {
    // get stream length and filters
    $slength = strlen($stream);
    if ($slength <= 0) {
        return array('', array());
    }
    $filters = array();
    if (!is_array($sdic)) {
        $sdic = array();
    }
    foreach ($sdic as $k => $v) {
        if (!is_array($v) || !isset($v[0])) {
            continue;
        }
        if (($k == '/Length') AND ($v[0] == PDF_TYPE_NUMERIC)) {
            // get declared stream length
            $declength = intval($v[1]);
            if ($declength < $slength) {
                $stream = substr($stream, 0, $declength);
                $slength = $declength;
            }
        } elseif ($k == '/Filter') {
            if ($v[0] == PDF_TYPE_TOKEN) {
                // single filter
                $filters[] = $v[1];
            } elseif ($v[0] == PDF_TYPE_ARRAY) {
                // array of filters
                foreach ($v[1] as $flt) {
                    if ($flt[0] == PDF_TYPE_TOKEN) {
                        $filters[] = $flt[1];
                    }
                }
            }
        }
    }
    // decode the stream
    $remaining_filters = array();
    $available = $this->FilterDecoders->getAvailableFilters();
    foreach ($filters as $filter) {
        if (in_array($filter, $available)) {
            $stream = $this->FilterDecoders->decodeFilter($filter, $stream);
        } else {
            // add missing filter to array
            $remaining_filters[] = $filter;
        }
    }
    return array($stream, $remaining_filters);
}
/**
 * Select the current page.
 *
 * The one-based page number is converted to a zero-based index; an index
 * outside the range of known pages is reported through the error handler.
 *
 * @param int $pageno Pagenumber to use
 */
public function setPageno($pageno) {
    $pageno = intval($pageno) - 1;
    $too_low = ($pageno < 0);
    if ($too_low || $pageno >= $this->getPageCount()) {
        $this->error("Pagenumber is wrong! (Requested $pageno, max ".$this->getPageCount().")");
    }
    $this->pageno = $pageno;
}
/**
 * Get the resources of the currently selected page.
 *
 * @return array
 */
public function getPageResources() {
    $current = $this->pages[$this->pageno];
    return $this->_getPageResources($current);
}
/**
 * Get page-resources from /Page
 *
 * Uses the /Resources entry of the object when it has one; otherwise the
 * lookup continues with its /Parent. Returns false when neither entry
 * exists.
 *
 * @param array $obj Array of pdf-data
 */
private function _getPageResources ($obj) { // $obj = /Page
    $node = $this->getObjectVal($obj);
    if (isset($node[1][1]['/Resources'])) {
        $found = $node[1][1]['/Resources'];
    } elseif (isset($node[1][1]['/Parent'])) {
        // inherit from the parent node
        $found = $this->_getPageResources($node[1][1]['/Parent']);
    } else {
        return false;
    }
    // unwrap a value that is wrapped in an object
    return ($found[0] == PDF_TYPE_OBJECT) ? $found[1] : $found;
}
/**
 * Get content of current page
 *
 * Every content stream of the page is decoded and the results are
 * concatenated, each followed by a single space. A page without
 * /Contents yields an empty string.
 *
 * @return string
 */
public function getContent() {
    if (!isset($this->pages[$this->pageno][1][1]['/Contents'])) {
        return '';
    }
    $streams = $this->_getPageContent($this->pages[$this->pageno][1][1]['/Contents']);
    $buffer = '';
    foreach ($streams as $stream) {
        $buffer .= $this->_rebuildContentStream($stream) . ' ';
    }
    return $buffer;
}
/**
 * Resolve all content-objects
 *
 * A reference is resolved; when it leads to an array, or when an array is
 * given directly, every element is resolved in turn. Anything else
 * yields an empty list.
 *
 * @param array $content_ref
 * @return array
 */
private function _getPageContent($content_ref) {
    if ($content_ref[0] == PDF_TYPE_OBJREF) {
        $resolved = $this->getObjectVal($content_ref);
        if ($resolved[1][0] == PDF_TYPE_ARRAY) {
            return $this->_getPageContent($resolved[1]);
        }
        return array($resolved);
    }
    $collected = array();
    if ($content_ref[0] == PDF_TYPE_ARRAY) {
        foreach ($content_ref[1] as $item) {
            foreach ($this->_getPageContent($item) as $stream) {
                $collected[] = $stream;
            }
        }
    }
    return $collected;
}
/**
 * Rebuild content-streams
 *
 * Reads the /Filter entry of the stream dictionary (resolving it when it
 * is a reference) and applies every listed filter, in order, to the raw
 * stream data.
 *
 * @param array $obj
 * @return string
 */
private function _rebuildContentStream($obj) {
    $chain = array();
    if (isset($obj[1][1]['/Filter'])) {
        $spec = $obj[1][1]['/Filter'];
        if ($spec[0] == PDF_TYPE_OBJREF) {
            $resolved = $this->getObjectVal($spec);
            $spec = $resolved[1];
        }
        if ($spec[0] == PDF_TYPE_TOKEN) {
            // single filter name
            $chain = array($spec);
        } elseif ($spec[0] == PDF_TYPE_ARRAY) {
            // list of filter names
            $chain = $spec[1];
        }
    }
    $stream = $obj[2][1];
    foreach ($chain as $entry) {
        $stream = $this->FilterDecoders->decodeFilter($entry[1], $stream);
    }
    return $stream;
}
/**
 * Get a Box from a page
 * Arrayformat is same as used by fpdf_tpl
 *
 * When the page has no such box the lookup continues with its /Parent;
 * false is returned when there is no parent either. All values of the
 * result are divided by $k.
 *
 * @param array $page a /Page
 * @param string $box_index Type of Box @see $availableBoxes
 * @param float Scale factor from user space units to points
 * @return array
 */
public function getPageBox($page, $box_index, $k) {
    $page = $this->getObjectVal($page);
    $box = isset($page[1][1][$box_index]) ? $page[1][1][$box_index] : null;
    if (!is_null($box) && $box[0] == PDF_TYPE_OBJREF) {
        $resolved = $this->getObjectVal($box);
        $box = $resolved[1];
    }
    if (!is_null($box) && $box[0] == PDF_TYPE_ARRAY) {
        // the four numbers of the box, in the order they are stored
        $x0 = $box[1][0][1];
        $y0 = $box[1][1][1];
        $x1 = $box[1][2][1];
        $y1 = $box[1][3][1];
        $result = array();
        $result['x'] = $x0 / $k;
        $result['y'] = $y0 / $k;
        $result['w'] = abs($x0 - $x1) / $k;
        $result['h'] = abs($y0 - $y1) / $k;
        $result['llx'] = min($x0, $x1) / $k;
        $result['lly'] = min($y0, $y1) / $k;
        $result['urx'] = max($x0, $x1) / $k;
        $result['ury'] = max($y0, $y1) / $k;
        return $result;
    }
    if (!isset($page[1][1]['/Parent'])) {
        return false;
    }
    $parent = $this->getObjectVal($page[1][1]['/Parent']);
    return $this->getPageBox($parent, $box_index, $k);
}
/**
 * Get all page boxes by page no
 *
 * @param int The page number (one-based)
 * @param float Scale factor from user space units to points
 * @return array
 */
public function getPageBoxes($pageno, $k) {
    $page = $this->pages[$pageno - 1];
    return $this->_getPageBoxes($page, $k);
}
/**
 * Get all boxes from /Page
 *
 * Looks up every name listed in $availableBoxes and keeps the ones for
 * which a box was found.
 *
 * @param array a /Page
 * @param float Scale factor from user space units to points
 * @return array
 */
private function _getPageBoxes($page, $k) {
    $boxes = array();
    foreach ($this->availableBoxes as $name) {
        $candidate = $this->getPageBox($page, $name, $k);
        if ($candidate) {
            $boxes[$name] = $candidate;
        }
    }
    return $boxes;
}
/**
 * Get the page rotation by pageno
 *
 * @param integer $pageno The page number (one-based)
 * @return array
 */
public function getPageRotation($pageno) {
    $page = $this->pages[$pageno - 1];
    return $this->_getPageRotation($page);
}
/**
 * Get the /Rotate entry of a page.
 *
 * Uses the (resolved) /Rotate entry of the object when it has one;
 * otherwise the lookup continues with its /Parent. Returns false when
 * neither entry exists.
 *
 * @param array $obj a /Page
 */
private function _getPageRotation($obj) { // $obj = /Page
    $node = $this->getObjectVal($obj);
    if (isset($node[1][1]['/Rotate'])) {
        $found = $this->getObjectVal($node[1][1]['/Rotate']);
    } elseif (isset($node[1][1]['/Parent'])) {
        // inherit from the parent node
        $found = $this->_getPageRotation($node[1][1]['/Parent']);
    } else {
        return false;
    }
    // unwrap a value that is wrapped in an object
    return ($found[0] == PDF_TYPE_OBJECT) ? $found[1] : $found;
}
/**
 * Fatal error handler: prints the message with a fixed prefix and stops
 * the script.
 * @param $msg (string) The error message
 * @public
 * @since 1.0.000 (2011-05-23)
 */
public function Error($msg) {
    $report = 'TCPDF_PARSER ERROR: '.$msg;
    // print the report and halt execution
    exit($report);
}
} // END OF TCPDF_PARSER CLASS
//============================================================+
// END OF FILE
//============================================================+