1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
|
<?php
/////////////////////////////////////////////////////////////////
/// getID3() by James Heinrich <info@getid3.org> //
// available at https://github.com/JamesHeinrich/getID3 //
// or https://www.getid3.org //
// or http://getid3.sourceforge.net //
// see readme.txt for more details //
/////////////////////////////////////////////////////////////////
// //
// module.misc.pdf.php //
// module for analyzing PDF files //
// dependencies: NONE //
// ///
/////////////////////////////////////////////////////////////////
// Stop immediately when GETID3_INCLUDEPATH is not defined, i.e. when this module
// is accessed directly; this prevents path-exposing attacks on public webservers.
if (defined('GETID3_INCLUDEPATH') === false) {
	exit;
}
/**
 * getID3 module for PDF files.
 *
 * Reads the "%PDF-x.y" header, locates the classic Cross-Reference Table (XREF)
 * through the "startxref" pointer near the end of the file, and inspects the
 * objects listed there to find the page count.
 */
class getid3_pdf extends getid3_handler
{
	/** misc.pdf
	 * return full details of PDF Cross-Reference Table (XREF)
	 *
	 * @var bool
	 */
	public $returnXREF = false;

	/**
	 * For each object number, the byte offset of the XREF table that supplied its
	 * current entry. Used so that a table located later in the file (a more recent
	 * incremental update) is never overwritten by an older table parsed afterwards.
	 *
	 * @var array
	 */
	private $xrefEntrySource = array();

	/**
	 * Identify the file as PDF and populate $info['pdf'] (header version,
	 * trailer "startxref" offset, XREF details and page count when found).
	 *
	 * @return bool false when the "%PDF-" header is missing, true otherwise
	 *              (structural problems are reported via error()/warning())
	 */
	public function Analyze() {
		$info = &$this->getid3->info;

		// start from a clean slate in case this handler instance is reused
		$this->xrefEntrySource = array();

		$this->fseek(0);
		if (!preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
			$this->error('Did not find "%PDF" at the beginning of the PDF');
			return false;
		}
		$info['pdf']['header']['version'] = floatval($matches[1]);
		$info['fileformat'] = 'pdf';

		// the PDF Cross-Reference Table (XREF) is located near the end of the file
		// the starting offset is specified in the penultimate section, on the two lines just before "%%EOF"
		// the first line is "startxref", the second line is the byte offset of the XREF.
		// We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
		// and we're not sure if the line ends are one or two bytes, so we might find "startxref" as little as 18(?) bytes
		// from EOF, but it could be 30 bytes, so we start 40 bytes back just to be safe and do a search for the data we want.
		$this->fseek(-40, SEEK_END);
		if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
			$info['pdf']['trailer']['startxref'] = intval($matches[1]);
			$this->parseXREF($info['pdf']['trailer']['startxref']);
			if (!empty($info['pdf']['xref']['offset'])) {
				$this->scanForAdditionalXREF();

				asort($info['pdf']['xref']['offset']);
				$maxObjLengths = $this->calculateMaxObjectLengths(
					$info['pdf']['xref']['offset'],
					$info['pdf']['xref']['entry'],
					(!empty($info['pdf']['xref']['xref_offsets']) ? $info['pdf']['xref']['xref_offsets'] : array())
				);

				foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
					if ($info['pdf']['xref']['entry'][$objectNumber] === 'f') {
						// "free" object means "deleted", ignore
						continue;
					}
					if (empty($maxObjLengths[$objectNumber]) || ($maxObjLengths[$objectNumber] >= $this->getid3->option_fread_buffer_size)) {
						// ignore objects that are zero-size, of unknown size, or not smaller than the
						// configured read buffer, they are unlikely to contain information we're interested in
						continue;
					}
					$this->fseek($offset);
					$objBlob = $this->fread($maxObjLengths[$objectNumber]);
					if (!preg_match('#^'.$objectNumber.'[\\x00 \\r\\n\\t]*([0-9]+)[\\x00 \\r\\n\\t]*obj[\\x00 \\r\\n\\t]*(.*)(endobj)?[\\x00 \\r\\n\\t]*$#s', $objBlob, $matches)) {
						$this->error('Unexpected structure "'.substr($objBlob, 0, 100).'" at offset '.$offset);
						break;
					}
					$objectData = $matches[2];
					if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches) && preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
						$info['pdf']['pages'] = (int) $matches[1];
						// for now this is the only data we're looking for in the PDF, no need to loop through
						// every object in the file (and a large PDF may contain MANY objects). And it MAY be
						// possible that there are other objects elsewhere in the file that define additional
						// (or removed?) pages
						break;
					}
				}

				if (!$this->returnXREF) {
					unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry'], $info['pdf']['xref']['xref_offsets']);
				}
			} else {
				$this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
			}
		} else {
			$this->error('Did not find "startxref" in the last 40 bytes of the PDF');
		}

		$this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
		return true;
	}

	/**
	 * Whether the highest object number seen so far exceeds the total number of
	 * XREF entries parsed, which suggests another XREF table exists in the file.
	 *
	 * @return bool
	 */
	private function xrefAppearsIncomplete() {
		$info = &$this->getid3->info;

		if (empty($info['pdf']['xref']['offset'])) {
			return false;
		}
		$parsedEntries = (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);
		return (max(array_keys($info['pdf']['xref']['offset'])) > $parsedEntries);
	}

	/**
	 * Brute-force scan of the file, from the beginning, for "xref" lines that have
	 * not been parsed yet. Each one found is parsed; scanning stops as soon as the
	 * XREF data no longer appears incomplete, or at end of file.
	 *
	 * @return void
	 */
	private function scanForAdditionalXREF() {
		$info = &$this->getid3->info;

		if (!$this->xrefAppearsIncomplete()) {
			return;
		}
		$this->fseek(0);
		while (!$this->feof()) {
			$XREFoffset = (int) $this->ftell();
			if (rtrim((string) $this->fgets()) !== 'xref') {
				continue;
			}
			if (!empty($info['pdf']['xref']['xref_offsets']) && in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'], true)) {
				// already parsed this table
				continue;
			}
			// parseXREF() moves the file pointer; remember where to resume so no line is skipped
			$resumeOffset = $this->ftell();
			$this->parseXREF($XREFoffset);
			if (!$this->xrefAppearsIncomplete()) {
				break;
			}
			$this->fseek($resumeOffset);
		}
	}

	/**
	 * Calculate the maximum possible byte length of every in-use object: the
	 * distance from its offset to the next higher offset at which another in-use
	 * object or a known XREF table begins. Objects with nothing known after them
	 * get no entry.
	 *
	 * Free entries are left out because their first field is, per the PDF
	 * specification, the number of the next free object rather than a byte offset.
	 *
	 * @param array $offsets     object number => first field of the XREF entry
	 * @param array $entries     object number => 'n' (in use) or 'f' (free)
	 * @param array $xrefOffsets byte offsets of the XREF tables parsed so far
	 *
	 * @return array object number => maximum length in bytes
	 */
	private function calculateMaxObjectLengths(array $offsets, array $entries, array $xrefOffsets) {
		$inUseOffsets = array();
		foreach ($offsets as $objectNumber => $offset) {
			if (isset($entries[$objectNumber]) && ($entries[$objectNumber] === 'f')) {
				continue;
			}
			$inUseOffsets[$objectNumber] = $offset;
		}
		asort($inUseOffsets);

		$boundaries = array_merge(array_values($inUseOffsets), array_values($xrefOffsets));
		sort($boundaries);
		$boundaryCount = count($boundaries);

		$maxObjLengths = array();
		$next = 0;
		foreach ($inUseOffsets as $objectNumber => $offset) {
			// both lists are in ascending order, so the boundary cursor only ever moves forward
			while (($next < $boundaryCount) && ($boundaries[$next] <= $offset)) {
				$next++;
			}
			if ($next < $boundaryCount) {
				$maxObjLengths[$objectNumber] = $boundaries[$next] - $offset;
			}
		}
		return $maxObjLengths;
	}

	/**
	 * Parse the XREF table starting at the given offset (all of its subsections)
	 * into $info['pdf']['xref'].
	 *
	 * An entry is not stored when the same object number was already supplied by
	 * an XREF table located at a higher file offset.
	 *
	 * @param int $XREFoffset byte offset of the "xref" keyword line
	 *
	 * @return bool true when at least one subsection was parsed completely
	 */
	private function parseXREF($XREFoffset) {
		$info = &$this->getid3->info;
		$XREFoffset = (int) $XREFoffset;

		$this->fseek($XREFoffset);
		if (rtrim((string) $this->fgets()) !== 'xref') {
			$this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
			return false;
		}

		// record the table before parsing it so a broken table is never parsed twice
		if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'], true)) {
			$info['pdf']['xref']['xref_offsets'][] = $XREFoffset;
			sort($info['pdf']['xref']['xref_offsets']);
		}

		// a table consists of one or more subsections, each introduced by a
		// "<first object number> <entry count>" line; anything else (e.g. "trailer") ends the table
		$subsectionsParsed = 0;
		while (preg_match('#^([0-9]+) +([0-9]+)$#', rtrim((string) $this->fgets()), $header)) {
			$firstObjectNumber = (int) $header[1];
			$XREFcount         = (int) $header[2];
			$info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);

			for ($i = 0; $i < $XREFcount; $i++) {
				$line = rtrim((string) $this->fgets());
				if (!preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
					$this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
					return false;
				}
				$objectNumber = $firstObjectNumber + $i;
				if (isset($this->xrefEntrySource[$objectNumber]) && ($this->xrefEntrySource[$objectNumber] > $XREFoffset)) {
					// a table further into the file already described this object; keep that entry
					continue;
				}
				$this->xrefEntrySource[$objectNumber]           = $XREFoffset;
				$info['pdf']['xref']['offset'][$objectNumber]     = (int) $matches[1];
				$info['pdf']['xref']['generation'][$objectNumber] = (int) $matches[2];
				$info['pdf']['xref']['entry'][$objectNumber]      = $matches[3];
			}
			$subsectionsParsed++;
		}

		if ($subsectionsParsed === 0) {
			$this->warning('failed to find XREF subsection header in XREF table at offset '.$XREFoffset);
			return false;
		}
		return true;
	}
}
|