File: pdf_rbld.ps

package info (click to toggle)
ghostscript 8.71~dfsg2-9+squeeze1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 79,896 kB
  • ctags: 80,654
  • sloc: ansic: 501,432; sh: 25,689; python: 4,853; cpp: 3,633; perl: 3,597; tcl: 1,480; makefile: 1,187; lisp: 407; asm: 284; xml: 263; awk: 66; csh: 17; yacc: 15
file content (344 lines) | stat: -rw-r--r-- 15,056 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
%    Copyright (C) 2002 Artifex Software, Inc.  All rights reserved.
% 
% This software is provided AS-IS with no warranty, either express or
% implied.
% 
% This software is distributed under license and may not be copied,
% modified or distributed except as expressly authorized under the terms
% of the license contained in the file LICENSE in this distribution.
% 
% For more information about licensing, please refer to
% http://www.ghostscript.com/licensing/. For information on
% commercial licensing, go to http://www.artifex.com/licensing/ or
% contact Artifex Software, Inc., 101 Lucas Valley Road #110,
% San Rafael, CA  94903, U.S.A., +1(415)492-9861.

% $Id: pdf_rbld.ps 9175 2008-10-19 20:32:03Z alexcher $
% pdf_rbld.ps - Rebuilding of broken PDF files (xref errors)

% This module contains routines that are used if we detect an error
% while reading the xref tables.  These routines will scan the file and
% build an xref table by finding the objects.  We also need to find the
% appropriate trailer dictionary.  Note:  One procedure is also used
% even if we do not need to rebuild a PDF file.
%
% This module cannot rebuild a PDF file which has had errors created inside
% of objects or binary data streams.  It often succeeds with files that
% have had its end of lines converted between unix and dos versions.

% if true --> we have an object with duplicate object and generation numbers.
/dup_obj_gen_num false def

% Note:  This procedure is also used by non-rebuild code.
% Store a line in the xref array (Actually Objects and Generations arrays)
% <obj num> (strm num> <obj loc> <gen num> <rebuild>
%                         setxrefentry <obj num> strm num> <obj loc> <gen num>
/setxrefentry
{
  5 1 roll
  dup 65535 or 65535 ne {
    (   **** Warning:  Generation number out of 0..65535 range, assuming 0.\n)
    pdfformaterror
    pop 0
  } if
       	% We store generation numbers as value + 1
	% We reserve 0 to indicate an free xref entry
  1 add			% increment generation number
	% To save space, generations numbers are stored in a lstring unless we
	% find a generation number greater than 255.  If so then transfer to 
	% an larray.
  dup 255 gt {
    Generations ltype /stringtype eq {	% Convert Generations to an larray.
      larray Generations llength lgrowto dup	% Create new larray
      0 1 2 index llength 1 sub {	% Copy from old lstring to new larray
	Generations 1 index lget lput dup
      } for
      pop
      /Generations exch store		% Save new Generations larray
    } if
  } if
	% Verify that the new values are for a new object.  If the current
	% entry is null then we have a new entry.
  Objects 4 index lget null eq {
    ObjectStream 4 index 4 index cvx lput % Save ObjectStream object number
    Objects 4 index 3 index cvx lput	% Save object location
    Generations 4 index 2 index lput	% Save geenration number
  } {
	% Verify that the new entry has at least as high a generaton number
	% We accept equal entry number because we have found PDF files in
	% which there are multiple objects with the same object and entry
	% numbers.  The normal xref logic only accepts the first such
	% entry that it finds.  However the 'rebuild PDF' logic can find
	% both such entries.  The correct one is usually the last one.
    Generations 4 index lget 1 index le {
      ObjectStream 4 index 4 index cvx lput % Save ObjectStream object number
      Objects 4 index 3 index cvx lput	% Save object location
      Generations 4 index 2 index lput	% Save geenration number
    } if
	% Set error flag if we have equal object and generation numbers
    Generations 4 index lget 1 index eq { /dup_obj_gen_num true def } if
  } 8 -1 roll { ifelse } { pop if } ifelse  % Run 'else' only when rebuilding.
} bind def

% Print the contents of the xref array.  This actually consists of three
% arrays (Objects, Generations, and ObjectStream).  All three are larrays.
% larrays are a special Ghostscript object which can be arrays with more
% than 64k elements.
/print_xref				% - print_xref -
{ 0 1 Objects llength 1 sub		% stack: 0 1 <number of objects - 1>
  { dup =only				% print object number
    (  ) print
    dup Generations exch lget 1 sub =only % print Generation number
    (  ) print
    dup ObjectStream exch lget ==only	% print ObjectStream object number
    (  ) print
    Objects exch lget ===		% print object location
  } for
  flush
} bind def

% Get token from string and check its type
%   <string> <type> typed_token <false>		% no token or not match
%   <string> <type> typed_token <obj> <last> <true>	% matching token type
% Where last is the string remainder
/typed_token
{ exch
  token_nofail			% get token
  {
    dup type			% stack:  type last token type
    4 -1 roll eq {		% stack:  last token bool
      exch true			% desired object found - set exit status
    } {
      pop pop false		% not type - clear stack, set exit status
    } ifelse
  } {
    pop false			% no token - pop type, set exit status
  } ifelse			% check if we got token
} bind def

% Allocate space for post_eof_count to be bound into procedures below.
/post_eof_count 0 def
 
% We want the location of the trailer dictionary at the start of file.
% First we will find the xref.  Then we will skip over the xref entries
% to the trailer.
/search_start_trailer		% - search_start_trailer <trailer loc>
{ % Read the first 300 bytes and check for xref
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% location of end of data
  300 .min			% block size to read
  dup string 0 1 4 -1 roll 1 sub
  { 2 copy PDFfile read pop put pop } for
  (xref) search {
    % found 'xref'
    exch pop exch pop length 4 add PDFfile exch setfileposition
    PDFfile token pop		% get starting entry - or 'trailer'
    (trailer) ne {		% if we do not already have 'trailer'
      PDFfile token pop		% get number of entries
      PDFfile token pop pop	% this moves us into the middle of the first entry
      25 string exch		% define working string for readline
      { PDFfile 1 index readline pop pop
      } repeat			% skip entries
      pop			% pop working string
      PDFfile token pop pop	% get 'trailer'
      PDFfile fileposition	% get file position
    } if
  } {
    pop search_end_trailer	% no xref, should not happen, search end of file
  } ifelse
} bind def

% We want the location of the trailer dictionary at the end of file.
% We will read the last block of data and search for the final occurance
% of the word 'trailer'
/search_end_trailer		% - search_end_trailer <trailer loc>
{ % Position to read block of data from the end of the file.  Note:  We ignore
  % anything past the last %%EOF since this is not PDF data.
  PDFfile 0 setfileposition
  PDFfile bytesavailable post_eof_count sub	% location of end of data
  dup 65535 .min		% block size to read
				% stack: <file end pos> <block size>
  % move file position to the start of the block
  2 copy sub PDFfile exch setfileposition
  % read block of data
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
  % search for last occurance of 'trailer'
  (trailer) { search not { exit } if pop } loop
  % determine where the trailer is in the file
  %   trailer loc = end loc - remaing string length
  length sub 
} bind def

% We want to find the trailer dictionary.  There is a trailer dictionary
% for each xref object list.  We only want the trailer dictionary associated
% with the first xref object list.  In theory this can be anywhere in the
% file.  However since we are trying to repair a broken file, we cannot simply
% follow the xref links.  So we are falling back to a simple strategy.  We
% find the specified location of the first xref list.  If its location is in
% the first half of the file then we search for the first trailer dictionary
% at the start of the file.  Otherwise we search for the last trailer at the
% end of the file.
/search_trailer			% - search_trailer -
{ % Find the 'startxref' and associated position at the end of the file.
  % Position to read block of data from the end of the file.  Note:  We
  % actually end at the end of the last %%EOF since this is the end of the
  % useful PDF data.  (Some files contain trailing garbage.)
  PDFfile 0 setfileposition
  PDFfile bytesavailable	% size of file
  post_eof_count sub dup	% location of end of last %%EOF
  dup 4096 .min			% block size to read
  % stack: <useful file size> <useful file size file> <block size>
  % move file position to the start of the block
  2 copy sub PDFfile exch setfileposition
  % read block of data
  dup string 0 1 4 -1 roll 1 sub { 2 copy PDFfile read pop put pop } for
  % search for last occurance of 'startxref'
  false				% Assume that startxref not present
  exch (startxref) { 
     search not { exit } if	% Exit loop when no more startxref's
     pop 3 -1 roll pop true 3 1 roll	% Idicate that we have found starxref
  } loop
  exch				% Exch last string and 'found' flag
  { 
     % determine where the startxref is in the file
     %   'startxref' loc = end loc - remaing string length - 9 bytes
     length sub 9 sub
     % move the file to this position and read startxref and position
     PDFfile exch setfileposition PDFfile token
     pop pop PDFfile token pop
  } {
     % startxref not found.  We will search the end of the file for trailer.
     pop pop PDFfilelen
  } ifelse
  % compare xref position to 1/2 the length of the file and search for trailer
  exch 2 div lt { search_start_trailer } { search_end_trailer } ifelse
  % get the trailer
  PDFfile exch setfileposition		% set to the specified trailer location
  /dictlevelcount 0 def
  PDFfile traileropdict .pdfrun		% read trailer info
  /Trailer exch def
} bind def

% This routine will determine if there is stuff after the %%EOF.  There is
% supposed to be only a line termination.  However many real life files
% contain some garbage.  This routine checks how much.  We then ignore this
% stuff when we are scanning for objects.
/determine_post_eof_count		% - determine_post_eof_count <count>
{ % Position to read block of data from the end of the file. 
  PDFfilelen			% size of file
  dup 4096 .min	                % file_size block_size
  dup 3 1 roll sub              % block_size file_size-block_size 
  PDFfile exch setfileposition  % block_size
  string PDFfile exch readstring pop % ()

  % search for last occurance of 'startxref', '%%EOF' is often damaged
  (startxref) {
    search not { exit } if pop
  } loop
  % how much is left = remaining string length

  % Now search for %%EO or try to read a number after 'startxref'.
  (%%EO) search {
    pop pop 
  } {
    % Look for a number after startxref
    { dup token { pop exch pop } if
    } stopped {
      pop
    } if
  } ifelse
  length
} bind def

% This routine will scan a file searaching for object locations to build
% an alternate version of the data in the xref tables.
% Its purpose is to provide a basis for an xref fixing facility.
/search_objects				% - search_objects -
{ % Initialize the Objects, Generations, etc. larrays
  initPDFobjects
  % reset duplicate object and generation numbers error flag
  /dup_obj_gen_num false def
  % Determine how many bytes are in the file after the final %%EOF
  /post_eof_count determine_post_eof_count def
  % Start at the beginning of the file
  PDFfile 0 setfileposition
  % Create a working string (and also store its length on stack).  We are
  % using a maximum size string size the logic below wants a recovered object
  % to fit into our working string.
  65535 dup string
  { % Now loop through the entire file lloking for objects
    PDFfile fileposition		% save current file position
    % When we get near the end of the file, we use a smaller interval of
    % our working string to prevent reading past the end.  (See comments on
    % EOF testing below.)
    PDFfile bytesavailable post_eof_count sub 10 sub dup 4 index lt {
      2 index 0 3 -1 roll getinterval	% near EOF, use interval of string
    } { pop 1 index			% not near end, use full working string
    }ifelse
    % Read a line from file.  If the line does not fit into our working string,
    % or any other error, then we will discard it.
    PDFfile exch { readline } .internalstopped
    { pop pop false } if		% indicate no string if we stopped
    { % stack: <length> <working_str> <loc> <string>
      % Now that we have line, get obj num, ref num, and 'obj'.  Verify that each
      % of these is correct type.
      /integertype typed_token {	% get obj number
        /integertype typed_token {	% get ref number
          /nametype typed_token {	% get 'obj' text
	    pop				% pop remaining string
	    /obj eq {			% verify name is 'obj'
	      % make sure we have room in the arrays.  We work in increments
	      % of 20 each time we increase the size.
	      1 index 20 add 20 idiv 20 mul
	      growPDFobjects
	      % save xref parameters into ObjectStream, Objects and Generations
	      1 index 0			% rearrange parms for setxrefentry
	      4 index PDFoffset sub 3 index
	      //true setxrefentry	% save parameters
	      pop pop pop pop		% clear parameters
	    } if			% check if name is 'obj'
          } if				% check if we got 'obj" string
          pop				% remove ref number
        } if				% check if we got ref number
        pop				% remove obj number
      } if				% check if we got object number
    } if				% check if got a string from readline
    pop					% remove location
    % Check if we are approaching the end of the file.  We do not want to
    % read past the end of the file since that closes it.  We actually stop
    % 10-20 bytes early since there cannot be an object that close to the end.
    % (There is a Trailer dictionary, etc. at the end of the file.)
    PDFfile bytesavailable post_eof_count sub 20 lt { exit } if
  } loop				% loop through the entire file
  pop pop				% remove working string and its length
  % Output warning if we have two objects with the same object and generation
  % numbers.
  dup_obj_gen_num {
    (   **** Warning:  There are objects with matching object and generation\n)
    pdfformaterror
    (   **** numbers.  The accuracy of the resulting image is unknown.\n)
    pdfformaterror
  } if
} bind def

% Print warning message because we found a problem while reading the xref
% tables
/print_xref_warning
{ (   **** Warning:  An error occurred while reading an XREF table.\n)
  pdfformaterror
  (   **** The file has been damaged.  This may have been caused\n)
  pdfformaterror
  (   **** by a problem while converting or transfering the file.\n)
  pdfformaterror
  (   **** Ghostscript will attempt to recover the data.\n)
  pdfformaterror
} bind def

% Attempt to recover the XRef data.  This is called if we have a failure
% while reading the normal XRef tables.  This routine usually works
% only for pre PDF1.5 versions of PDF files.
/recover_xref_data		% - recover_xref_data -
{ print_xref_warning		% Print warning message
  count pdfemptycount sub { pop } repeat % remove anything left by readxref
  search_objects		% Search for objects
} bind def