File: buffer.rb

package: ruby-pdf-reader 2.15.0-1
# coding: ASCII-8BIT
# typed: strict
# frozen_string_literal: true

################################################################################
#
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader

  # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
  # string, repeated calls to token() will return the next token from the source.
  #
  # This is very low level, and getting the raw tokens is not very useful in itself.
  #
  # This will usually be used in conjunction with PDF::Reader::Parser, which converts
  # the raw tokens into objects we can work with (strings, ints, arrays, etc)
  #
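  # A rough usage sketch (illustrative only, not taken from the library docs;
  # the token values shown are what the tokeniser below produces):
  #
  #   buf = PDF::Reader::Buffer.new(StringIO.new("1 0 obj << /Type /Page >>"))
  #   buf.token # => "1"
  #   buf.token # => "0"
  #   buf.token # => "obj"
  #   buf.token # => "<<"
  #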
  class Buffer
    TOKEN_WHITESPACE = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20] #: Array[Integer]
    TOKEN_DELIMITER = [0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F] #: Array[Integer]

    # Some strings for comparisons. Declaring them here avoids creating new
    # strings that need GC over and over.
    LEFT_PAREN = "(" #: String
    LESS_THAN = "<" #: String
    STREAM = "stream" #: String
    ID = "ID" #: String
    FWD_SLASH = "/" #: String
    NULL_BYTE = "\x00" #: String
    CR = "\r" #: String
    LF = "\n" #: String
    CRLF = "\r\n" #: String
    WHITE_SPACE = ["\n", "\r", ' '] #: Array[String]

    # Quite a few PDFs have trailing junk, sometimes several KB of NUL bytes.
    # Allow for that here.
    TRAILING_BYTECOUNT = 5000 #: Integer

    # must match whole tokens
    DIGITS_ONLY = %r{\A\d+\z} #: Regexp

    #: Integer
    attr_reader :pos

    # Creates a new buffer.
    #
    # Params:
    #
    #   io - an IO stream (usually a StringIO) with the raw data to tokenise
    #
    # options:
    #
    #   :seek - a byte offset to seek to before starting to tokenise
    #   :content_stream - set to true if buffer will be tokenising a
    #                     content stream. Defaults to false
    #
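    #
    # For example (an illustrative sketch; io is any seekable IO and 42 is an
    # arbitrary offset):
    #
    #   PDF::Reader::Buffer.new(io, :seek => 42, :content_stream => true)
    #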
    #: ((StringIO | Tempfile | IO), ?Hash[Symbol, untyped]) -> void
    def initialize(io, opts = {})
      @io = io
      @tokens = [] #: Array[String | PDF::Reader::Reference]
      @in_content_stream = opts[:content_stream] #: bool

      @io.seek(opts[:seek]) if opts[:seek]
      @pos = @io.pos #: Integer
    end

    # return true if there are no more tokens left
    #
    #: () -> bool
    def empty?
      prepare_tokens if @tokens.size < 3

      @tokens.empty?
    end

    # return raw bytes from the underlying IO stream.
    #
    #   bytes - the number of bytes to read
    #
    # options:
    #
    #   :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
    #               that is sitting under the io cursor.
    #
    #   Note: skipping a bare CR is not spec-compliant, because the data itself
    #   may begin with a LF. We check for a CRLF pair first though, so the
    #   ambiguity is avoided.
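    #
    # Illustrative sketch (assumed usage, not from the upstream docs): once a
    # "stream" keyword has been tokenised, the raw stream body can be pulled
    # out with something like
    #
    #   data = buffer.read(stream_length, :skip_eol => true)
    #
    # where stream_length stands in for the /Length value of the stream
    # dictionary.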
    #: (Integer, ?Hash[Symbol, untyped]) -> String?
    def read(bytes, opts = {})
      reset_pos

      if opts[:skip_eol]
        @io.seek(-1, IO::SEEK_CUR)
        str = @io.read(2)
        if str.nil?
          return nil
        elsif str == CRLF # This MUST be done before checking for CR alone
          # do nothing
        elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
          @io.seek(-1, IO::SEEK_CUR)
        else
          @io.seek(-2, IO::SEEK_CUR)
        end
      end

      bytes = @io.read(bytes)
      save_pos
      bytes
    end

    # return the next token from the source. Returns a String (or a
    # PDF::Reader::Reference for indirect references) if a token is found, or
    # nil if there are no tokens left.
    #
    #: () -> (nil | String | PDF::Reader::Reference)
    def token
      reset_pos
      prepare_tokens if @tokens.size < 3
      merge_indirect_reference
      prepare_tokens if @tokens.size < 3

      @tokens.shift
    end

    # return the byte offset where the first XRef table in the source can be found.
    #
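    # For example (illustrative): if the file ends with
    #
    #   startxref
    #   230
    #   %%EOF
    #
    # then this returns 230.
    #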
    #: () -> Integer
    def find_first_xref_offset
      check_size_is_non_zero
      @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
      data = @io.read(TRAILING_BYTECOUNT)

      raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?

      # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
      lines = data.split(/[\n\r]+/).reverse
      eof_index = lines.index { |l| l.strip[/^%%EOF/] }

      raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
      raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
      offset = lines[eof_index+1].to_i

      # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
      # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
      raise MalformedPDFError, "invalid xref offset" if offset < 0
      offset
    end

    private

    #: () -> void
    def check_size_is_non_zero
      @io.seek(-1, IO::SEEK_END)
      @io.seek(0)
    rescue Errno::EINVAL
      raise MalformedPDFError, "PDF file is empty"
    end

    # Returns true if this buffer is parsing a content stream
    #
    #: () -> bool
    def in_content_stream?
      @in_content_stream ? true : false
    end

    # Some bastard moved our IO stream cursor. Restore it.
    #
    #: () -> void
    def reset_pos
      @io.seek(@pos) if @io.pos != @pos
    end

    # save the current position of the source IO stream. If someone else (like another buffer)
    # moves the cursor, we can then restore it.
    #
    #: () -> void
    def save_pos
      @pos = @io.pos
    end

    # attempt to prime the buffer with the next few tokens.
    #
    #: () -> void
    def prepare_tokens
      10.times do
        case state
        when :literal_string then prepare_literal_token
        when :hex_string     then prepare_hex_token
        when :regular        then prepare_regular_token
        when :inline         then prepare_inline_token
        end
      end

      save_pos
    end

    # tokenising behaves slightly differently based on the current context.
    # Determine the current context/state by examining the last token we found
    #
    #: () -> Symbol
    def state
      case @tokens.last
      when LEFT_PAREN then :literal_string
      when LESS_THAN then :hex_string
      when STREAM then :stream
      when ID
        if in_content_stream? && @tokens[-2] != FWD_SLASH
          :inline
        else
          :regular
        end
      else
        :regular
      end
    end

    # detect a series of 3 tokens that make up an indirect object reference. If
    # we find them, replace the tokens with a single PDF::Reader::Reference
    # instance.
    #
    # Merging them into a single string was another option, but that would mean
    # code further up the stack would need to check every token to see if it
    # looks like an indirect reference. For optimisation reasons, I'd rather
    # avoid that extra check.
    #
    # It's incredibly likely that the next 3 tokens in the buffer are NOT an
    # indirect reference, so test for that case first and avoid the relatively
    # expensive regexp checks if possible.
    #
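    # For example (illustrative): the token run ["32", "0", "R"] at the head of
    # the buffer collapses into a single PDF::Reader::Reference.new(32, 0).
    #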
    #: () -> void
    def merge_indirect_reference
      return if @tokens.size < 3
      return if @tokens[2] != "R"

      token_one = @tokens[0]
      token_two = @tokens[1]
      if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
        @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
        @tokens.delete_at(2)
        @tokens.delete_at(1)
      end
    end

    # Extract the data between the ID and EI markers of an inline image.
    # If the EI follows white-space, that space is dropped from the data.
    # The EI must be followed by white-space or the end of the buffer; this
    # reduces the chance of accidentally matching an embedded EI.
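    #
    # For example (illustrative): in a buffer created with :content_stream =>
    # true, the fragment "BI /W 2 /H 2 ID xxxx EI" leaves the four bytes "xxxx"
    # in the token queue as a single frozen string, with the whitespace either
    # side of them dropped.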
    #: () -> void
    def prepare_inline_token
      idstart = @io.pos
      prevchr = ''
      eisize = 0 # how many chars in the end marker
      seeking = 'E' # what are we looking for now?
      loop do
        chr = @io.read(1)
        break if chr.nil?
        case seeking
        when 'E'
          if chr == 'E'
            seeking = 'I'
            if WHITE_SPACE.include? prevchr
              eisize = 3 # include whitespace in delimiter, i.e. drop from data
            else # assume the EI immediately follows the data
              eisize = 2 # leave prevchr in data
            end
          end
        when 'I'
          if chr == 'I'
            seeking = ''
          else
            seeking = 'E'
          end
        when ''
          if WHITE_SPACE.include? chr
            eisize += 1 # Drop trailer
            break
          else
            seeking = 'E'
          end
        end
        prevchr = chr.is_a?(String) ? chr : ''
      end
      unless seeking == ''
        raise MalformedPDFError, "EI terminator not found"
      end
      eiend = @io.pos
      @io.seek(idstart, IO::SEEK_SET)
      str = @io.read(eiend - eisize - idstart) # get the ID content
      @tokens << str.freeze if str
    end

    # if we're currently inside a hex string, read hex nibbles until
    # we find a closing >
    #
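    # For example (illustrative): with the input "<48656C6C6F>" the "<" has
    # already been pushed by prepare_regular_token; this method then pushes
    # "48656C6C6F" followed by ">".
    #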
    #: () -> void
    def prepare_hex_token
      str = "".dup

      loop do
        byte = @io.getbyte
        if byte.nil?
          break
        elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
          # a hex digit (lenient: any ASCII letter is accepted, not just A-F)
          str << byte
        elsif byte <= 32
          # whitespace and control bytes inside a hex string are ignored
        else
          @tokens << str if str.size > 0
          # if the string was terminated by something other than '>', close it anyway
          @tokens << ">" if byte != 0x3E # '>'
          @tokens << byte.chr
          break
        end
      end
    end

    # if we're currently inside a literal string we more or less just read bytes until
    # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
    # start of a new token in regular mode are left untouched when inside a literal
    # string.
    #
    # The entire literal string will be returned as a single token. It will need further
    # processing to fix things like escaped new lines, but that's someone else's
    # problem.
    #
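    # For example (illustrative): with the input "(Hello (nested) world)" the
    # "(" has already been pushed; this method then pushes "Hello (nested) world"
    # followed by ")".
    #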
    #: () -> void
    def prepare_literal_token
      str = "".dup
      count = 1

      while count > 0
        byte = @io.getbyte
        if byte.nil?
          count = 0 # unbalanced parens, treat as the end of the string
        elsif byte == 0x5C # backslash escape: pass the next byte through verbatim
          str << byte
          escaped = @io.getbyte
          str << escaped unless escaped.nil? # guard against EOF right after the backslash
        elsif byte == 0x28 # "("
          str << "("
          count += 1
        elsif byte == 0x29 # ")"
          count -= 1
          str << ")" unless count == 0
        else
          str << byte unless count == 0
        end
      end

      @tokens << str if str.size > 0
      @tokens << ")"
    end

    # Extract the next regular token and store it in our buffer, ready to be returned.
    #
    # What each byte means is complex; check out section "3.1.1 Character Set" of the
    # 1.7 spec to read up on it.
    #
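    # For example (illustrative): repeated calls over the input "/Type /Pages>>"
    # yield the tokens "/", "Type", "/", "Pages" and ">>".
    #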
    #: () -> void
    def prepare_regular_token
      tok = "".dup

      loop do
        byte = @io.getbyte

        case byte
        when nil
          break
        when 0x25
          # comment, ignore everything until the next EOL char
          loop do
            commentbyte = @io.getbyte
            break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
          end
        when *TOKEN_WHITESPACE
          # white space, token finished
          @tokens << tok if tok.size > 0

          # If the token was empty, chomp the rest of the whitespace too
          while TOKEN_WHITESPACE.include?(peek_byte) && tok.size == 0
            @io.getbyte
          end
          tok = "".dup
          break
        when 0x3C
          # opening delimiter '<', start of new token
          @tokens << tok if tok.size > 0
          if peek_byte == 0x3C # check if token is actually '<<'
            @io.getbyte
            @tokens << "<<"
          else
            @tokens << "<"
          end
          tok = "".dup
          break
        when 0x3E
          # closing delimiter '>', start of new token
          @tokens << tok if tok.size > 0
          if peek_byte == 0x3E # check if token is actually '>>'
            @io.getbyte
            @tokens << ">>"
          else
            @tokens << ">"
          end
          tok = "".dup
          break
        when 0x28, 0x5B, 0x7B
          # opening delimiter, start of new token
          @tokens << tok if tok.size > 0
          @tokens << byte.chr
          tok = "".dup
          break
        when 0x29, 0x5D, 0x7D
          # closing delimiter
          @tokens << tok if tok.size > 0
          @tokens << byte.chr
          tok = "".dup
          break
        when 0x2F
          # PDF name, start of new token
          @tokens << tok if tok.size > 0
          @tokens << byte.chr
          # a "/" immediately followed by a space, newline, another delimiter or
          # EOF is a valid empty name; represent it with an empty token
          @tokens << "" if byte == 0x2F && ([nil, 0x20, 0x0A] + TOKEN_DELIMITER).include?(peek_byte)
          tok = "".dup
          break
        else
          tok << byte
        end
      end

      @tokens << tok if tok.size > 0
    end

    # peek at the next character in the io stream, leaving the stream position
    # untouched
    #
    #: () -> (Integer | nil)
    def peek_byte
      byte = @io.getbyte
      @io.seek(-1, IO::SEEK_CUR) if byte
      byte
    end

  end
end