File: page_text_receiver.rb

package info (click to toggle)
ruby-pdf-reader 2.15.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 33,512 kB
  • sloc: ruby: 11,959; sh: 46; makefile: 11
file content (202 lines) | stat: -rw-r--r-- 5,863 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# coding: utf-8
# typed: true
# frozen_string_literal: true

require 'forwardable'
require 'pdf/reader/page_layout'

module PDF
  class Reader

    # Builds a UTF-8 string of all the text on a single page by processing all
    # the operators in a content stream.
    #
    class PageTextReceiver
      # Forwardable supplies the def_delegators macro used for the forwarders
      # further down.
      extend Forwardable

      # A single space. internal_show_text compares each decoded glyph against
      # this value; glyphs equal to it are not recorded as text runs.
      SPACE = " " #: String

      # The object every forwarder below delegates to. It is assigned in
      # #page= (a PageState built for the current page).
      #: untyped
      attr_reader :state

      # Reader for @options.
      # NOTE(review): nothing in this class assigns @options, so this
      # presumably returns nil unless it is set elsewhere - confirm.
      #: untyped
      attr_reader :options

      ########## BEGIN FORWARDERS ##########
      # Every method named in this section is defined on this class and simply
      # forwarded, with its arguments, to @state.

      # Graphics State Operators
      def_delegators :@state, :save_graphics_state, :restore_graphics_state

      # Matrix Operators
      def_delegators :@state, :concatenate_matrix

      # Text Object Operators
      def_delegators :@state, :begin_text_object, :end_text_object

      # Text State Operators
      def_delegators :@state, :set_character_spacing, :set_horizontal_text_scaling
      def_delegators :@state, :set_text_font_and_size, :font_size
      def_delegators :@state, :set_text_leading, :set_text_rendering_mode
      def_delegators :@state, :set_text_rise, :set_word_spacing

      # Text Positioning Operators
      def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
      def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
      ##########  END FORWARDERS  ##########

      # Starts a new page: every piece of per-page state held by the receiver
      # is replaced, so nothing collected for a previous page is kept.
      #
      # +page+ is stored as-is and is also used to build the PageState that
      # the forwarded operator methods delegate to.
      def page=(page)
        @state = PageState.new(page)
        @page = page
        # NOTE(review): @content is not read anywhere in this class as shown;
        # presumably kept for compatibility - confirm before removing.
        @content = []
        # the individual text runs appended by internal_show_text
        @characters = []
      end

      # Returns the text runs recorded for the current page after passing them
      # through a fixed sequence of filters. The recorded list itself is never
      # reassigned; each step produces the list handed to the next one.
      #
      # Keys read from +opts+, in the order the steps run:
      #
      # :rect::             passed to BoundingRectangleRunsFilter.runs_within_rect;
      #                     defaults to the page's CropBox rectangle and the
      #                     step is skipped when the value is nil or false
      # :skip_zero_width::  when truthy (default true) applies
      #                     ZeroWidthRunsFilter.exclude_zero_width_runs
      # :skip_overlapping:: when truthy (default true) applies
      #                     OverlappingRunsFilter.exclude_redundant_runs
      # :merge::            when truthy (default true) applies merge_runs
      # :only::             when given, passed to AdvancedTextRunFilter.only
      # :exclude::          when given, passed to AdvancedTextRunFilter.exclude
      #
      # NoTextFilter.exclude_empty_strings always runs, just before the merge
      # step.
      def runs(opts = {})
        kept = @characters

        bounds = opts.fetch(:rect, @page.rectangles[:CropBox])
        kept = BoundingRectangleRunsFilter.runs_within_rect(kept, bounds) if bounds

        kept = ZeroWidthRunsFilter.exclude_zero_width_runs(kept) if opts.fetch(:skip_zero_width, true)
        kept = OverlappingRunsFilter.exclude_redundant_runs(kept) if opts.fetch(:skip_overlapping, true)

        # not optional: this step has no switch in opts
        kept = NoTextFilter.exclude_empty_strings(kept)

        kept = merge_runs(kept) if opts.fetch(:merge, true)

        only_filter = opts.fetch(:only, nil)
        kept = AdvancedTextRunFilter.only(kept, only_filter) if only_filter

        exclude_filter = opts.fetch(:exclude, nil)
        kept = AdvancedTextRunFilter.exclude(kept, exclude_filter) if exclude_filter

        kept
      end

      # deprecated
      #
      # Returns the page text as one string: the runs produced by #runs (with
      # its default options) are handed to PageLayout together with the page's
      # MediaBox rectangle, and the layout is converted with to_s.
      def content
        layout_box = @page.rectangles[:MediaBox]
        layout = PageLayout.new(runs, layout_box)
        layout.to_s
      end

      #####################################################
      # Text Showing Operators
      #####################################################
      # Handles the Tj operator, e.g. (AWAY) Tj: records text that is drawn on
      # the page.
      #
      # +string+ is handed unchanged to internal_show_text, which is where the
      # glyphs are decoded and recorded.
      def show_text(string) # Tj (AWAY)
        internal_show_text(string)
      end

      # Handles the TJ operator, whose operand mixes strings with numeric
      # position adjustments, e.g. [(A) 120 (WA) 20 (Y)].
      #
      # Each entry of +params+ is handled in order: strings are recorded via
      # internal_show_text, numbers are passed to the state as a glyph
      # displacement, and entries of any other kind are skipped.
      def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
        params.each do |entry|
          case entry
          when String  then internal_show_text(entry)
          when Numeric then @state.process_glyph_displacement(0, entry, false)
          end
        end
      end

      # Handles the ' operator: asks the state to move to the start of the
      # next line and then shows +str+ exactly as show_text would.
      def move_to_next_line_and_show_text(str) # '
        # the move has to happen first so the text is recorded at the new
        # position
        @state.move_to_start_of_next_line
        show_text(str)
      end

      # Handles the " operator: stores +aw+ as the word spacing and +ac+ as the
      # character spacing on the state, then behaves like
      # move_to_next_line_and_show_text for +string+.
      def set_spacing_next_line_show_text(aw, ac, string) # "
        # both spacings are set before the text is shown
        @state.set_word_spacing(aw)
        @state.set_character_spacing(ac)
        move_to_next_line_and_show_text(string)
      end

      #####################################################
      # XObjects
      #####################################################
      # Invokes the XObject registered under +label+.
      #
      # The lookup is left to the state, which yields the XObject to the
      # block. A PDF::Reader::FormXObject is walked with this receiver as the
      # callback target; any other kind of XObject is ignored.
      def invoke_xobject(label)
        @state.invoke_xobject(label) do |xobject|
          xobject.walk(self) if xobject.is_a?(PDF::Reader::FormXObject)
        end
      end

      private

      # Decodes +string+ with the current font and records one TextRun per
      # glyph in @characters, advancing the text position held by the state
      # after every glyph.
      #
      # Glyphs that decode to exactly SPACE are not recorded; they only move
      # the text position and are reported to the state as a word boundary.
      #
      # +string+ is first checked with
      # PDF::Reader::Error.validate_type_as_malformed. Raises
      # PDF::Reader::MalformedPDFError when the state has no current font.
      def internal_show_text(string)
        PDF::Reader::Error.validate_type_as_malformed(string, "string", String)

        # fetch the font once so the nil check and every later use refer to
        # the same object
        font = @state.current_font
        raise PDF::Reader::MalformedPDFError, "current font is invalid" if font.nil?

        font.unpack(string).each do |glyph_code|
          # work out where the current glyph is painted
          x, y = @state.trm_transform(0, 0)
          x, y = apply_rotation(x, y)

          text = font.to_utf8(glyph_code)
          word_boundary = text == SPACE

          # NOTE(review): the recorded width is glyph width * font size only;
          # no horizontal scaling factor is applied here - confirm intended.
          glyph_width = font.glyph_width_in_text_space(glyph_code)
          run_width = glyph_width * @state.font_size
          @characters << TextRun.new(x, y, run_width, @state.font_size, text) unless word_boundary

          # apply the glyph displacement for the current glyph so the next
          # glyph will appear in the correct position
          @state.process_glyph_displacement(glyph_width, 0, word_boundary)
        end
      end

      # Adjusts the point (x, y) according to the page's rotate value and
      # returns the adjusted pair as [x, y]:
      #
      #    90 => [y, -x]
      #   180 => [-x, -y]
      #   270 => [-y, x]
      #
      # Any other rotate value leaves the point unchanged.
      def apply_rotation(x, y)
        case @page.rotate
        when 90  then [y, x * -1]
        when 180 then [x * -1, y * -1]
        when 270 then [y * -1, x]
        else          [x, y]
        end
      end

      # Takes a collection of TextRun objects and merges any that are in close
      # proximity.
      #
      # Runs are bucketed by the integer part of their y coordinate. Each
      # bucket is sorted and handed to group_chars_into_runs, and the combined
      # result is returned as one flat, sorted array.
      def merge_runs(runs)
        runs
          .group_by { |run| run.y.to_i }
          .flat_map { |_y, line| group_chars_into_runs(line.sort) }
          .sort
      end

      # Walks +chars+ in the order given and builds a new array of runs: a
      # char is folded into the run before it (using +) when that run reports
      # it as mergable?, and otherwise starts a new run of its own.
      #
      # Returns the new array; +chars+ itself is not modified.
      def group_chars_into_runs(chars)
        merged = []
        chars.each do |char|
          if merged.empty? || !merged.last.mergable?(char)
            merged << char
          else
            merged[-1] = merged.last + char
          end
        end
        merged
      end

    end
  end
end