File: page_layout.rb

package info (click to toggle)
ruby-pdf-reader 2.15.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 33,512 kB
  • sloc: ruby: 11,959; sh: 46; makefile: 11
file content (141 lines) | stat: -rw-r--r-- 4,289 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# coding: utf-8
# typed: strict
# frozen_string_literal: true

require 'pdf/reader/overlapping_runs_filter'
require 'pdf/reader/zero_width_runs_filter'

class PDF::Reader

  # Takes a collection of TextRun objects and renders them into a single
  # string that best approximates the way they'd appear on a rendered PDF page.
  #
  # The page is modelled as a grid of characters. The number of rows is derived
  # from the page height and the mean font size of the runs, the number of
  # columns from the page width and the median glyph width of the runs. Each
  # run is then written into the grid at the cell matching its x/y position.
  #
  # mediabox should be a PDF::Reader::Rectangle that describes the dimensions
  # of the page to be rendered. A 4 number array (as found in the page's
  # MediaBox attribute) is still accepted, but is deprecated and triggers a
  # warning on stderr.
  class PageLayout

    # Font size used to size the rows when the mean font size of the runs is 0
    # (which includes the case where there are no runs at all).
    DEFAULT_FONT_SIZE = 12 #: Numeric

    # runs     - the TextRun objects to lay out. Each must respond to x, y,
    #            font_size, mean_character_width and text.
    # mediabox - the page dimensions, see the class documentation.
    #
    # The nil check on mediabox is delegated to
    # PDF::Reader::Error.validate_not_nil.
    #
    #: (Array[PDF::Reader::TextRun], Array[Numeric] | PDF::Reader::Rectangle) -> void
    def initialize(runs, mediabox)
      PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")

      @mediabox = process_mediabox(mediabox) #: PDF::Reader::Rectangle
      @runs = runs #: Array[PDF::Reader::TextRun]
      @mean_font_size   = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE #: Numeric
      # mean() reports 0 for an empty collection, and 0 is truthy in ruby, so
      # the fallback has to be applied explicitly
      @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
      @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0 #: Numeric
      # the left-most run is rendered in the first column
      @x_offset = @runs.map(&:x).min || 0 #: Numeric
      lowest_y = @runs.map(&:y).min || 0 #: Numeric
      # only shift the page vertically when a run has a negative y
      @y_offset = lowest_y > 0 ? 0 : lowest_y #: Numeric
      @row_count = nil #: Numeric | nil
      @col_count = nil #: Numeric | nil
      @row_multiplier = nil #: Numeric | nil
      @col_multiplier = nil #: Numeric | nil
    end

    # Renders the runs onto the character grid and returns the result as a
    # single string, one line per grid row. Blank rows at the top and bottom
    # are dropped and trailing whitespace is stripped from every line.
    #
    # Returns an empty string when there is nothing that can be laid out: no
    # runs, a page too short to hold a single row, or runs whose median glyph
    # width is not positive (no column grid can be derived from that).
    #
    #: () -> String
    def to_s
      return "" if @runs.empty?
      return "" if row_count == 0
      return "" unless @median_glyph_width > 0

      page = row_count.times.map { |i| " " * col_count }
      @runs.each do |run|
        x_pos = ((run.x - @x_offset) / col_multiplier).round
        y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
        next unless y_pos.between?(0, row_count) && x_pos.between?(0, col_count)

        # y_pos counts rows from the top starting at 1. A run that sits right
        # at the top edge of the page rounds to 0; keep it in the first row
        # rather than letting the -1 index wrap around to the last row.
        row_index = [y_pos - 1, 0].max
        local_string_insert(page[row_index], run.text, x_pos)
      end
      interesting_rows(page).map(&:rstrip).join("\n")
    end

    private

    # Width of the page, as reported by the mediabox rectangle.
    #
    #: () -> Numeric
    def page_width
      @mediabox.width
    end

    # Height of the page, as reported by the mediabox rectangle.
    #
    #: () -> Numeric
    def page_height
      @mediabox.height
    end

    # given an array of strings, return a new array with empty rows from the
    # beginning and end removed. Rows that contain only whitespace count as
    # empty. Empty rows between two rows with text are kept.
    #
    #   interesting_rows([ "", "one", "two", "" ])
    #   => [ "one", "two" ]
    #
    #: (untyped) -> untyped
    def interesting_rows(rows)
      first_line_with_text = rows.index { |row| !row.strip.empty? }
      return [] if first_line_with_text.nil?

      last_line_with_text = rows.rindex { |row| !row.strip.empty? }
      rows[first_line_with_text..last_line_with_text]
    end

    # Number of rows in the character grid: how many lines of mean font size
    # fit into the page height. Memoized.
    #
    #: () -> untyped
    def row_count
      @row_count ||= (page_height / @mean_font_size).floor
    end

    # Number of columns in the character grid: how many glyphs of median width
    # fit into the page width, plus 5%. Memoized. Must only be called when the
    # median glyph width is positive.
    #
    #: () -> untyped
    def col_count
      @col_count ||= ((page_width  / @median_glyph_width) * 1.05).floor
    end

    # Height of a single grid row in page units. Memoized.
    #
    #: () -> untyped
    def row_multiplier
      @row_multiplier ||= page_height.to_f / row_count.to_f
    end

    # Width of a single grid column in page units. Memoized.
    #
    #: () -> untyped
    def col_multiplier
      @col_multiplier ||= page_width.to_f / col_count.to_f
    end

    # Arithmetic mean of a collection of numbers, as a Float. Returns the
    # Integer 0 for an empty collection.
    #
    #: (untyped) -> untyped
    def mean(collection)
      if collection.size == 0
        0
      else
        collection.inject(0) { |accum, v| accum + v} / collection.size.to_f
      end
    end

    # Middle element of the sorted collection. For a collection with an even
    # number of elements this is the upper of the two middle elements. Returns
    # 0 for an empty collection.
    #
    #: (untyped) -> untyped
    def median(collection)
      if collection.size == 0
        0
      else
        collection.sort[(collection.size * 0.5).floor]
      end
    end

    # Overwrites the characters of haystack starting at index with needle,
    # modifying haystack in place. If needle runs past the end of haystack,
    # haystack grows to fit it.
    #
    # The start/length form of String#[]= is used on purpose: a range built as
    # index..(index + length - 1) turns into 0..-1 for an empty needle at
    # index 0, which would replace (and so erase) the whole row.
    #
    #: (untyped, untyped, untyped) -> untyped
    def local_string_insert(haystack, needle, index)
      haystack[index, needle.length] = needle
    end

    # Normalises the mediabox passed to the constructor. An Array is converted
    # with PDF::Reader::Rectangle.from_array after printing a deprecation
    # warning to stderr, anything else is returned unchanged.
    #
    #: (untyped) -> untyped
    def process_mediabox(mediabox)
      if mediabox.is_a?(Array)
        msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
          " please use a Rectangle instead"
        $stderr.puts msg
        PDF::Reader::Rectangle.from_array(mediabox)
      else
        mediabox
      end
    end

  end
end