File: column_spec.rb

package info (click to toggle)
ruby-pdf-reader 1.3.3-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 12,908 kB
  • ctags: 569
  • sloc: ruby: 8,330; makefile: 10
file content (140 lines) | stat: -rw-r--r-- 5,821 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# coding: utf-8

require File.dirname(__FILE__) + "/spec_helper"

describe PDF::Reader, "column specs" do

  # Examples exercising text extraction from page 1 of the
  # "column_integration" fixture PDF.
  context "page 1" do
    # The extracted page text must contain the headline.
    it "should correctly extract the headline" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        pdf.page(1).text.should =~ /Some Headline/
      end
    end

    # Every pattern below must match somewhere in the extracted page text.
    # Most of them expect one text fragment, a run of whitespace, and then a
    # second fragment.
    it "should correctly extract the first few lines" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(1).text
        expected_patterns = [
          /ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu/,
          /Lorem ipsum dolor sit amet, consectetur adipisic -\s+adipisicing elit, sed do eiusmod tempor incididunt/,
          /ing elit, sed do eiusmod tempor incididunt ut labore\s+ut labore et dolore magna aliqua. Ut enim ad minim/,
          /et dolore magna aliqua. Ut enim ad minim veniam,\s+veniam, quis nostrud exercitation ullamco laboris/,
          /quis nostrud exercitation ullamco laboris nisi ut\s+nisi ut aliquip ex ea commodo consequat. Duis aute/,
          /aliquip ex ea commodo consequat. Duis aute irure\s+irure dolor in reprehenderit in voluptate velit esse/
        ]

        expected_patterns.each do |pattern|
          page_text.should =~ pattern
        end
      end
    end

    it "should align text from the second column" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(1).text

        # These lines belong to the second column, so each of them should
        # start at the same offset (counted from the left) within its own
        # line of the extracted text.
        second_column_lines = [
          /adipisicing elit, sed do eiusmod tempor incididunt$/,
          /ut labore et dolore magna aliqua. Ut enim ad minim$/,
          /veniam, quis nostrud exercitation ullamco laboris$/,
          /nisi ut aliquip ex ea commodo consequat. Duis aute$/,
          /irure dolor in reprehenderit in voluptate velit esse$/
        ]
        offsets = second_column_lines.map do |pattern|
          find_position_of_match(page_text, pattern)
        end
        reference = offsets.first

        # The first line must be found at all; the rest must line up with it.
        reference.should_not be_nil
        offsets[1..-1].each do |offset|
          reference.should eql(offset)
        end
      end
    end
  end

  # Examples checking that lines belonging to one column of page 2 of the
  # "column_integration" fixture PDF start at the same offset in the
  # extracted text.
  context "page 2" do
    it "should correctly align text in column 1" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(2).text

        # Lines from the first column of the page, prior to the interruption.
        column_lines = [
          /^tate velit esse cillum dolore eu/,
          /^fugiat nulla pariatur. Excepteur/,
          /^sint occaecat cupidatat non proi -/,
          /^dent, sunt in culpa qui officia de-/
        ]
        offsets = column_lines.map do |pattern|
          find_position_of_match(page_text, pattern)
        end
        reference = offsets.first

        reference.should_not be_nil
        offsets[1..-1].each do |offset|
          reference.should eql(offset)
        end
      end
    end

    it "should correctly align text in column 2" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(2).text

        # Lines from the second column of the page, prior to the interruption.
        # Each pattern also requires the start of the text that follows it on
        # the same line.
        column_lines = [
          /occaecat cupidatat non proident,\s*anim/,
          /sunt in culpa qui officia deserunt\s*sum/,
          /mollit anim id est laborum. Lo -\s*adipisicing/,
          /rem ipsum dolor sit amet, con -\s*tempor/
        ]
        offsets = column_lines.map do |pattern|
          find_position_of_match(page_text, pattern)
        end
        reference = offsets.first

        reference.should_not be_nil
        offsets[1..-1].each do |offset|
          reference.should eql(offset)
        end
      end
    end

    it "should correctly align text in column 3 before the interruption" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(2).text

        # Lines from the third column of the page, prior to the interruption.
        column_lines = [
          /anim id est laborum. Lorem ip -$/,
          /sum dolor sit amet, consectetur$/,
          /adipisicing elit, sed do eiusmod$/,
          /tempor incididunt ut labore et$/
        ]
        offsets = column_lines.map do |pattern|
          find_position_of_match(page_text, pattern)
        end
        reference = offsets.first

        reference.should_not be_nil
        offsets[1..-1].each do |offset|
          reference.should eql(offset)
        end
      end
    end

    it "should correctly align text in column 3 during the interruption" do
      PDF::Reader.open(pdf_spec_file("column_integration")) do |pdf|
        page_text = pdf.page(2).text

        # Lines from the third column of the page _during_ the interruption.
        # Each pattern starts with ten whitespace characters, so the compared
        # offset is where that whitespace run begins.
        column_lines = [
          /\s{10}dolore magna aliqua. Ut$/,
          /\s{10}enim ad minim veniam,$/,
          /\s{10}quis nostrud exercitation$/,
          /\s{10}ullamco laboris nisi ut$/
        ]
        offsets = column_lines.map do |pattern|
          find_position_of_match(page_text, pattern)
        end
        reference = offsets.first

        reference.should_not be_nil
        offsets[1..-1].each do |offset|
          reference.should eql(offset)
        end
      end
    end
  end

  # Scans +source+ one line at a time and reports where +regex+ first matches.
  #
  # source - object responding to #each_line (the specs pass page text).
  # regex  - pattern tested against each line in turn.
  #
  # Returns the offset of the match within the first line that matches
  # (0 means the match starts at the beginning of that line), or nil when
  # no line matches.
  def find_position_of_match(source, regex)
    source.each_line do |row|
      offset = (row =~ regex)
      # An offset of 0 is a valid hit, so test for nil explicitly.
      return offset unless offset.nil?
    end

    nil
  end

end