File: fuzzy_paragraphs.rb

package info (click to toggle)
ruby-pdf-reader 2.15.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 33,512 kB
  • sloc: ruby: 11,959; sh: 46; makefile: 11
file content (24 lines) | stat: -rw-r--r-- 471 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/env ruby
# coding: utf-8

# Extract an (imperfect) array of paragraphs from a PDF. Paragraph breaks are
# guessed from line length: a line of 55 characters or fewer is assumed to end
# the current paragraph.

require 'pdf/reader'

# Build `paragraphs`, an array of strings, from the text of every page in the
# PDF. The split is a heuristic: a line longer than `short_line_limit`
# characters is assumed to continue the current paragraph, and any shorter
# line is assumed to be the paragraph's last line.
short_line_limit = 55

reader = PDF::Reader.new('somefile.pdf')

# Text of the paragraph currently being assembled. It is not reset between
# pages, so a paragraph that runs across a page break stays in one piece.
paragraph = ""
paragraphs = []
reader.pages.each do |page|
  # /^.+/ yields every non-empty line ("." does not match a newline), so
  # blank lines in the page text are skipped.
  page.text.scan(/^.+/).each do |line|
    # Each line is joined with a leading space, so every paragraph string
    # starts with one space.
    paragraph += " #{line}"
    next if line.length > short_line_limit

    # A short line closes the paragraph.
    paragraphs << paragraph
    paragraph = ""
  end
end

# If the document ends on a long line, no short line ever closes the final
# paragraph; flush it here so its text is not silently dropped.
paragraphs << paragraph unless paragraph.empty?