1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
|
# coding: utf-8
# a script for measuring text extraction performance
# TO BENCHMARK: ruby tools/bench.rb <runs>
# TO PROFILE: ruby tools/bench.rb perftools
# OR: ruby-prof tools/bench.rb <runs>
# FOR OBJECT ALLOCATION STATS: ruby tools/bench.rb memprof
# TO COUNT GC RUNS: ruby tools/bench.rb gc
# FOR PER-TYPE OBJECT COUNT CHANGES: ruby tools/bench.rb allocations
# Absolute path of the directory one level above this script's directory;
# used below to locate both the library and the sample PDF.
$project_root = File.expand_path(File.join(File.dirname(__FILE__), ".."))
require 'rubygems' # for Ruby 1.8
# Prepend the project's lib directory so it is searched first by `require`
$:.unshift "#{$project_root}/lib"
require 'pdf/reader'
# Workload under measurement: open the sample PDF and ask every page for its
# text. The extracted strings are thrown away; only the work matters.
def extract_text
  sample_pdf = "#{$project_root}/spec/data/no_text_spaces.pdf"
  PDF::Reader.open(sample_pdf) { |pdf| pdf.pages.each(&:text) }
end
# Dispatch on the first command-line argument: either a named profiling mode
# or, by default, the number of benchmark runs to average.
case ARGV[0]
when "memprof"
  # Measure object allocation with memprof
  require 'memprof'
  GC.disable # no collections while allocations are being tracked
  Memprof.track { extract_text }
when "perftools"
  # Profile with perftools.rb
  # (The best thing about perftools.rb is that it shows you time spent on
  # garbage collection)
  require 'perftools'
  require 'shellwords'
  PerfTools::CpuProfiler.start("/tmp/perftools_data") do
    extract_text
  end
  # Render the raw profile as text and PDF reports. The output paths are
  # shell-escaped so that a project directory containing spaces or shell
  # metacharacters does not break the redirection.
  report_base = "#{$project_root}/tools/profiles/perftools"
  `pprof.rb --text /tmp/perftools_data > #{Shellwords.escape("#{report_base}.txt")}`
  `pprof.rb --pdf /tmp/perftools_data > #{Shellwords.escape("#{report_base}.pdf")}`
when "gc"
  # Report how many garbage collections one extraction triggers
  before = GC.count
  extract_text
  puts "GC ran #{GC.count - before} times"
when "allocations"
  # Print, for each ObjectSpace.count_objects key, how much its count changed
  # across one extraction. GC is disabled so nothing is collected in between.
  GC.disable
  before = ObjectSpace.count_objects
  extract_text
  after = ObjectSpace.count_objects
  after.each do |key, val|
    # to_i turns a key missing from the "before" snapshot (nil) into 0
    puts "#{key}: #{val - before[key].to_i}"
  end
  GC.start
else
  # Benchmark
  # Average the results over multiple runs
  # Throw out the best and worst results, and average what remains
  # With 10 runs, the results seem to fluctuate by as much as 6-7%
  # I'd like that to be 1-2%, but that requires a VERY high number of runs
  runs = (ARGV[0] || 10).to_i
  # A non-numeric argument (e.g. a mistyped mode name) or a non-positive
  # count would leave nothing to average and print "NaN seconds"; fail
  # loudly with a usage message instead.
  if runs < 1
    abort "usage: ruby tools/bench.rb [<runs>|memprof|perftools|gc|allocations] " \
          "(<runs> must be a positive integer, got #{ARGV[0].inspect})"
  end

  times = []
  runs.times do
    # NOTE(review): wall-clock timing; a monotonic clock would be immune to
    # system clock adjustments but needs a newer Ruby than this file targets.
    start = Time.now
    extract_text
    times << (Time.now - start)
    sleep(0.1) # results seem more consistent this way
  end

  times.sort!
  # Drop the fastest runs/5 results, then keep the next runs - (runs * 2 / 3).
  # With 10 runs: the 2 fastest and the 4 slowest are discarded, 4 are averaged.
  # NOTE(review): the trim is asymmetric; confirm that is intended.
  times = times.drop(runs / 5).take(runs - (runs * 2 / 3))
  average = times.reduce(0, &:+).to_f / times.size
  puts "#{"%0.3f" % average} seconds"
end
|