File: corpus.rake

package info (click to toggle)
ruby-mail 2.7.1%2Bdfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 5,436 kB
  • sloc: ruby: 71,596; makefile: 3
file content (102 lines) | stat: -rw-r--r-- 2,639 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
require 'benchmark'

namespace :corpus do

  # Prerequisite task: loads the spec environment file (located relative to
  # this rake file) and then the 'mail' library itself.
  task :load_mail do
    environment_file = File.expand_path('../../spec/environment', __FILE__)
    require environment_file
    require 'mail'
  end

  # Used to run parsing against an arbitrary corpus of email.
  # For example: http://plg.uwaterloo.ca/~gvcormac/treccorpus/
  #
  # Environment variables:
  #   LOCATION - root directory of the corpus, walked recursively
  #              (defaults to 'corpus/spam').
  #   SAVE_TO  - directory that receives copies of emails which fail to parse
  #              (defaults to 'corpus/failed_emails').
  #
  # Raises if the resolved corpus path is not a directory.
  desc "Provide a LOCATION=/some/dir to verify parsing in bulk, otherwise defaults"
  task :verify_all => :load_mail do

    # LOCATION always falls back to a default, so the path is never nil and
    # only needs to be checked for being a directory.
    root_of_corpus    = File.expand_path(ENV['LOCATION'] || 'corpus/spam')
    @save_failures_to = ENV['SAVE_TO'] || 'corpus/failed_emails'
    @failed_emails    = []
    @checked_count    = 0

    unless File.directory?(root_of_corpus)
      raise "\n\tPath '#{root_of_corpus}' is not a directory.\n\n"
    end

    puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"
    puts "Checking '#{root_of_corpus}' directory (recursively)"

    # Benchmark.realtime reports seconds; converted to milliseconds below.
    elapsed = Benchmark.realtime { dir_node(root_of_corpus) }

    puts "\n\n"

    report_failures_to_stdout if @failed_emails.any?
    puts "Out of Total: #{@checked_count}"
    puts 'Elapsed: %.2f ms' % (elapsed * 1000.0)
  end

  # Walks the directory at +path+: prints a banner for it, hands every regular
  # file to +file_node+ and recurses into every sub-directory. Entries that
  # are neither a file nor a directory are ignored.
  def dir_node(path)
    puts "\n\n"
    puts "Checking emails in '#{path}':"

    Dir.entries(path).each do |name|
      # Skip the self and parent pseudo-entries so the walk never loops back.
      next if name == '.' || name == '..'

      child = File.join(path, name)
      next file_node(child) if File.file?(child)

      dir_node(child) if File.directory?(child)
    end
  end

  # Handles a single regular file found during the directory walk by passing
  # it straight to +verify+; returns whatever +verify+ returns.
  def file_node(path)
    verify(path)
  end

  # Attempts to parse the file at +path+ and prints a one-character progress
  # marker: '.' when parsing succeeded, 'x' when it failed. A failure is
  # passed to +save_failure+ before its marker is printed.
  def verify(path)
    parsed_ok, error = parse_as_mail(path)
    save_failure(path, error) unless parsed_ok
    print(parsed_ok ? '.' : 'x')
  end

  # Records a parse failure and, when a failure directory is configured in
  # @save_failures_to, stores a copy of the offending email there.
  #
  # The copy is named "<sanitised exception message>_<original basename>",
  # where every non-word character of the message is replaced by '_'.
  #
  # path      - path of the email that failed to parse.
  # exception - the error raised while parsing it.
  def save_failure(path, exception)
    @failed_emails << [path, exception]
    return unless @save_failures_to

    email_basename = File.basename(path)
    failure_as_filename = exception.message.gsub(/\W/, '_')

    # A single file name is commonly limited to 255 bytes. Exception messages
    # can be far longer than that, so trim the message part to whatever room
    # is left next to the basename instead of failing the whole run with
    # Errno::ENAMETOOLONG.
    max_name_bytes = 255
    room_for_failure = max_name_bytes - email_basename.bytesize - 1
    new_email_name =
      if room_for_failure > 0
        [failure_as_filename[0, room_for_failure], email_basename].join("_")
      else
        email_basename
      end

    FileUtils.mkdir_p(@save_failures_to)

    # Copy in binary mode: corpus emails may contain arbitrary bytes, which
    # must not go through newline or encoding translation.
    raw_email = File.open(path, 'rb') { |source| source.read }
    File.open(File.join(@save_failures_to, new_email_name), 'wb') do |fh|
      fh << raw_email
    end
  end

  # Counts the attempt in @checked_count and tries to read the file at +path+
  # with Mail.read.
  #
  # Returns a two-element array: [true, nil] when reading raised nothing, or
  # [false, error] carrying the StandardError that was rescued.
  def parse_as_mail(path)
    begin
      @checked_count += 1
      Mail.read(path)
      outcome = [true, nil]
    rescue StandardError => error
      outcome = [false, error]
    end
    outcome
  end

  # Prints one entry per recorded failure (path, exception message and the
  # tab-indented backtrace), followed by the total number of failures.
  def report_failures_to_stdout
    @failed_emails.each do |failure|
      path, exception = failure
      summary = "#{path}: #{exception.message}"
      trace = exception.backtrace.join("\n\t")
      puts "#{summary}\n\t#{trace}"
    end
    puts "Failed: #{@failed_emails.size}"
  end
end