File: corpus.rake

package info (click to toggle)
ruby-mail 2.6.4%2Bdfsg1-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 4,256 kB
  • ctags: 1,327
  • sloc: ruby: 44,678; makefile: 3
file content (125 lines) | stat: -rw-r--r-- 3,247 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
namespace :corpus do

  task :load_mail do
    # Pull in the spec environment (resolved relative to this file) and then
    # the mail library itself, so dependent tasks can call Mail.read.
    spec_environment = File.expand_path('../../spec/environment', __FILE__)
    require spec_environment
    require 'mail'
  end

  # Used to run parsing against an arbitrary corpus of email.
  # For example: http://plg.uwaterloo.ca/~gvcormac/treccorpus/
  #
  # Environment variables read by this task:
  #   LOCATION - root directory of the corpus (default: 'corpus/spam').
  #   SAVE_TO  - directory that receives a copy of every email that fails to
  #              parse (default: 'spec/fixtures/emails/failed_emails').
  #
  # Raises a RuntimeError when either directory does not exist.
  desc "Provide a LOCATION=/some/dir to verify parsing in bulk, otherwise defaults"
  task :verify_all => :load_mail do

    root_of_corpus    = File.expand_path(ENV['LOCATION'] || 'corpus/spam')
    @save_failures_to = ENV['SAVE_TO'] || 'spec/fixtures/emails/failed_emails'
    @failed_emails    = []
    @checked_count    = 0

    unless File.directory?(root_of_corpus)
      raise "\n\tPath '#{root_of_corpus}' is not a directory.\n\n"
    end

    unless File.directory?(@save_failures_to)
      raise "\n\tPath '#{@save_failures_to}' is not a directory.\n\n"
    end
    @save_failures_to = File.expand_path(@save_failures_to)
    puts "Mail which fails to parse will be saved in '#{@save_failures_to}'"

    puts "Checking '#{root_of_corpus}' directory (recursively)"

    # we're tracking all the errors separately, don't clutter terminal
    stderr_backup = $stderr.dup
    $stderr.reopen("/dev/null", "w")
    begin
      dir_node(root_of_corpus)
    ensure
      # put our toys back now that we're done with them; doing it in an
      # ensure keeps an unexpected exception from being swallowed by /dev/null.
      # Reopening the existing IO restores it in place, so the STDERR constant
      # never needs to be reassigned.
      $stderr.reopen(stderr_backup)
      stderr_backup.close
    end

    puts "\n\n"

    if @failed_emails.any?
      report_failures_to_stdout
      # Only suggest SAVE_TO when the caller did not already choose a directory.
      unless ENV['SAVE_TO']
        puts "Add SAVE_TO=/some/dir to save failed emails to for review.,"
        puts "May result in a lot of saved files. Do a dry run first!\n\n"
      end
    else
      puts "There are no errors"
    end
    puts "Out of Total: #{@checked_count}"
  end

  # Walks +path+ recursively: every regular file is handed to file_node and
  # every sub-directory is visited by a nested dir_node call. Entries that are
  # neither a file nor a directory are skipped.
  def dir_node(path)
    puts "\n\n"
    puts "Checking emails in '#{path}':"

    Dir.entries(path).each do |name|
      # Skip the self and parent links so the walk cannot loop on them.
      next if name == '.' || name == '..'

      child = File.join(path, name)
      if File.file?(child)
        file_node(child)
      elsif File.directory?(child)
        dir_node(child)
      end
    end
  end
  
  # Handles a single file found during the directory walk by passing its
  # path straight on to verify.
  def file_node(path)
    verify(path)
  end
  
  # Parses the email at +path+ and prints a one-character progress marker:
  # '.' when parse_as_mail reports success, 'x' when it reports a failure
  # (after the failure has been handed to save_failure).
  def verify(path)
    result, message = parse_as_mail(path)
    if result
      print '.'
    else
      save_failure(path, message)
      print 'x'
    end
    # Flush on both outcomes so failure markers show up as promptly as
    # success markers when stdout is buffered.
    $stdout.flush
  end

  # Records a failed email and, when @save_failures_to is set, copies the
  # email into that directory under a name built from the failure message
  # and the email's basename.
  #
  # path    - path of the email that failed to parse.
  # message - failure description; may be nil.
  def save_failure(path, message)
    @failed_emails << [path, message]
    return unless @save_failures_to

    email_basename = File.basename(path)

    sanitized = message.to_s
    # A message with invalid bytes would make gsub raise; treating it as raw
    # bytes keeps the regexp usable.
    sanitized = sanitized.dup.force_encoding('BINARY') unless sanitized.valid_encoding?
    # Cap the message part at 100 characters so a long message does not push
    # the file name past common file-system name limits.
    failure_as_filename = sanitized.gsub(/\W/, '_')[0, 100]

    new_email_name = [failure_as_filename, email_basename].join("_")
    destination = File.join(@save_failures_to, new_email_name)

    # Copy in binary mode so the saved email is byte-for-byte the original.
    File.open(destination, 'wb') do |fh|
      fh << File.open(path, 'rb') { |source| source.read }
    end
  end

  # Counts the attempt in @checked_count, then tries to read the file at
  # +path+ with Mail.read.
  #
  # Returns [true, nil] when reading succeeds, or [false, message] with the
  # message of the rescued error when it does not.
  def parse_as_mail(path)
    @checked_count += 1
    outcome = begin
      Mail.read(path)
      [true, nil]
    rescue => error
      [false, error.message]
    end
    outcome
  end
  
  # Prints one "path : message" line per entry in @failed_emails, followed
  # by the total number of failures.
  def report_failures_to_stdout
    @failed_emails.each do |failure|
      email_path, reason = failure[0], failure[1]
      puts "#{email_path} : #{reason}"
    end
    puts "Failed: #{@failed_emails.size}"
  end
  
end