1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License:: LGPL
module Classifier
  # A naive Bayes text classifier.
  #
  # Category names are normalised with +prepare_category_name+ and texts are
  # tokenised with +word_hash+; both helpers are defined outside this file.
  # +word_hash+ is expected to return a Hash mapping each word to the number
  # of times it occurs in the text.
  class Bayes
    # The class can be created with one or more categories, each of which will be
    # initialized and given a training method. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}                  # category => { word => occurrences }
      @total_words = 0                  # occurrences trained across all categories
      @category_counts = Hash.new(0)    # category => number of texts trained
      categories.each { |category| add_category(category) }
    end

    #
    # Provides a general training method for all categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    #
    # Raises StandardError ("No such category: ...") when +category+ has not
    # been added. The check happens before any counter is touched, so a failed
    # call leaves the classifier unchanged.
    def train(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      word_counts = text.word_hash

      @category_counts[category] += 1
      word_counts.each do |word, count|
        words[word] = (words[word] || 0) + count
        @total_words += count
      end
    end

    #
    # Provides a untraining method for all categories specified in Bayes#new
    # Be very careful with this method.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    #
    # Words the category never saw are ignored, and no counter is allowed to
    # drop below zero. Raises StandardError ("No such category: ...") when
    # +category+ has not been added.
    def untrain(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      word_counts = text.word_hash

      @category_counts[category] -= 1
      # A category with no remaining trainings is treated exactly like one that
      # was never trained; a zero or negative count would poison Math.log in
      # #classifications.
      @category_counts.delete(category) if @category_counts[category] <= 0

      word_counts.each do |word, count|
        next if @total_words < 0

        present = words[word] || 0
        if present > count
          words[word] = present - count
          removed = count
        else
          # Only as many occurrences as were actually recorded can be removed.
          words.delete(word)
          removed = present
        end
        @total_words -= removed
      end
    end

    #
    # Returns the scores in each category the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out by #classify
    #
    # Each distinct word of +text+ contributes once per category; a word the
    # category has not seen is given a pseudo-count of 0.1.
    def classifications(text)
      word_counts = text.word_hash # same for every category, so compute it once
      training_count = @category_counts.values.inject(0) { |sum, n| sum + n }.to_f
      # With nothing trained the prior would divide by zero.
      training_count = 1.0 if training_count.zero?

      scores = {}
      @categories.each do |category, category_words|
        total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
        # An empty category would otherwise score +Infinity for every word and
        # win every classification.
        total = 1.0 if total.zero?

        score = 0
        word_counts.each_key do |word|
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score += Math.log(s / total)
        end
        # now add prior probability for the category
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        score += Math.log(s / training_count)
        scores[category.to_s] = score
      end
      scores
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    # Returns nil when the classifier has no categories.
    def classify(text)
      best = classifications(text).sort_by { |pair| -pair[1] }.first
      best && best[0]
    end

    #
    # Provides training and untraining methods for the categories specified in Bayes#new
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Every argument is trained (or untrained) as a separate text. A method
    # name that is itself a known category name is treated as a train call.
    # Raises StandardError for a train_/untrain_ name whose category does not
    # exist; any other unknown name falls through to the default behaviour.
    def method_missing(name, *args)
      category, match = parse_trainer_name(name)
      if @categories.has_key?(category)
        action = match && match[1] ? :untrain : :train
        args.each { |text| send(action, category, text) }
      elsif match
        raise StandardError, "No such category: #{category}"
      else
        super
      end
    end

    # Reports the dynamic methods handled by #method_missing to respond_to?.
    def respond_to_missing?(name, include_private = false)
      # The category table may not exist yet on an object that was allocated
      # without running #initialize.
      return super unless @categories

      category, = parse_trainer_name(name)
      @categories.has_key?(category) || super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |category| category.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # Adding a category that already exists keeps its training data.
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] ||= {}
    end
    alias append_category add_category

    private

    # Returns the word table of an already-normalised +category+, or raises
    # StandardError when the category is unknown.
    def words_for(category)
      @categories[category] || raise(StandardError, "No such category: #{category}")
    end

    # Splits a dynamic method name into [category, match]: the category is the
    # name with any "(un)train_" prefix stripped and normalised, and match is
    # the MatchData for that prefix (nil when the name has none).
    def parse_trainer_name(name)
      match = nil
      stripped = name.to_s.gsub(/(un)?train_([\w]+)/) do
        match = Regexp.last_match
        match[2]
      end
      [stripped.prepare_category_name, match]
    end
  end
end
|