File: bayes.rb

package info (click to toggle)
ruby-classifier 1.3.4-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 224 kB
  • sloc: ruby: 739; makefile: 5
file content (135 lines) | stat: -rw-r--r-- 4,521 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Author::    Lucas Carlson  (mailto:lucas@rufy.com)
# Copyright:: Copyright (c) 2005 Lucas Carlson
# License::   LGPL

module Classifier
  # A naive Bayes text classifier.
  #
  # Relies on two helpers that are NOT defined in this file and must be
  # provided elsewhere in the project:
  # * +prepare_category_name+ -- called on every category argument to obtain
  #   the key under which the category is stored.
  # * +word_hash+ -- called on every text argument; iterated here as
  #   word => occurrence-count pairs.
  #
  # State is kept in three instance variables:
  # * <tt>@categories</tt>      -- category key => { word => trained count }
  # * <tt>@category_counts</tt> -- category key => number of trainings
  # * <tt>@total_words</tt>     -- running total of all trained word counts
  class Bayes
    # Shape of the dynamic +train_<category>+ / +untrain_<category>+ method
    # names served by #method_missing. Group 1 is the optional "un" prefix,
    # group 2 the category part of the name.
    TRAINER_NAME = /(un)?train_([\w]+)/

    # The class can be created with one or more categories, each of which will
    # be initialized with an empty word table. E.g.,
    #      b = Classifier::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
    def initialize(*categories)
      @categories = {}
      categories.each { |category| @categories[category.prepare_category_name] = {} }
      @total_words = 0
      @category_counts = Hash.new(0)
    end

    #
    # Provides a general training method for all categories specified in
    # Bayes#new. For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.train "that", "That text"
    #     b.train "The other", "The other text"
    #
    # Raises StandardError ("No such category: ...") when +category+ has not
    # been registered. The check happens before any counter is touched, so a
    # failed call leaves the classifier unchanged.
    #
    # Returns the word hash computed from +text+.
    def train(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      word_counts = text.word_hash
      @category_counts[category] += 1
      word_counts.each do |word, count|
        words[word] = (words[word] || 0) + count
        @total_words += count
      end
    end

    #
    # Provides an untraining method for all categories specified in Bayes#new.
    # Be very careful with this method.
    #
    # For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train :this, "This text"
    #     b.untrain :this, "This text"
    #
    # Words that were never trained in the category are ignored, and neither a
    # word count nor the category's training count is allowed to drop below
    # zero (a negative training count would make #classifications take the
    # logarithm of a negative number).
    #
    # Raises StandardError ("No such category: ...") when +category+ has not
    # been registered.
    #
    # Returns the word hash computed from +text+.
    def untrain(category, text)
      category = category.prepare_category_name
      words = words_for(category)
      word_counts = text.word_hash
      @category_counts[category] -= 1 if @category_counts[category] > 0
      word_counts.each do |word, count|
        # Defensive guard kept from the original implementation.
        next if @total_words < 0

        trained = words[word] || 0
        if trained > count
          words[word] = trained - count
          removed = count
        else
          # The word is exhausted (or was never trained): drop it and only
          # subtract what had actually been counted for it.
          words.delete(word)
          removed = trained
        end
        @total_words -= removed
      end
    end

    #
    # Returns the scores in each category for the provided +text+. E.g.,
    #    b.classifications "I hate bad words and you"
    #    =>  {"Uninteresting"=>-12.6997928013932, "Interesting"=>-18.4206807439524}
    # The largest of these scores (the one closest to 0) is the one picked out
    # by #classify.
    #
    # Each score is a sum of natural logarithms: one term per distinct word of
    # +text+ (trained count of the word in the category, or 0.1 when the word
    # is unknown there, divided by the category's total word count) plus one
    # term for the category's share of all trainings (0.1 is used when the
    # category has no training entry). A category with no trained words
    # divides by zero, which yields infinite scores.
    def classifications(text)
      # The word hash does not depend on the category, so compute it once.
      word_counts = text.word_hash
      training_count = @category_counts.values.inject(0) { |sum, n| sum + n }.to_f
      scores = {}
      @categories.each do |category, category_words|
        total = category_words.values.inject(0) { |sum, n| sum + n }.to_f
        score = 0
        word_counts.each do |word, _count|
          s = category_words.has_key?(word) ? category_words[word] : 0.1
          score += Math.log(s / total)
        end
        # now add prior probability for the category
        s = @category_counts.has_key?(category) ? @category_counts[category] : 0.1
        scores[category.to_s] = score + Math.log(s / training_count)
      end
      scores
    end

    #
    # Returns the classification of the provided +text+, which is one of the
    # categories given in the initializer. E.g.,
    #    b.classify "I hate bad words and you"
    #    =>  'Uninteresting'
    #
    # When several categories share the best score, the one registered first
    # wins. Returns nil when the classifier has no categories.
    def classify(text)
      best = classifications(text).max_by { |_category, score| score }
      best && best.first
    end

    #
    # Provides training and untraining methods for the categories specified in
    # Bayes#new. For example:
    #     b = Classifier::Bayes.new 'This', 'That', 'the_other'
    #     b.train_this "This text"
    #     b.train_that "That text"
    #     b.untrain_that "That text"
    #     b.train_the_other "The other text"
    #
    # Every argument is trained (or untrained) as a separate text; the
    # argument list is returned.
    #
    # Raises StandardError ("No such category: ...") for a +train_*+ /
    # +untrain_*+ name whose category is unknown; any other unknown name is
    # forwarded to +super+.
    def method_missing(name, *args)
      trainer = TRAINER_NAME.match(name.to_s)
      category = category_for_method(name)
      if @categories.has_key?(category)
        # NOTE: as in the original implementation, a method name that is
        # itself a registered category (without the train_ prefix) also
        # trains that category.
        action = (trainer && trainer[1]) ? :untrain : :train
        args.each { |text| send(action, category, text) }
      elsif trainer
        raise StandardError, "No such category: #{category}"
      else
        super
      end
    end

    #
    # Reports the documented dynamic +train_<category>+ and
    # +untrain_<category>+ methods of registered categories to +respond_to?+.
    def respond_to_missing?(name, include_private = false)
      # @categories is nil on an allocated-but-uninitialised instance (some
      # deserialisers call respond_to? at that point), hence the guard.
      if @categories && TRAINER_NAME.match(name.to_s)
        return true if @categories.has_key?(category_for_method(name))
      end
      super
    end

    #
    # Provides a list of category names
    # For example:
    #     b.categories
    #     =>   ['This', 'That', 'the_other']
    def categories # :nodoc:
      @categories.keys.map { |category| category.to_s }
    end

    #
    # Allows you to add categories to the classifier.
    # For example:
    #     b.add_category "Not spam"
    #
    # Adding a category that already exists replaces its word table with an
    # empty one.
    #
    # WARNING: Adding categories to a trained classifier will
    # result in an undertrained category that will tend to match
    # more criteria than the trained selective categories. In short,
    # try to initialize your categories at initialization.
    def add_category(category)
      @categories[category.prepare_category_name] = {}
    end

    alias append_category add_category

    private

    # Returns the word table of an already-prepared +category+ key, raising
    # StandardError when the category is not registered.
    def words_for(category)
      @categories.fetch(category) { raise StandardError, "No such category: #{category}" }
    end

    # Derives the category key addressed by a dynamic method +name+ by
    # stripping the (un)train_ prefix and preparing what is left.
    def category_for_method(name)
      name.to_s.gsub(TRAINER_NAME, '\2').prepare_category_name
    end
  end
end