package Algorithm::NaiveBayes::Model::Frequency;
use strict;
use warnings;

use Algorithm::NaiveBayes::Util qw(sum_hash add_hash max rescale);

use base qw(Algorithm::NaiveBayes);
sub new {
    my $class = shift;
    # Delegate construction to the parent class, then initialize the
    # per-model training accumulators.
    my $self = $class->SUPER::new(@_);

    my $data = $self->training_data;
    $data->{attributes} = {};   # global attribute (token) counts
    $data->{labels}     = {};   # per-label counts and attribute histograms

    return $self;
}
sub do_add_instance {
    my ($self, $attributes, $labels, $training_data) = @_;

    # Fold this instance's attribute counts into the corpus-wide totals.
    add_hash($training_data->{attributes}, $attributes);

    my $label_info = $training_data->{labels};
    for my $label (@$labels) {
        my $entry = $label_info->{$label} ||= {};
        $entry->{count}++;
        # Accumulate this instance's attributes under the label as well.
        add_hash($entry->{attributes} ||= {}, $attributes);
    }
}
# Turn the accumulated training counts into a trained model: log prior
# probabilities per label, Laplace-smoothed log conditional probabilities
# per (label, attribute), and a per-label smoothing term for attributes
# never seen with that label.
#
# Returns the model hashref consumed by do_predict().
sub do_train {
    my ($self, $training_data) = @_;
    my $m = {};

    my $instances = $self->instances;
    my $labels    = $training_data->{labels};
    $m->{attributes} = $training_data->{attributes};
    my $vocab_size = keys %{ $m->{attributes} };  # distinct attributes seen anywhere

    # Calculate the log-probabilities for each category
    foreach my $label ($self->labels) {
        $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);

        # Count the number of tokens in this category.
        my $label_tokens = sum_hash($labels->{$label}{attributes});

        # Laplace smoothing so P(attr|label)==0 is avoided:
        #   P(attr|label)     = ($count + 1) / ($label_tokens + $vocab_size)
        #   log P(attr|label) = log($count + 1) - log($label_tokens + $vocab_size)
        # The denominator is shared by every attribute of this label AND by
        # the smoothing term (count == 0 gives log(1) == 0, leaving just
        # -$denominator), so compute it exactly once.
        my $denominator = log($label_tokens + $vocab_size);
        $m->{smoother}{$label} = -$denominator;

        while (my ($attribute, $count) = each %{ $labels->{$label}{attributes} }) {
            $m->{probs}{$label}{$attribute} = log($count + 1) - $denominator;
        }
    }
    return $m;
}
# Score a new instance (hashref of attribute => weight) against the trained
# model and return a hashref of label => rescaled score.
sub do_predict {
    my ($self, $m, $newattrs) = @_;

    # Start each label at its log prior.  Because we work in log space,
    # probabilities are combined by adding instead of multiplying.
    my %scores = %{ $m->{prior_probs} };

    while (my ($feature, $value) = each %$newattrs) {
        next unless exists $m->{attributes}{$feature};  # ignore totally unseen features

        while (my ($label, $attributes) = each %{ $m->{probs} }) {
            # log P($feature|$label) ** $value, i.e. $value * log P(...).
            # Use exists() rather than `||`: a stored log-probability can
            # legitimately be exactly 0 (P(attr|label) == 1), and `||` would
            # wrongly replace it with the smoothing term.
            my $logprob = exists $attributes->{$feature}
                ? $attributes->{$feature}
                : $m->{smoother}{$label};
            $scores{$label} += $logprob * $value;
        }
    }

    rescale(\%scores);
    return \%scores;
}
1;