File: Gaussian.pm

package info (click to toggle)
libalgorithm-naivebayes-perl 0.04-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 108 kB
  • sloc: perl: 253; makefile: 2
file content (60 lines) | stat: -rw-r--r-- 1,720 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
package Algorithm::NaiveBayes::Model::Gaussian;

use strict;
use base qw(Algorithm::NaiveBayes);
use Algorithm::NaiveBayes::Util qw(sum variance rescale);
use constant Pi => 4*atan2(1, 1);

sub do_add_instance {
  my ($self, $attributes, $labels, $training_data) = @_;
  
  foreach my $label ( @$labels ) {
    my $mylabel = $training_data->{labels}{$label} ||= {};
    $mylabel->{count}++;
    while (my ($attr, $value) = each %$attributes) {
      push @{$mylabel->{attrs}{$attr}}, $value;
    }
  }
}

sub do_train {
  my ($self, $training_data) = @_;
  my $m = {};
  
  my $instances = $self->instances;
  my $labels = $training_data->{labels};
  
  while (my ($label, $data) = each %$labels) {
    $m->{prior_probs}{$label} = log($labels->{$label}{count} / $instances);
    
    # Calculate the mean & stddev for each label-attribute combination
    while (my ($attr, $values) = each %{$data->{attrs}}) {
      my $mean = sum($values) / @$values;
      my $var  = variance($values, $mean)
	or next;  # Can't use variance of zero
      @{ $m->{summary}{$attr}{$label} }{'mean', 'var'} = ($mean, $var);
    }
  }
  return $m;
}

sub do_predict {
  my ($self, $m, $newattrs) = @_;
  
  my %scores = %{$m->{prior_probs}};
  while (my ($feature, $value) = each %$newattrs) {
    next unless exists $m->{summary}{$feature};  # Ignore totally unseen features
    while (my ($label, $data) = each %{$m->{summary}{$feature}}) {
      my ($mean, $var) = @{$data}{'mean', 'var'};
      # This is simplified from
      #   +=  log( 1/sqrt($var*2*Pi) * exp(-($value-$mean)**2/(2*$var)) );
      $scores{$label} -= 0.5*(log($var) + log(2*Pi) + ($value-$mean)**2/$var);
    }
  }
  
  rescale(\%scores);

  return \%scores;
}

1;