File: model-statistics

package info (click to toggle)
spamassassin 3.1.7-2
  • links: PTS
  • area: main
  • in suites: etch-m68k
  • size: 5,376 kB
  • ctags: 2,123
  • sloc: perl: 39,706; ansic: 3,133; sh: 1,344; sql: 170; makefile: 168
file content (63 lines) | stat: -rwxr-xr-x 1,523 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/perl

# This script is used to print some statistics about classification accuracy
# with a k-fold cross validation

use strict;

my $lambda = 50;  # desired lambda for TCR calculation

if ( scalar(@ARGV) < 1 ) {
	print STDERR "Usage: model-statistics [validate]\n";
	exit 1;
}

# One entry per input line: false positive rate, false negative rate and
# TCR computed from that line's four counts.
my (@fp1, @fn1, @tcr1);

# Three-argument open with a lexical handle, so a file name beginning with
# characters such as '>' or '|' is never interpreted as an open mode.
open (my $fh, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
while (my $line = <$fh>) {
	# split ' ' discards leading whitespace; split /\s+/ would instead
	# produce an empty first field and shift every column by one.
	my @x = split(' ', $line);

	# Blank lines carry no data; skip them rather than dividing by zero.
	next unless @x;

	# A short line would be read as zeros and silently skew the results.
	if ( scalar(@x) < 4 ) {
		die "$ARGV[0] line $.: expected 4 fields, got " . scalar(@x) . "\n";
	}

	# NOTE(review): presumably $x[0]/$x[1] are the correctly classified
	# ham/spam counts and $x[2]/$x[3] the false positive/negative counts;
	# confirm against whatever produces the validate file.
	push (@fp1, $x[2] / ($x[0] + $x[2]));
	push (@fn1, $x[3] / ($x[1] + $x[3]));
	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
}
close ($fh);

stat_analysis ("False positives", "pct", \@fp1);
stat_analysis ("False negatives", "pct", \@fn1);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);

# Print the mean and sample standard deviation of a set of values.
#
# Arguments:
#   $title - label printed at the start of the output line
#   $pct   - "pct" prints the values multiplied by 100 (percentage points);
#            any other value prints them unscaled
#   $s1    - reference to the array of samples
#
# Writes one line to the currently selected output handle. When the sample
# set is empty nothing can be computed, so a note is written to STDERR and
# no statistics line is printed.
sub stat_analysis {
	my ($title, $pct, $s1) = @_;

	# Number of samples in the set.
	my $n = scalar(@{$s1});
	if ( $n == 0 ) {
		print STDERR "$title: no samples\n";
		return;
	}

	# mean = 1/n * sum(s[i])
	# Iterate over the elements themselves: Perl arrays are 0-based, so
	# indexing with 1..n skips the first sample and reads one past the end.
	my $mean_s1 = 0;
	foreach my $sample (@{$s1}) {
		$mean_s1 += $sample;
	}
	$mean_s1 /= $n;

	# var = 1/(n-1) * sum((mean - s[i])^2)
	# With a single sample the n-1 divisor is zero and the sample variance
	# is undefined; it is reported as 0 instead of dying on the division.
	my $var_s1 = 0;
	if ( $n > 1 ) {
		foreach my $sample (@{$s1}) {
			$var_s1 += ($mean_s1 - $sample)**2;
		}
		$var_s1 /= $n - 1;
	}

	# std = sqrt(var)
	my $std_s1 = sqrt($var_s1);

	# SA developers like percentage points instead of probabilities.
	if ( $pct eq "pct" ) {
		printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
	} else {
		printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
	}
}