File: model-statistics

package info (click to toggle)

spamassassin 3.1.7-2

links: PTS
area: main
in suites: etch-m68k
size: 5,376 kB
ctags: 2,123
sloc: perl: 39,706; ansic: 3,133; sh: 1,344; sql: 170; makefile: 168

file content (63 lines) | stat: -rwxr-xr-x 1,523 bytes

parent folder | download | duplicates (8)

#!/usr/bin/perl

# This script is used to print some statistics about classification accuracy
# with a k-fold cross validation

use strict;

my $lambda = 50;  # desired lambda for TCR calculation

if ( scalar(@ARGV) < 1 ) {
	print STDERR "Usage: model-statistics [validate]\n";
	exit 1;
}

# Per-line metrics; one entry is appended to each list for every data line
# read from the input file.
my (@fp1, @fn1, @tcr1);

# Three-argument open with a lexical handle, so that a file name beginning
# with a mode character ('>', '|', ...) cannot be interpreted as a mode.
open (my $fh, '<', $ARGV[0]) or die "Cannot open $ARGV[0]: $!\n";
while (my $line = <$fh>) {
	# split ' ' ignores leading whitespace, so an indented line does not
	# shift every column by one the way split(/\s+/) would.
	my @x = split(' ', $line);
	next unless @x;  # skip blank lines

	if ( scalar(@x) < 4 ) {
		die "$ARGV[0] line $.: expected at least 4 columns, got "
			. scalar(@x) . "\n";
	}

	# NOTE(review): from the formulas below the columns look like
	#   0: correct ham, 1: correct spam, 2: false positives, 3: false negatives
	# -- confirm against whatever produces the validation file.
	my $fp_den  = $x[0] + $x[2];
	my $fn_den  = $x[1] + $x[3];
	my $tcr_den = $x[3] + $lambda * $x[2];

	# Report which line is degenerate instead of dying with a bare
	# "Illegal division by zero".
	if ( $fp_den == 0 || $fn_den == 0 || $tcr_den == 0 ) {
		die "$ARGV[0] line $.: zero denominator, cannot compute rates\n";
	}

	push (@fp1, $x[2] / $fp_den);
	push (@fn1, $x[3] / $fn_den);
	push (@tcr1, $x[1] / $tcr_den);
}
close ($fh);

stat_analysis ("False positives", "pct", \@fp1);
stat_analysis ("False negatives", "pct", \@fn1);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);

# Print the mean and sample standard deviation of one set of samples.
#
# Arguments (positional):
#   $title - label printed at the start of the output line
#   $pct   - "pct" prints both figures multiplied by 100 (the mean followed
#            by a '%' sign); any other value prints them unscaled
#   $s1    - reference to an array of numeric samples
#
# Writes one line with printf. If the sample set is empty, a note is written
# to STDERR and nothing else is printed. Returns nothing.
sub stat_analysis {
	my $title = shift;
	my $pct = shift;
	my $s1 = shift;

	# Number of samples in the set.
	my $n = scalar(@{$s1});
	if ( $n == 0 ) {
		# Neither the mean nor the deviation is defined without data.
		print STDERR "$title: no samples\n";
		return;
	}

	# mean = 1/n * sum(s[i])
	# Iterate over the values themselves: Perl arrays are 0-based, so looping
	# over indices 1..n would skip the first sample and read past the end.
	my $mean_s1 = 0;
	foreach my $sample (@{$s1}) {
		$mean_s1 += $sample;
	}
	$mean_s1 /= $n;

	# var = 1/(n-1) * sum((mean - s[i])^2)
	my $var_s1 = 0;
	foreach my $sample (@{$s1}) {
		$var_s1 += ($mean_s1 - $sample)**2;
	}
	# With a single sample the n-1 divisor is zero; the sum of squares is
	# zero as well, so report a spread of 0 rather than dividing by zero.
	$var_s1 /= $n - 1 if $n > 1;

	# std = sqrt(var)
	my $std_s1 = sqrt($var_s1);

	# SA developers like percentage points instead of probabilities.
	if ( $pct eq "pct" ) {
		printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
	} else {
		printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
	}
	return;
}