File: compare-models

package info (click to toggle)
spamassassin 4.0.2-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,724 kB
  • sloc: perl: 89,143; ansic: 5,193; sh: 3,737; javascript: 339; sql: 295; makefile: 209; python: 49
file content (165 lines) | stat: -rwxr-xr-x 4,665 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/perl

# This script is used to do a statistical comparative analysis of two sets
# of per-fold results (a paired-sample t-test on the false-positive rate,
# false-negative rate, and TCR read from two "validate" files).

use Statistics::Distributions;
use strict;
use warnings;

my $alpha = 0.95; # acceptable confidence in not making a type 1 error
		  # i.e. assume that the means are different
my $lambda = 50;  # desired lambda for TCR calculation

# Two "validate" result files (or the directories containing them) are
# required on the command line.
if ( scalar(@ARGV) < 2 ) {
	print STDERR "Usage: compare-models [validate1] [validate2]\n";
	exit 1;
}

# Per-line false-positive rate, false-negative rate, and TCR for sample 1.
my (@fp1, @fn1, @tcr1);

# allow dir names to be specified, too, for brevity
if (-f "$ARGV[0]/validate") { $ARGV[0] .= "/validate"; }
if (-f "$ARGV[1]/validate") { $ARGV[1] .= "/validate"; }

# Each line is expected to carry four whitespace-separated counts; from the
# ratios below they look like (ham-ok, spam-ok, false-pos, false-neg) —
# TODO confirm against the tool that writes the "validate" files.
# Lexical filehandle + 3-arg open replaces the old bareword 2-arg open.
open (my $fh1, '<', $ARGV[0]) || die "open $ARGV[0]: $!";
while (<$fh1>) {
	my @x = split(/\s+/);
	push (@fp1, $x[2] / ($x[0] + $x[2]));
	push (@fn1, $x[3] / ($x[1] + $x[3]));
	push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
}
close ($fh1);

# Per-line false-positive rate, false-negative rate, and TCR for sample 2,
# computed exactly as for sample 1.
my (@fp2, @fn2, @tcr2);

# Lexical filehandle + 3-arg open replaces the old bareword 2-arg open.
open (my $fh2, '<', $ARGV[1]) || die "open $ARGV[1]: $!";
while (<$fh2>) {
	my @x = split(/\s+/);
	push (@fp2, $x[2] / ($x[0] + $x[2]));
	push (@fn2, $x[3] / ($x[1] + $x[3]));
	push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
}
close ($fh2);

# Run the paired comparison for each metric in turn:
# (title, display mode, sample-1 values, sample-2 values).
foreach my $metric (
	[ "False positives",      "pct", \@fp1,  \@fp2  ],
	[ "False negatives",      "pct", \@fn1,  \@fn2  ],
	[ "TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2 ],
) {
	stat_analysis (@$metric);
}

# Perform a paired-sample t-test on two paired sets of measurements and
# print a human-readable summary: per-sample mean/std, whether the means
# differ significantly at the configured $alpha, and an estimated
# confidence interval around the mean difference.
#
# Arguments:
#   $title - heading printed before the results
#   $pct   - "pct" to display values as percentage points, anything else linear
#   $s1    - array ref, sample 1 (one value per line of the input file)
#   $s2    - array ref, sample 2, paired element-for-element with $s1
sub stat_analysis {
	my $title = shift;
	my $pct = shift;
	my $s1 = shift;
	my $s2 = shift;

	# The test requires paired samples of equal size.
	# (Bug fix: this previously compared @$s1 against itself, so the
	# check could never fail.)
	unless ( scalar(@$s1) == scalar(@$s2) ) {
		print STDERR "Can't compute stats for $title.  Samples are not paired.\n";
		return;
	}

	# Number of paired observations; at least 2 are needed for the
	# unbiased variance estimate (avoids a division by zero below).
	my $n = scalar(@$s1);
	if ( $n < 2 ) {
		print STDERR "Can't compute stats for $title.  Need at least two samples.\n";
		return;
	}

	# Degrees of freedom for a paired t-test is n-1.
	# (Bug fix: this previously used n, and the accumulation loops ran
	# over indices 1..n, skipping element 0 and reading one element past
	# the end of each sample.)
	my $dof = $n - 1;

	print "$title:\n";

	# Mean and standard deviation of each sample.
	my ($mean_s1, $std_s1) = _mean_and_std($s1);
	my ($mean_s2, $std_s2) = _mean_and_std($s2);

	# SA developers like percentage points instead of probabilities.
	if ( $pct eq "pct" ) {
		printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
		printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
	} else {
		printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
		printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
	}

	# Per-pair differences d[i] = s1[i] - s2[i]; their mean and standard
	# deviation drive the paired t statistic.
	my @diffs = map { $s1->[$_] - $s2->[$_] } 0 .. $n - 1;
	my ($mean_d, $std_d) = _mean_and_std(\@diffs);
	my $var_d = $std_d ** 2;

	# To determine whether two samples are from the same distribution
	# (i.e. they have the same mean), we are going to use a paired sample
	# t-test.  You can find more information about this in Tom Mitchell's
	# "Machine Learning" book.

	# t = mean_d / (std_d / sqrt(n)); zero when every pair is identical.
	my $tstat;
	if ( $var_d > 0 ) {
		$tstat = $mean_d / sqrt($var_d / $n);
	} else {
		$tstat = 0;
	}

	# Critical value of t for a two-sided test at confidence $alpha:
	# each tail gets (1-alpha)/2 of the probability mass.
	my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);

	# Two-sided p-value is 2 * P(T > |t|); the confidence that the means
	# differ is its complement.
	# (Bug fix: this previously computed 1 - P(T > |t|)/2.)
	my $tprob = 1 - 2*Statistics::Distributions::tprob ($dof, abs($tstat));

	# If the t statistic is less than the critical value we accept the
	# null hypothesis (the distributions have the same mean); otherwise
	# we reject it.
	if ( abs($tstat) < $tcrit ) {
		printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
	} else {
		printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
	}

	# This displays an estimate of the confidence interval around the
	# estimated mean difference between the two samples.  Bear in mind that
	# the t statistic is working on the local differences and not the
	# global difference.
	if ( $pct eq "pct" ) {
		printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
	} else {
		printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
	}

	print "\n";
}

# Return (mean, standard deviation) of the values in an array ref, using
# the unbiased (n-1) variance estimator.  Requires at least 2 elements.
sub _mean_and_std {
	my $aref = shift;
	my $n = scalar(@$aref);

	# mean = 1/n * sum(s[i])
	my $mean = 0;
	$mean += $_ foreach @$aref;
	$mean /= $n;

	# var = 1/(n-1) * sum((mean - s[i])^2)
	my $var = 0;
	$var += ($mean - $_) ** 2 foreach @$aref;
	$var /= $n - 1;

	return ($mean, sqrt($var));
}