1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
#!/usr/bin/perl
# This script is used to do a statistical comparative analysis of
use Statistics::Distributions;
use strict;
my $alpha = 0.95; # acceptable confidence in not making a type 1 error
# i.e. assume that the means are different
my $lambda = 50; # desired lambda for TCR calculation
if ( scalar(@ARGV) < 2 ) {
print STDERR "Usage: compare-models [validate1] [validate2]\n";
exit 1;
}
my (@fp1, @fn1, @tcr1);
# allow dir names to be specified, too, for brevity
if (-f "$ARGV[0]/validate") { $ARGV[0] .= "/validate"; }
if (-f "$ARGV[1]/validate") { $ARGV[1] .= "/validate"; }
open (FILE, $ARGV[0]) || die $!;
while (<FILE>) {
my @x = split(/\s+/);
push (@fp1, $x[2] / ($x[0] + $x[2]));
push (@fn1, $x[3] / ($x[1] + $x[3]));
push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
}
close (FILE);
my (@fp2, @fn2, @tcr2);
open (FILE, $ARGV[1]) || die $!;
while (<FILE>) {
my @x = split(/\s+/);
push (@fp2, $x[2] / ($x[0] + $x[2]));
push (@fn2, $x[3] / ($x[1] + $x[3]));
push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
}
close (FILE);
stat_analysis ("False positives", "pct", \@fp1, \@fp2);
stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
sub stat_analysis {
my $title = shift;
my $pct = shift;
my $s1 = shift;
my $s2 = shift;
unless ( scalar(@$s1) == scalar(@$s1) ) {
print STDERR "Can't compute stats for $title. Samples are not paired.\n";
return;
}
# This is the number of degrees of freedom of the two sample sets (i.e.
# the number of samples in each set).
my $dof = scalar(@{$s1});
print "$title:\n";
# Compute the mean and standard deviation of the first sample
# mean = 1/n * sum(s[i])
my $mean_s1 = 0;
foreach my $i (1..$dof) {
$mean_s1 += $$s1[$i];
}
$mean_s1 /= $dof;
# var = 1/(n-1) * sum((mean - s[i])^2)
my $var_s1 = 0;
foreach my $i (1..$dof) {
$var_s1 += ($mean_s1 - $$s1[$i])**2;
}
$var_s1 /= $dof - 1;
# std = sqrt(var)
my $std_s1 = sqrt($var_s1);
# Compute the mean and standard deviation of the second sample
# mean = 1/n * sum(s[i])
my $mean_s2 = 0;
foreach my $i (1..$dof) {
$mean_s2 += $$s2[$i];
}
$mean_s2 /= $dof;
# var = 1/(n-1) * sum((mean - s[i])^2)
my $var_s2 = 0;
foreach my $i (1..$dof) {
$var_s2 += ($mean_s2 - $$s2[$i])**2;
}
$var_s2 /= $dof - 1;
# std = sqrt(var)
my $std_s2 = sqrt($var_s2);
# SA developers like percentage points instead of probabilities.
if ( $pct eq "pct" ) {
printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
} else {
printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
}
# Compute the mean of the differences between the two samples
my $mean_d = 0;
foreach my $i (1..$dof) {
$mean_d += $$s1[$i] - $$s2[$i];
}
$mean_d /= $dof;
# Compute the variance of the differences between the two samples
my $var_d = 0;
foreach my $i (1..$dof) {
$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
}
$var_d /= $dof - 1;
my $std_d = sqrt($var_d);
# To determine whether two samples are from the same distribution
# (i.e. they have the same mean), we are going to use a paired sample
# t-test. You can find more information about this in Tom Mitchell's
# "Machine Learning" book.
# Let t = mean / (var * sqrt(n))
my $tstat;
if ( $var_d > 0 ) {
$tstat = $mean_d / sqrt($var_d / $dof);
} else {
$tstat = 0;
}
# Now we find the critical value of t for the alpha% confidence
# interval.
my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
# This is the probability that the two distributions are from different
# means.
my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
# If the t statistic is less than the critical value, this means that
# we should accept the null hypothesis (the distributions have the same
# mean), otherwise it should be accepted.
if ( abs($tstat) < $tcrit ) {
printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
} else {
printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
}
# This displays an estimate of the confidence interval around the
# estimated mean difference between the two samples. Bear in mind that
# the t statistic is working on the local differences and not the
# global difference.
if ( $pct eq "pct" ) {
printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
} else {
printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
}
print "\n";
}
|