1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#!/usr/bin/perl
# This script is used to print some statistics about classification accuracy
# with a k-fold cross validation
use strict;
# Main script body: read the per-fold counts from the file named by the first
# command-line argument, derive three per-fold metrics, and print the mean and
# standard deviation of each metric via stat_analysis().
#
# Each non-blank input line must hold at least four whitespace-separated
# numeric fields x0..x3. The metrics computed per line are:
#   false-positive rate = x2 / (x0 + x2)
#   false-negative rate = x3 / (x1 + x3)
#   TCR                 = x1 / (x3 + lambda * x2)
# NOTE(review): presumably x0/x1 are the correctly classified ham/spam counts
# and x2/x3 the false positives/negatives of one fold -- confirm against the
# program that writes this file.
my $lambda = 50;    # desired lambda for TCR calculation

if ( scalar(@ARGV) < 1 ) {
    print STDERR "Usage: model-statistics [validate]\n";
    exit 1;
}

my (@fp1, @fn1, @tcr1);

# Three-argument open with an explicit read mode: the file name comes from the
# command line, and two-argument open would interpret characters such as a
# leading '>' or a trailing '|' as a mode or a command to run.
my $validate_file = $ARGV[0];
open(my $validate_fh, '<', $validate_file)
    or die "Cannot open $validate_file: $!\n";

while ( my $line = <$validate_fh> ) {
    # split ' ' discards leading whitespace, so an indented line does not
    # produce an empty first field that would shift every column by one.
    my @x = split ' ', $line;

    # Blank lines (e.g. a trailing empty line) carry no data.
    next if !@x;

    if ( scalar(@x) < 4 ) {
        warn "$validate_file line $.: expected 4 fields, skipping\n";
        next;
    }

    # A metric whose denominator is zero is undefined for this fold (for
    # example TCR when the fold has no misclassifications). Leave it out of
    # that metric's sample set instead of aborting the whole run.
    my $fp_denominator  = $x[0] + $x[2];
    my $fn_denominator  = $x[1] + $x[3];
    my $tcr_denominator = $x[3] + $lambda * $x[2];

    if ( $fp_denominator != 0 ) {
        push(@fp1, $x[2] / $fp_denominator);
    }
    else {
        warn "$validate_file line $.: false-positive rate undefined, skipping\n";
    }

    if ( $fn_denominator != 0 ) {
        push(@fn1, $x[3] / $fn_denominator);
    }
    else {
        warn "$validate_file line $.: false-negative rate undefined, skipping\n";
    }

    if ( $tcr_denominator != 0 ) {
        push(@tcr1, $x[1] / $tcr_denominator);
    }
    else {
        warn "$validate_file line $.: TCR undefined, skipping\n";
    }
}
close($validate_fh);

stat_analysis ("False positives", "pct", \@fp1);
stat_analysis ("False negatives", "pct", \@fn1);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1);
# Print the mean and sample standard deviation of one set of samples.
#
# Arguments (positional):
#   $title - label printed at the start of the output line
#   $pct   - "pct" prints both values multiplied by 100 (the mean is followed
#            by a '%' sign); any other value prints them unscaled
#   $s1    - reference to an array of numeric samples
#
# Output: a single line written with printf to the currently selected
# filehandle. An empty sample set prints "<title>: no samples"; a single
# sample is reported with a standard deviation of 0.
sub stat_analysis {
    my $title = shift;
    my $pct = shift;
    my $s1 = shift;

    # Number of samples in the set.
    my $dof = scalar(@{$s1});

    # An empty set has no mean; say so instead of dividing by zero.
    if ( $dof == 0 ) {
        printf "%s: no samples\n", $title;
        return;
    }

    # mean = 1/n * sum(s[i])
    # Walk the elements themselves: Perl arrays are 0-based, so indexing with
    # 1..n would drop the first sample and read one element past the end.
    my $mean_s1 = 0;
    foreach my $sample (@{$s1}) {
        $mean_s1 += $sample;
    }
    $mean_s1 /= $dof;

    # var = 1/(n-1) * sum((mean - s[i])^2)
    # With a single sample n-1 is zero and there is no spread to estimate,
    # so the variance is left at 0.
    my $var_s1 = 0;
    if ( $dof > 1 ) {
        foreach my $sample (@{$s1}) {
            $var_s1 += ($mean_s1 - $sample)**2;
        }
        $var_s1 /= $dof - 1;
    }

    # std = sqrt(var)
    my $std_s1 = sqrt($var_s1);

    # SA developers like percentage points instead of probabilities.
    if ( $pct eq "pct" ) {
        printf "%s: mean=%0.4f%% std=%0.4f\n",$title,100*$mean_s1,100*$std_s1;
    } else {
        printf "%s: mean=%0.4f std=%0.4f\n",$title,$mean_s1,$std_s1;
    }
}
|