#!/usr/bin/perl
# Read the prokMSAids of the hits, as produced by exactMatchIds,
# then look up the taxonomy for those prokMSAids and derive a consensus classification from the set.
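#
# Usage (inferred from the argument handling below):
#   perl <this script> <taxonomy file> <hits file> > classifications.txt
# ARGV[0]: taxonomy file mapping a prokMSAid to a taxonomy string.
# ARGV[1]: hits file listing, per read, a label, the best percent identity, and the matching prokMSAids.
# Output: one line per read: label, best pcid, then one column per predicted taxonomic level
#         (formatted by LevelPrediction::toString), or a status keyword such as NOHIT.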
use strict;
use warnings;
#use lib '.';
use FindBin;
use lib "$FindBin::Bin";
use LevelPrediction;
my %taxonomies = ();

# my %taxonomyPrior = ();

loadTaxonomy( $ARGV[0] );
processHits( $ARGV[1] );
sub loadTaxonomy {
    my ($taxFileName) = @_;

    print STDERR "loading taxonomy...\n";
    open( TAX, $taxFileName ) or die "Cannot open input taxonomy file $taxFileName: $!";

    my $lines = 0;
    while ( my $line = <TAX> ) {
        chomp $line;

        # Try to accept several formats: semicolon- or tab-delimited taxonomy strings,
        # with or without the "pcid-width" column between the sequence id and the taxonomy string.
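        # Hypothetical example lines (ids and names made up for illustration), both accepted below:
        #   4321<TAB>Bacteria;Proteobacteria;Gammaproteobacteria
        #   4321<TAB>96.5<TAB>Bacteria;Proteobacteria;Gammaproteobacteria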
        my ( $prokMSAid, @taxonomy ) = split( /[;\t]/, $line );

        #my ( @taxonomy ) = split( /; /, $taxString );

        if ( @taxonomy && ( $taxonomy[0] eq "" || $taxonomy[0] =~ /^(\d+\.?\d*|\.\d+)$/ ) ) {

            # the value is numeric or empty, so it must be the pcid width of the target cluster; skip it
            my $taxPcid = shift @taxonomy;

            #print STDERR "Ignoring target cluster width: $taxPcid\n";
        }

        $taxonomies{$prokMSAid} = \@taxonomy;

        # my $taxString = "";
        # for my $taxElem (@taxonomy) {
        #     $taxString .= "$taxElem; ";
        #
        #     # $taxonomyPrior{$taxString}++;
        # }

        $lines++;
    }
    close TAX;
    print STDERR "...done loading $lines taxonomy lines\n";

    # print STDERR "normalizing prior...\n";
    # for my $taxKey (keys %taxonomyPrior) {
    #     $taxonomyPrior{$taxKey} /= $lines;
    # }
    # $taxonomyPrior{""} = 1;
    # print STDERR "...done normalizing prior\n";
}
sub printLine {
    my ( $label, $bestPcid, @ids ) = @_;

    my $levelPredictions = makeTaxonomyPrediction(@ids);

    print "$label\t$bestPcid";
    for my $levelPrediction (@$levelPredictions) {
        print "\t" . $levelPrediction->toString();
    }

    #if(scalar(@$levelPredictions) == 0) {$unclassified++}
    #if(scalar(@$levelPredictions) == 0) { print "\tUNCLASSIFIED"; }

    print "\n";

    # return true if no level could be predicted, so the caller can count unclassified reads
    return ( scalar(@$levelPredictions) == 0 );
}
sub processHits {
    my ($hitsFileName) = @_;
    open( HITS, $hitsFileName ) || die("Could not open $hitsFileName");

    my $hit          = 0;
    my $noprimer     = 0;
    my $nohit        = 0;
    my $toomanyhits  = 0;
    my $nomatepair   = 0;
    my $unclassified = 0;

    while (<HITS>) {
        chomp;
        my ( $label, $bestPcid, @ids ) = split /\t/;
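        # Hypothetical example input lines (tab-separated; labels and ids made up for illustration):
        #   read_0001<TAB>99.2<TAB>409184<TAB>233771
        #   read_0002<TAB><TAB>NOHIT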
        # An empty id list and an explicit "NOHIT" keyword mean the same thing:
        # originally this case was represented only as an empty id list, but the
        # upstream tools now also emit the NOHIT keyword.
        if ( !@ids || $ids[0] eq "" || $ids[0] eq "NOHIT" ) {
            print "$label\t\tNOHIT\n";
            $nohit++;
        }
        elsif ( $ids[0] eq "NOPRIMER" ) {
            print "$label\t\tNOPRIMER\n";
            $noprimer++;
        }
        elsif ( $ids[0] eq "TOOMANYHITS" ) {
            print "$label\t\tTOOMANYHITS\n";
            $toomanyhits++;
        }
        elsif ( $ids[0] eq "NOMATEPAIR" ) {
            print "$label\t\tNOMATEPAIR\n";
            $nomatepair++;
        }
        else {
            $unclassified += printLine( $label, $bestPcid, @ids );
            $hit++;
        }
    }

    my $samples = $hit + $nohit + $toomanyhits + $nomatepair + $noprimer;
    print STDERR "$samples items\n";
    if ( $samples != 0 ) {
        print STDERR "$noprimer (" . sprintf( "%.1f", ( 100 * $noprimer / $samples ) ) . "%) had no primer match\n";
        print STDERR "$nohit (" . sprintf( "%.1f", ( 100 * $nohit / $samples ) ) . "%) had no hits\n";
        print STDERR "$toomanyhits (" . sprintf( "%.1f", ( 100 * $toomanyhits / $samples ) ) . "%) had too many hits\n";
        print STDERR "$nomatepair (" . sprintf( "%.1f", ( 100 * $nomatepair / $samples ) ) . "%) had no mate pair\n";
        print STDERR "$unclassified (" . sprintf( "%.1f", ( 100 * $unclassified / $samples ) ) . "%) had hits but no classification\n";
        print STDERR "" . ( $hit - $unclassified ) . " (" . sprintf( "%.1f", ( 100 * ( $hit - $unclassified ) / $samples ) ) . "%) were classified\n";
    }

    # Classifications per level? That depends on the stringency filter, which is applied downstream.

    close HITS;
}
#sub normalizeByPrior {
#    my ($taxString, $taxonCounts) = @_;
#
#    print STDERR "Normalizing: \n";
#    while( my ($k, $v) = each %$taxonCounts) {
#        print STDERR "$k = $v ; ";
#    }
#    print STDERR "\n";
#
#    my $normalizer = $taxonomyPrior{$taxString};
#    if(!defined $normalizer)
#    {
#        print STDERR ("No prior: $taxString\n");
#    }
#
#    for my $taxon (keys %$taxonCounts)
#    {
#        $taxonCounts->{$taxon} = ($taxonCounts->{$taxon} / $taxonomyPrior{$taxString . $taxon . "; "}) * $normalizer;
#    }
#
#    print STDERR " Done: \n";
#    while( my ($k, $v) = each %$taxonCounts) {
#        print STDERR "$k = $v ; ";
#    }
#    print STDERR "\n";
#}
sub makeTaxonomyPrediction {
    my (@ids) = @_;

    my @levelPredictions = ();

    # TOOMANYHITS is now added in ucFilterBest.pl, so it should be caught in processHits() above.
    # if(scalar(@ids) == 1000) {
    #     # when we did the uclust at exactMatchOneReadSet:50, we used " --allhits --maxaccepts 1000 ".
    #     # after that we filtered for the best pcid set (using ucFilterBest.pl).
    #     # if 1000 hits remain, then the real set of best-pcid hits is larger, and we missed some.
    #     # In that case we should bail, because the set of 1000 hits we do have may not be representative.
    #     # I think this is why matching the E517F primer alone (17nt reads) produced predictions, and at different levels to boot.
    #     # That also depends on the classification thresholds.
    #
    #     my $levelPrediction = LevelPrediction->new();
    #     $levelPrediction->label("TOOMANYHITS");
    #     push @levelPredictions, $levelPrediction;
    #     return \@levelPredictions;
    # }

    my @taxonomyVectors = map { $taxonomies{$_} } @ids;

    # my @taxonomyClusterSizes = map { $taxonomyWorstPcids{$_} } @ids;

    my $globalTotal    = @taxonomyVectors;
    my $predictedTaxa  = "";
    my $globalUnknowns = 0;    # summed over all levels, incrementing as we go
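    # The loop below walks down the taxonomy one level at a time: count how often each
    # taxon appears among the surviving hits, accept the most frequent one if it has at
    # least twice the votes of the runner-up, record local/global support proportions,
    # then discard the hits that disagree and descend to the next level.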
    for my $level ( 0 .. 10 ) {
        my $levelPrediction = LevelPrediction->new();

        # assert: the remaining taxonomyVectors are equal at all higher levels
        my %taxonCounts   = ();
        my $localUnknowns = 0;
        my $localTotal    = @taxonomyVectors;

        # count up how often each label is seen descending from this node
        for my $vec (@taxonomyVectors) {
            my $taxon = $vec->[$level];

            # "Unknown" should occur only towards the leaves; an unspecified intermediate level
            # followed by a populated level is a "skip". Here, "skip" is counted as just another taxon.
            if ( !defined $taxon || $taxon eq "" || $taxon =~ /unknown/i ) { $localUnknowns++; }
            else                                                           { $taxonCounts{$taxon}++; }
        }
        if ( $localUnknowns == $localTotal ) { last; }

        # This normalization makes no sense, because we don't want a uniform prior either:
        # e.g., with the prior applied, one Archaeal hit among dozens of Bacterial hits would win,
        # because there are so few Archaea in GreenGenes to begin with.
        # normalizeByPrior( $predictedTaxa, \%taxonCounts );

        # get the best label and check for ties
        $levelPrediction->numChildren( scalar( keys %taxonCounts ) );
        my @taxaInOrder = sort { $taxonCounts{$b} <=> $taxonCounts{$a} } keys %taxonCounts;
        my $bestTaxon   = $taxaInOrder[0];

        # print STDERR "$bestLabel\n";
        $levelPrediction->label($bestTaxon);
        $predictedTaxa .= "$bestTaxon; ";
        my $bestCount = $taxonCounts{$bestTaxon};
        $levelPrediction->count($bestCount);

        my $secondBestTaxon = $taxaInOrder[1];
        if ( defined $secondBestTaxon ) {
            my $secondBestCount = $taxonCounts{$secondBestTaxon};

            # Declare a tie if the winning taxon doesn't have at least twice as many votes
            # as the runner-up; just consider this level unknown and bail.
            if ( $levelPrediction->count() < 2 * $secondBestCount ) {
                last;
            }
        }
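        # Hypothetical worked example of the margin rule above: with 10 Proteobacteria hits
        # vs. 4 Firmicutes hits, 10 >= 2 * 4, so Proteobacteria is accepted; with 10 vs. 6,
        # 10 < 12, so this level is left unpredicted and the descent stops here.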
        # compute various ratios of the prediction vs. the alternatives
        $levelPrediction->localMinProportion( $bestCount / $localTotal );
        $levelPrediction->localMaxProportion( ( $bestCount + $localUnknowns ) / $localTotal );

        # accumulate the unknowns seen so far across all levels
        $globalUnknowns += $localUnknowns;
        $levelPrediction->globalMinProportion( $bestCount / $globalTotal );
        $levelPrediction->globalMaxProportion( ( $bestCount + $globalUnknowns ) / $globalTotal );

        # What if all of the "unknown" matches should have been the same taxon?
        # Then an "alternate" classification might have won.
        $levelPrediction->alternateLocalProportion( $localUnknowns / $localTotal );

        # note: the local unknowns were already added to the global unknowns above
        $levelPrediction->alternateGlobalProportion( $globalUnknowns / $globalTotal );

        # It's possible that a completely different path has a higher global probability than this one,
        # but we'd never know, because we pick the max _per level_ and never explore the other paths.

        # decide whether to bother continuing
        # if ( $bestLocalMinProportion < 0.5 ) { last; }

        # for now, print all the best labels until everything is unknown or there is a tie; sort out the confidence later
        push @levelPredictions, $levelPrediction;

        # remove any non-matching paths from consideration at the next level
        my @newTaxonomyVectors = ();
        for my $vec (@taxonomyVectors) {
            my $taxon = $vec->[$level];
            if ( defined $taxon && $taxon eq $bestTaxon ) { push @newTaxonomyVectors, $vec; }
        }
        @taxonomyVectors = @newTaxonomyVectors;
    }

    return \@levelPredictions;
}