1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
#!/usr/bin/perl
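# Read a uclust result file; for each cluster seed/target label, print the label,
# the worst (lowest) percent identity among its hits, and the labels (prokMSAids)
# of the sequences collected under it, tab-separated, one cluster per line.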
use strict;
use warnings;
# Cluster membership: seed/target label => array ref of the labels collected
# under it. Filled in by parseUclust().
my %clusters = ();

# Lowest percent identity recorded for each seed/target label. Filled in by
# parseUclust().
my %worstPcids = ();

# Fail early with a usage message instead of passing undef on to open(),
# which would only produce uninitialized-value warnings and a die message
# with no file name in it.
die "Usage: $0 <uclust result file>\n" unless @ARGV;

parseUclust( $ARGV[0] );

# One tab-separated line per cluster, in sorted key order:
#   <label> <worst percent identity> <member> [<member> ...]
for my $key ( sort keys %clusters ) {
    my $worstPcid = $worstPcids{$key};

    #print STDERR "$key -> $clusters{$key}\n";
    print "$key\t$worstPcid\t" . join( "\t", @{ $clusters{$key} } ) . "\n";
}
# parseUclust($ucFileName)
#
# Reads the tab-separated uclust result file $ucFileName and fills the
# file-level hashes %clusters and %worstPcids:
#
#   * an "S" record starts a cluster keyed by its query label (field 9),
#     containing that label itself, with a worst percent identity of 100.0;
#   * an "H" record appends its query label (field 9) to the cluster keyed by
#     its target label (field 10) and lowers that cluster's worst percent
#     identity to the record's percent identity (field 4) if it is smaller;
#   * every other record type, comment lines (leading '#'), blank lines and
#     records too short to carry the needed labels are ignored.
#
# Dies if the file cannot be opened.
sub parseUclust {
    my ($ucFileName) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as a mode or a pipe, and the handle is scoped to this sub.
    open( my $ucHandle, '<', $ucFileName )
        or die("Could not open $ucFileName: $!");

    # Read into a lexical rather than the global $_ so the caller's $_ is
    # left alone.
    while ( my $line = <$ucHandle> ) {

        # Strip the line terminator once, up front; this also handles CRLF
        # files, where a plain chomp would leave "\r" on the last label.
        $line =~ s/\r?\n\z//;

        next if $line =~ /^\s*#/;    # comment line
        next if $line !~ /\S/;       # blank line

        # Only four of the ten columns are used: record type, percent
        # identity, query label and target label.
        my ( $type, $percentid, $querylabel, $targetlabel ) =
            ( split /\t/, $line )[ 0, 3, 8, 9 ];

        # A truncated record has no query label; there is nothing to file.
        next if !defined $querylabel;

        if ( $type eq "S" ) {

            #print STDERR "S $querylabel\n";
            $clusters{$querylabel}   = [$querylabel];
            $worstPcids{$querylabel} = 100.0;
        }
        elsif ( $type eq "H" ) {
            next if !defined $targetlabel;

            #print STDERR "H $targetlabel $querylabel\n";
            push @{ $clusters{$targetlabel} }, $querylabel;

            # If no "S" record was seen for this target there is no worst
            # value yet; comparing against undef would never record one, so
            # seed it from the first hit.
            if ( !defined $worstPcids{$targetlabel}
                || $percentid < $worstPcids{$targetlabel} )
            {
                $worstPcids{$targetlabel} = $percentid;
            }
        }

        # ignore other types
    }
    close $ucHandle;
}
|