File: filterShrimp.pl

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (142 lines) | stat: -rwxr-xr-x 4,465 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/perl
#
# filter a file from the Shrimp alignment program
#
#
#
use strict;
use Getopt::Long;

my $usage = "$0 -- filter a SHRIMP format file\n";
$usage .= "\n";
$usage .= "Usage: $0 <in.psl >out.f.psl\n";
$usage .= "  PREREQUISITE: input file must be sorted by query name\n";
$usage .= "  \n";
$usage .= "  options:\n";
$usage .= "  --minScore=n       minimal percentage of identity (default 300)\n";
$usage .= "  --uniq             take only best match and only, when second best is much worse (default false)\n";
$usage .= "  --uniqthresh=f     the best alignment is considered uniquen when the second best has a score at most this much fraction of the best (default 0.9)\n";
$usage .= "  --best             take (all) best alignment(s) if they pass the minScore filter.\n";
$usage .= "  --commongenefile=s file name in which to write cases where one read maps to several different genes\n";
$usage .= "  --verbose          output debugging info (default false)\n";


my $minScore = 300;;
my $uniqthresh = 0.9; # a match is considered unique if the second best match has less than this percentage of the best
my $uniq = 0;
my $best = 0;
my $commongenefile;
my $verbose = 0;
my $help = 0;
my $cmdline = join(" ", @ARGV);

GetOptions(
    'help!'=>\$help,
    'minScore:i'=>\$minScore,
    'uniqthresh:f'=>\$uniqthresh,
    'uniq!'=>\$uniq,
    'best!'=>\$best,
    'commongenefile:s'=>\$commongenefile,
    'verbose!'=>\$verbose);

if ($help) {
    print "$usage";
    exit(0);
}
if ($uniq && $best){
    print "--uniq and --best cannot be used at the same time\n";
    exit(0);
}

my ($readname, $contigname, $strand, $contigstart, $contigend, $readstart, $readend, $readlength, $score, $editstring);
my $oldreadname = "";
my @f;
my ($outMinScore, $outUnique, $outBest) = (0,0,0); # number of reasons for filtering (nested, this order)
my @qali = (); # array of array references: lines for each query (pair)
open (COMMON, ">$commongenefile") or die ("Could not open $commongenefile for writing.") if (defined($commongenefile));

while (<>) {
    s/^#.*//;
    next unless /\S/;

    @f = split /\t/, $_;
    if (@f < 10) { warn "Not SHRIMP format"; next }
    # ($readname, $contigname, $strand, $contigstart, $contigend, $readstart, $readend, $readlength, $score, $editstring) = @f;
    $readname = $f[0];
    $contigname = $f[1];
    $score    = $f[8];
    
    processQuery() if ($oldreadname ne $readname && $oldreadname ne "");

    # filter for minimum score
    if ($score < $minScore){
	$outMinScore++;
	next;
    }

    push @qali, [$_, $score, $contigname];
    $oldreadname = $readname;
}
processQuery() if ($readname ne "");

close COMMON if (defined($commongenefile));


print STDERR "\n        filtered:\n";
print STDERR "----------------:\n";
print STDERR "mininum score   : $outMinScore\n";
print STDERR "uniqueness      : $outUnique\n";
print STDERR "best            : $outBest\n";
print STDERR "command line: $cmdline\n";

sub processQuery(){
    # print "processing " . scalar(@qali) . " alignments\n";
    
    if (($uniq || $best) && @qali>1){
	my %rscores;
	foreach my $ali (@qali){
	    $rscores{$ali} = scoreAli($ali); # store scores in hash so later sorting is faster
	}
	@qali = sort {-($rscores{$a} <=> $rscores{$b})} @qali;
	if ($uniq){
	    # let pass only best alignment, and only if second is significantly worse
	    my $ratio = $rscores{$qali[1]}/$rscores{$qali[0]};
	    if ($verbose){
		print "best two alignments\n";
		print "" . $qali[0]->[0] ."\nscore=$rscores{$qali[0]}\n";
		print "" . $qali[1]->[0] ."\nscore=$rscores{$qali[1]}\n";
		print "ratio = $ratio\n";
	    }
	    if ($ratio < $uniqthresh){
		print $qali[0]->[0];
	    } else {
		$outUnique++;
	    }
	} else { # best filtering, take all alignments that have the same score as the best
	    my $bestscore = $rscores{$qali[0]};
	    my @bestTnames = ();
	    while ($rscores{$qali[0]} == $bestscore){
		print $qali[0]->[0];
		push @bestTnames, $qali[0]->[2];
		shift @qali;
	    }
	    $outBest += @qali;
	    if (@bestTnames > 1){
		my %genenames = ();
		foreach my $Tname (@bestTnames) { $Tname =~ s/\.t\d+//; $genenames{$Tname}=1; }
		print COMMON $oldreadname . "\t" . join(" ", keys %genenames) . "\n" if (%genenames>1 && defined($commongenefile));
	    }
	}
    } else {
	foreach my $ali (@qali){
	    print $ali->[0];
	}
    }
    @qali = ();
}

# for comparing quality of two alignments
sub scoreAli{
    my $ali = shift;
    return $ali->[1];
}