File: filterPSL.pl

package info (click to toggle)
augustus 3.5.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 777,052 kB
  • sloc: cpp: 80,066; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,141; sh: 171; javascript: 32
file content (469 lines) | stat: -rwxr-xr-x 17,619 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
#!/usr/bin/perl
#
# filter a psl file (from BLAT,GMAP)
#
# Mario Stanke, September 2009
#
use strict;
use Getopt::Long;

my $usage = "$0 -- filter a psl file (e.g. BLAT or GMAP)\n";
$usage .= "\n";
$usage .= "Usage: $0 <in.psl >out.f.psl\n";
$usage .= "  PREREQUISITE: input psl file must be sorted by query name (standard with BLAT and GMAP)\n";
$usage .= "                Do a sort -k 10,10 but be aware: LC_ALL may have to be set to C because sort ignores characters like \":\"\n";
$usage .= "  if option 'paired' is used then it expects .f,.r or /1,/2 suffixes of mate pairs\n";
$usage .= "  \n";
$usage .= "  options:\n";
$usage .= "  --pairbed=s        file name of pairedness coverage:\n";
$usage .= "                     a .bed format file in which for each position the number of filtered\n";
$usage .= "                     read pairs is reported that contain the position in or between the reads\n";
$usage .= "  --minId=n          minimal percentage of identity (default 92)\n";
$usage .= "  --minCover=n       minimal percentage of coverage of the query read (default 80)\n";
$usage .= "  --uniq             take only best match and only, when second best is much worse (default false)\n";
$usage .= "  --uniqthresh       threshold % for uniq, second best must be at most this fraction of best (default .96)\n";
$usage .= "  --best             output all best matches that satisfy minId and minCover\n";
$usage .= "  --commongenefile=s file name in which to write cases where one read maps to several different genes\n";
$usage .= "  --nointrons        do not allow longer gaps (for RNA-RNA alignments)\n";
$usage .= "  --paired           require that paired reads are on opposite strands of same target(default false)\n";
$usage .= "  --maxintronlen=n   maximal separation of paired reads (default 500000\n";
$usage .= "  --verbose          output debugging info (default false)\n";


my $maxintronlen = 500000;
my $minintronlen = 35;
my $maxSortesTest = 100000; # check sortedness only for this many of the first lines to save memory
my $minId = 92;
my $minCover = 80;
my $uniqthresh = 0.96; # a match is considered unique if the second best match has less than this percentage of the best
my $uniq = 0;
my $nointrons = 0;
my $best = 0;
my $commongenefile;
my $pairbedfile;
my $paired = 0;
my $verbose = 0;
my $help = 0;
my %qnamestems = (); # for checking sortedness.
my $cmdline = join(" ", @ARGV);
my $maxCountInsert = 1000000;

GetOptions(
    'help!'=>\$help,
    'maxintronlen:i'=>\$maxintronlen,
    'minId:i'=>\$minId,
    'minCover:i'=>\$minCover,
    'uniqthresh:f'=>\$uniqthresh,
    'paired!'=>\$paired,
    'uniq!'=>\$uniq,
    'nointrons!'=>\$nointrons,
    'best!'=>\$best,
    'commongenefile:s'=>\$commongenefile,
    'pairbed:s'=>\$pairbedfile,
    'verbose!'=>\$verbose);

if ($help) {
    print "$usage";
    exit(0);
}


my ($match,$TgapCount,$strand,$qname,$qstart,$qend,$qsize,$targetname,$tstart,$tend,$blockSizes,$qStarts,$tStarts,$qBaseInsert,$tBaseInsert);
my ($qnamestem,$qsuffix);
my $skiplines=0;
my ($line,$lastCompactifyLine)=(0,0);
my $oldqnamestem = "";
my (@f,@b,@t,@q,@insertlen);
my ($outMinId,$outMinCover,$outPaired,$outUniq,$outBest,$outIntrons) = (0,0,0,0,0,0); # number of reasons for filtering (nested, this order)
my @qali = (); # array of array references: lines for each query (pair)
my %paircovsteps = (); # for pairedness coverage
                       # keys: target names (chromosomes)
                       # values: array references
                       #         elements: [pos,diff]
                       #                   position is 0-based, diff is +1 at start or -1 at end of mate pair
                       #                XXXXXXXXXXXX---------------------XXXXXXXXX
                       #               +1                                         -1


open (COMMON, ">$commongenefile") or die ("Could not open $commongenefile for writing.") if (defined($commongenefile));

while (<>) {
    $skiplines=5 if (/psLayout/);
    if ($skiplines>0) {
        $skiplines--;
        next;
    }

    $line++;
    s/^#.*//;
    next unless /\S/;
    if ($line%100000==1){
	$| = 1;
	print STDERR "\r"."processed line $line";
    }
    if (defined($pairbedfile) && $line % 10000000 == 0 && $line >= $lastCompactifyLine * 1.5){
	print STDERR "\ncompactifing coverage after $line lines ...";
	compactifyBed();
	$lastCompactifyLine = $line;
	print STDERR "done\n";
    }

    @f = split /\t/, $_, 21;
    if (@f < 20) { warn "Not PSL format"; next }
    
    $match       = $f[0];
    $qBaseInsert = $f[5];
    $TgapCount   = $f[6];
    $tBaseInsert = $f[7];
    $strand      = $f[8];
    $qname       = $f[9];
    $qsize       = $f[10];
    $qstart      = $f[11];
    $qend        = $f[12];    
    $targetname  = $f[13];
    $tstart      = $f[15];
    $tend        = $f[16];
    $blockSizes  = $f[18];
    $qStarts     = $f[19];
    $tStarts     = $f[20];
    
    $blockSizes =~ s/[, ]$//;
    $tStarts =~ s/[, ]$//;
    @b = split /,/, $blockSizes; #
    @t = split /,/, $tStarts;
    @q = split /,/, $qStarts;
    
    $qnamestem = $qname;
    if ($paired){
        $qnamestem =~ s/[\.\/]([fr12])$//;
        $qsuffix = $1; # f,1: forward mate, r,2: reverse mate "": no mates
    }
    if ($oldqnamestem ne $qnamestem && $oldqnamestem ne ""){
	if ($line <= $maxSortesTest && $qnamestems{$qnamestem}){
	    print STDERR "Input file not sorted by query name! $qnamestem occurred previously. Set LC_ALL=C and sort -k 10,10\n";
	    exit 1;
	}
	processQuery() if (@qali);
    }

    # filter for minimum percentage of identity
    my $tgap = 0; # inserted bases in target sequence, excluding introns
    my $qgap = 0; # inserted bases in query sequence
    my $gaps = 0; # total amount of gaps
    
    
    for (my $i=0; $i<@b-1; $i++){
	$tgap = $t[$i+1]-$t[$i]-$b[$i];
	$qgap = $q[$i+1]-$q[$i]-$b[$i];
	$tgap = 0 if ($qgap ==0 && $tgap >= $minintronlen && $tgap <= $maxintronlen); #target gap is intron
	$gaps += ($tgap>$qgap)?  $tgap : $qgap; # count the larger gap if both seqs happen to have a gap
    }
    my $percid = sprintf("%.1f", 100*$match/((($qend - $qstart + $gaps)))); 
    if ($percid < $minId){
	$outMinId++;
	next;
    }

    # filter for minimum coverage
    my $coverage =  sprintf("%.1f", 100*($qend - $qstart)/$qsize);
    if ($coverage < $minCover){
	$outMinCover++;
	next;
    }

    # filter for introns
    if ($nointrons && $qBaseInsert + $tBaseInsert > 10){
	$outIntrons++;
	next;
    }

    push @qali, [$_, $targetname, $qsuffix, $strand, $tstart, $tend, $percid, $coverage];
    # print "$targetname, $qsuffix, $strand, $tstart, $tend, $percid\n";

    $oldqnamestem = $qnamestem;
    $qnamestems{$qnamestem} = 1 if ($line <= $maxSortesTest);
}
processQuery() if ($qnamestem ne "");

close COMMON if (defined($commongenefile));

#
# write pairedness coverage info into the pairbedfile
#
if (defined($pairbedfile)){
    open (PAIRBED, ">$pairbedfile") or die ("Could not open $pairbedfile for writing.");
    print PAIRBED "track type=bedGraph name=\"pairedness coverage\" description=\"pairedness coverage\"";
    print PAIRBED " visibility=full color=200,100,0 altColor=200,100,0\n";
    compactifyBed();
    foreach my $chr (sort keys %paircovsteps){
	my $cov = 0;
	my $pos = 0;
	next if (!@{$paircovsteps{$chr}});
	foreach my $step (@{$paircovsteps{$chr}}){
	    print PAIRBED "$chr\t$pos\t$step->[0]\t$cov\n" if ($pos<$step->[0] && $cov>0);
	    $pos = $step->[0];
	    $cov += $step->[1];
	}
	warn ("inconsistent") if ($cov!=0);
    }
    close PAIRBED;
}

@insertlen = sort {$a <=> $b} @insertlen;

#foreach(@insertlen){
#    print STDOUT "Insert\t".$_."\n";
#}

print STDERR "\n        filtered:\n";
print STDERR "----------------:\n";
print STDERR "percent identity: $outMinId\n";
print STDERR "coverage        : $outMinCover\n";
print STDERR "nointrons       : $outIntrons\n" if ($nointrons);
if ($paired) {
    print STDERR "not paired      : $outPaired\n" if ($paired);
    print STDERR "quantiles of unspliced insert lengths: ";
    for (my $i=1;$i<10;$i++){
	print STDERR "q[" . (10*$i) . "%]=" . ($insertlen[int($i*@insertlen/10)]) . ", ";
    }
    print STDERR "\n";
}
print STDERR "uniq            : $outUniq\n" if ($uniq);
print STDERR "best            : $outBest\n" if ($best);
print STDERR "command line: $cmdline\n";

sub processQuery(){
    # print "processing " . scalar(@qali) . " alignments\n";
    
    # filter @qali based on mate pair consistency
    # keep only alignments for which there is a possible mate:
    # 1) same chromosome
    # 2) different strand
    # 3) distance in genome < minintronlen
    if ($paired){
	@qali = sort {$a->[1] cmp $b->[1] || $a->[4] cmp $b->[4]} @qali;
	my @matepairs = ();
	my %mated = ();
	for (my $i=0;$i < @qali-1; $i++){
	    for (my $j=$i+1; $j < @qali && $qali[$i]->[1] eq $qali[$j]->[1]; $j++){ # only loop until leave chromosome
		#print "comparing $i,$qali[$i]->[1], with $j,$qali[$j]->[1]\n";
		if ($qali[$i]->[2] ne $qali[$j]->[2]){    # different mate: (f,r) or (1,2)
		    if ($qali[$i]->[3] ne $qali[$j]->[3]){ # different strand
			my $dist = $qali[$j]->[4] - $qali[$i]->[5] - 1;
			$dist = $qali[$i]->[4] - $qali[$j]->[5] - 1 if ($qali[$i]->[4] > $qali[$j]->[4]);
			if ($dist < $maxintronlen && $dist>=0){ # not too far apart, not overlapping either
			    # print "found mate pair $i,$j\n";
			    push @matepairs, [$i,$j,scoreMate($i,$j,$dist)];
			    $mated{$i}=0 if (!defined($mated{$i}));
			    $mated{$j}=0 if (!defined($mated{$j}));
			    $mated{$i}++;
			    $mated{$j}++;
			    my $inslen = $qali[$j]->[5] - $qali[$i]->[4] - 1;
			    $inslen = $qali[$i]->[5] - $qali[$j]->[4] - 1 if ($inslen<0);
			    push @insertlen, $inslen if (@insertlen < $maxCountInsert); # if not limited, this may use on huge files an unwarranted amount of RAM for a simple statistics
			} else {
			    #print "distance not right\n";
			}
		    }
		}
	    }
	}
	# print "found " . scalar(@matepairs) . " mate pairs, involving " . scalar(keys %mated) . " mates.\n" if ($verbose);
	$outPaired += @qali - scalar(keys %mated);
	if ((!$uniq && !$best) || @matepairs<2){# let pass all read alignments that are involved in mate pairs
	    foreach my $i (sort {$a <=> $b} keys %mated){
		print $qali[$i]->[0];
	    }
	} else {# uniq or best
	    @matepairs = sort {$b->[2] <=> $a->[2]} @matepairs;
	    if ($uniq){# let pass only best mate pair, and only if second is significantly worse
		my $second = 1;
		while ($second < @matepairs && similar($qali[$matepairs[0]->[0]], $qali[$matepairs[$second]->[0]], $qali[$matepairs[0]->[1]], $qali[$matepairs[$second]->[1]])){
		    $second++; 
		}
		if ($second < @matepairs){
		    my $ratio = $matepairs[$second]->[2] / $matepairs[0]->[2];
		    if ($verbose) {
			print "\nbest two mates\n";
			print "" . join (", ", @{$qali[$matepairs[0]->[0]]}) . "\npaired with\n"
			    .join (" ", @{$qali[$matepairs[0]->[1]]})
			    . "\nscore=$matepairs[0]->[2]\n";
			print "" . join (", ", @{$qali[$matepairs[1]->[0]]}) . "\npaired with\n"
			    .join (" ", @{$qali[$matepairs[1]->[1]]})
			    . "\nscore=$matepairs[1]->[2]\n";
			print "ratio = $ratio\n";
		    }
		    if ($ratio < $uniqthresh){
			# print the two alignments for best mate pair only
			print $qali[$matepairs[0]->[0]]->[0];
			print $qali[$matepairs[0]->[1]]->[0];
			$outUniq += @qali-1;
		    } else {
			@matepairs = ();
			$outUniq += @qali;
		    }
		} else {
		    print "suboptimal mate pairs are similar\n" if ($verbose);
		    print $qali[$matepairs[0]->[0]]->[0];
		    print $qali[$matepairs[0]->[1]]->[0];
		}
		splice @matepairs, 1; # keep only the best pair (if any)
	    } else { # best: take all best alignment pairs
		my $optscore = $matepairs[0]->[2];
                my @bestTnames = ();
		my $numbest = 0;
		my %haveOutput = (); # remember alignments that were printed, as some alignments can be part of several equally good pairs
		while ($numbest < @matepairs && $matepairs[$numbest]->[2] == $optscore){
		    my ($aliline1, $aliline2) = ($qali[$matepairs[$numbest]->[0]]->[0], $qali[$matepairs[$numbest]->[1]]->[0]);
		    print $aliline1 if (!exists($haveOutput{$aliline1}));
		    $haveOutput{$aliline1} = 1;
                    print $qali[$matepairs[$numbest]->[1]]->[0] if (!exists($haveOutput{$aliline2}));
                    $haveOutput{$aliline2} = 1;
                    push @bestTnames, $qali[$matepairs[$numbest]->[0]]->[1];
		    $numbest++;
		}
		$outBest += @matepairs - $numbest;
		splice @matepairs, $numbest; # keep only the first $numbest pairs
                if (@bestTnames>1){
                    my %genenames = ();
                    foreach my $Tname (@bestTnames) { $Tname =~ s/\.t\d+//; $genenames{$Tname}=1; }
                    print COMMON $oldqnamestem . "\t" . join(" ", keys %genenames) . "\n" if (%genenames > 1 && defined($commongenefile));
                }
	    }
	}
	# output pairedbed info: go through list of all mate pairs and store start and end position
	if (defined($pairbedfile)){
	    while (@matepairs>0){
		my $chr = $qali[$matepairs[0]->[0]]->[1];
		$paircovsteps{$chr} = [] if (!defined($paircovsteps{$chr}));
		my $pend = $qali[$matepairs[0]->[1]]->[5];
		my $pstart = $qali[$matepairs[0]->[0]]->[4];
		push @{$paircovsteps{$chr}}, [$pstart-1, 1];
		push @{$paircovsteps{$chr}}, [$pend, -1];
		shift @matepairs;
	    }
	}
    } else { # not paired, single read
	if (($uniq || $best) && @qali>1){
	    my %rscores;
	    foreach my $ali (@qali){
		$rscores{$ali} = scoreAli($ali); # store scores in hash so later sorting is faster
	    }
	    @qali = sort {-($rscores{$a} <=> $rscores{$b})} @qali;
	    if ($uniq) {
		my $second = 1;
                while ($second < @qali && similar($qali[0], $qali[$second])){
                    $second++;
                }
                if ($second < @qali){
		    # let pass only best mate pair, and only if second is significantly worse
		    my $ratio = $rscores{$qali[$second]}/$rscores{$qali[0]};
		    if ($verbose){
			print "best two alignments\n";
			print "" . $qali[0]->[0] ."\nscore=$rscores{$qali[0]}\n";
			print "" . $qali[1]->[0] ."\nscore=$rscores{$qali[1]}\n";
			print "ratio = $ratio\n";
		    }
		    if ($ratio < $uniqthresh){
			print $qali[0]->[0];
			$outUniq += @qali-1;
		    } else {
			$outUniq += @qali; # drop all
		    }
		} else {
		    print "suboptimal alignments are similar\n" if ($verbose);
		    print $qali[0]->[0];
		    $outUniq += @qali-1;
		}
	    } else { # take all best alignments, that share the maximum score
		my $optscore = $rscores{$qali[0]};
		my @bestTnames = ();
		while ($rscores{$qali[0]} == $optscore){
		    print $qali[0]->[0];
		    push @bestTnames, $qali[0]->[1];
		    shift @qali;
		}
		$outBest += @qali;
		if (@bestTnames>1){
		    my %genenames = ();
		    foreach my $Tname (@bestTnames) { $Tname =~ s/\.t\d+//; $genenames{$Tname}=1; }
		    print COMMON $oldqnamestem . "\t" . join(" ", keys %genenames) . "\n" if (%genenames>1 && defined($commongenefile));
		}
	    }
	} else {
	    foreach my $ali (@qali){
		print $ali->[0];
	    }
	}
    }
    @qali = ();
}

# for comparing quality of two alignments
sub scoreAli{
    my $ali = shift;
    return $ali->[6]/100 # percent identity
	+ $ali->[7]/100; # percent coverage
}

# for comparing quality of two mate pair read alignments (i1,j1), (i2,j2)
sub scoreMate{
    my $i = shift;
    my $j = shift;
    my $dist = shift;
    my $score = ($qali[$i]->[6] + $qali[$j]->[6])/100 # percent identity
	+ ($qali[$i]->[7] + $qali[$j]->[7])/100; # percent coverage
    $score -= $dist/$maxintronlen/10 if (!$best); # penalty for distance between mates. Do not use if option 'best' is chosen, otherwise a one base difference may cause a difference
    return $score;
}

#
# checking whether two alignments (or two alignment pairs) are similar
# Purpose: Due to separate handling of spliced and unspliced alignments it can happen
# that very similar alignments are reported, e.g. an unspliced read going approximately up to an intron
# and a spliced read with a few base pairs on one exon.
# These should not be considered ambiguous when --uniq is specified.
sub similar {
    return similar(@_[0],@_[1]) && similar(@_[2],@_[3]) if (@_ == 4);
    my $r1 = shift;
    my $s1 = shift;
    if ($verbose){
	print "\nchecking whether these alignments are approximately the same\n";
	print "" . join (", ", @$r1) . "\nand\n" . join (", ", @$s1) . "\n\n";
    }
    return 0 if ($r1->[5] <= $s1->[4] || $r1->[4] >= $s1->[5]); # here: similar = overlapping target range
    print "they are similar\n" if ($verbose);
    return 1;
}


#
# compactifyBed
# if several steps coincide then summarize them equivalently by one step in order to 
# 1) save memory or
# 2) output a bed file
#
sub compactifyBed {
    my $before=0;
    my $after=0;
    foreach my $chr (sort keys %paircovsteps){
        next if (!@{$paircovsteps{$chr}});
        @{$paircovsteps{$chr}} = sort {$a->[0] <=> $b->[0]} @{$paircovsteps{$chr}};
	$before += scalar(@{$paircovsteps{$chr}});
#	print "before compactify\n";
#	foreach my $step (@{$paircovsteps{$chr}}){
#	    print $step->[0] . "\t" . $step->[1] . "\n";
#	}
	my $i=0;
	while ($i<@{$paircovsteps{$chr}}-1){
	    if ($paircovsteps{$chr}->[$i]->[0] eq $paircovsteps{$chr}->[$i+1]->[0]){
		$paircovsteps{$chr}->[$i]->[1] += $paircovsteps{$chr}->[$i+1]->[1]; # add value from i+1 to i
		splice @{$paircovsteps{$chr}}, $i+1, 1; # remove element i+1
	    } else {
		$i++;
	    }
	}
	$after += scalar(@{$paircovsteps{$chr}});
    }
    print STDERR "\nbefore compactifying: $before  after: $after\n";
}