File: filterInFrameStopCodons.pl

package info (click to toggle)
augustus 3.5.0%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 777,052 kB
  • sloc: cpp: 80,066; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,141; sh: 171; javascript: 32
file content (41 lines) | stat: -rwxr-xr-x 1,122 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#!/usr/bin/perl

# Katharina J. Hoff
# March 5th 2012
#
# Find all predicted genes whose amino acid sequences do not contain in-frame stop codons
# (recognized by the character 'X' in the sequence) and print their gene identifiers (fasta headers).
#
# Input format: multi-fasta file whose fasta headers contain only the gene identifier.
# Output format: list of gene identifiers, one per line.
#
# Note: consider running AUGUSTUS with the option --noInFrameStop=true if you want to avoid stop codons in the first place.

use strict;
use warnings;

# Usage text, shown when the script is not called with exactly one argument.
my $usage = "filterInFrameStopCodons.pl protein.fa > no-stop.lst\nConsider running AUGUSTUS with the option --noInFrameStop=true if you want to avoid stop codons in the first place.\n";

# The regular output is meant to be redirected into a list file, so the usage
# text goes to STDERR and a wrong call is signalled with a non-zero exit status.
if (@ARGV != 1) {
    print STDERR $usage;
    exit 1;
}

my $protein = $ARGV[0];
my %hasStop;      # gene identifier => 1 if an 'X' was seen in its sequence, else 0
my @idOrder;      # gene identifiers in order of first appearance in the input
my $currentID;    # identifier of the fasta record currently being read

open(my $protFH, "<", $protein) or die "Could not open protein file $protein!\n";
while (my $line = <$protFH>) {
    # Strip LF as well as CRLF line endings so identifiers carry no stray "\r".
    $line =~ s/[\r\n]+\z//;

    if ($line =~ m/^>/) {
        # Header line: everything after the leading '>' is the identifier.
        $line =~ s/>//;
        $currentID = $line;

        # Register each identifier once. A repeated header must not reset a
        # stop flag that an earlier record with the same identifier has set.
        if (!exists $hasStop{$currentID}) {
            push @idOrder, $currentID;
            $hasStop{$currentID} = 0;
        }
    }
    elsif (defined $currentID && $line =~ m/X/) {
        # Sequence line containing 'X': flag the current record. Sequence
        # lines that precede the first header belong to no record and are
        # ignored.
        $hasStop{$currentID} = 1;
    }
}
close($protFH) or die "Could not close protein file $protein!\n";

# Print the identifiers without a flagged sequence, in input order, so the
# output is reproducible between runs.
for my $id (@idOrder) {
    print "$id\n" if $hasStop{$id} == 0;
}