File: uniquePeptides.pl

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (75 lines) | stat: -rwxr-xr-x 1,965 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/perl

# Creates a fasta file with unique peptides from a fasta input file
# where identical peptide match multiple positions. The first found 
# peptide occurrence is used for counting multiplicity.
# Input format fasta header examples:
# >scaffold4083_450 [15866 - 17011]  (65.81)
# >au14.g511.t1 (46.17)
# ... there is a score in the end of the header in round brackets!
# Output format fasta header:
# >peptideX mult=x
# Do not use for long protein sequences, only for short peptides!!!
# prefix

my $usage = "uniquePeptides.pl in.fa prefix > out.fa\n\nRead script head comments for futher documentation!\n\n";

if(@ARGV!=2){print STDERR $usage; exit -1;}

my $in = $ARGV[0];
my $prefix = $ARGV[1];

# two hashes for the same peptide (hash of hashes would be more elegant, but I am too lazy for that), key is always the peptide sequence, trimmed of "-" characters that sometimes appear to occur in Harald's peptide files.
my %locHash = ();
my %multHash = ();
my $maxLen = 100;

my $header;

open(IN, "<", $in) or die("Could not open file $in!\n");

while ( <IN> ) { 
	chomp;
	if($_=~m/^>/){
		$_=~s/>//;
		$_=~s/\(\d+\.\d+\)//; # delete score
		$header = $_;
	}else{
		if(length($_) > $maxLen){
			print STDERR "The peptide $_ is longer $maxLen. Aborting.\n";
			exit -1;
		}
		while($_=~m/-/){ # delete weird dashes in sequences
			$_ = s/-//;
		}
		if(not(exists($locHash{$_}))){
			$locHash{$_} = "$header";
			$multHash{$_} = 1;			
		}elsif($locHash{$_} eq $header){
			$multHash{$_} = $multHash{$_} + 1;
		}
		
		
	}
}

close IN or die("Could not close file $in!\n");

# print unique fasta file
my $c = 1;
while ( ($k,$v) = each %multHash ) {
	print ">$prefix$c mult=$v\n$k\n";
	$c++;
}


# print mapping file
#open(OUT2, ">", $mapping) or die("Could not close file $mapping!\n");
#for $k ( keys %mapHash ) {
#	print OUT2 "$k:\n";
#	foreach(@{$mapHash{$k}}){
#		print OUT2 "$_\n";
#	}
#}
#close(OUT2) or die("Could not close file $mapping!\n");