1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#!/usr/bin/perl
# create a gff-like file that marks regions in a fasta file
# that contain "N" or "n".
# Katharina Hoff, 18.7.2011
my $usage = "maskNregions.pl genome.fa > out.file\n";
if (@ARGV != 1) {
print $usage;
exit;
}
my $genome = $ARGV[0];
my $fastaContent = "";
my @seqLetters = ();
sub procSeq;
open (FASTA, $genome) or die ("\n\ncould not open file $genome!\n");
LINE: while ($line = <FASTA>){
next LINE if $line =~ m/^#/; #discard comments
if ($line =~ /^>/){
procSeq($fastaContent);
$fastaContent = "";
}else{
$line =~ s/[\x0A\x0D]+//g;
$line =~ s/(\s+)(\n)(\r)//g;
$fastaContent = $fastaContent.$line;
}
}
procSeq($fastaContent) if ($fastaContent ne "");
close(FASTA) or die "Could not close file $genome!\n";
sub procSeq{
my $seq = shift; # nucleotide sequence
if(length($seq)>0){
@seqLetters = split(//, $seq);
$n = scalar(@seqLetters);
$pos = 1;
foreach(@seqLetters){
if($_ eq "n" or $_ eq "N"){
if($pos == 1 || ($seqLetters[$pos-2] ne "n" and $seqLetters[$pos-2] ne "N")){
#print "$pos on $fastaHeader is the beginning of N-region\n";
print $fastaHeader."\t"."maskN"."\t"."N-region"."\t".$pos."\t";
}
if($pos == $n || ($seqLetters[$pos] ne "n" and $seqLetters[$pos] ne "N")){
#print "$pos on $fastaHeader is the end of N-region\n";
print $pos."\t"."."."\t"."."."\t"."."."\t"."N-region\n";
}
}
$pos = $pos+1;
}
}
$line =~ s/[\x0A\x0D]+//g; #removing those ugly whitelines
$line =~ s/(\n)(\r)//g; #remove them alllll!
$line =~ m/(^>\w+)/i; #matches a word starting with > (Fasta)
$fastaHeader = substr($1, 1);
}
|