1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
#!/usr/bin/perl
# gffGetmRNA.pl
# Creates mRNA fasta sequence files from a GFF with gene annotation
#
# Mario Stanke, January, 2010
#
use strict;
use Getopt::Long;
my $usage = "cat genes.gff | gffGetmRNA.pl --genome=s --mrna=s\n";
$usage .= " Makes a fasta file with mRMA sequences\n";
$usage .= " --genome=s Input a fasta file with the genomic sequences.\n";
$usage .= " --mrna=s Output fasta file with mRNA sequences.\n";
my ($seqfile, $mrnafile);
my %sequence = ();
my %ignorechr = ();
GetOptions('genome=s'=>\$seqfile,
'mrnae=s'=>\$mrnafile);
if (!defined($seqfile) || !defined($mrnafile)) {
print $usage;
exit;
}
my @gfflines; # complete input file
# hash of transcripts
# keys: chromosome names
# values: hash reference
# keys: transcript ids
# values: gff array reference
my %txs = ();
readGenome();
parseAndStoreGFF();
sortAndCheck();
open MRNA, ">$mrnafile" or die ("Could not open mrnafile for writing.");
printmrnas();
close MRNA;
if (keys %ignorechr > 0){
print "Annotation from these chromosomes was ignored because they were not contained in $seqfile.:\n"
. join (", ", sort keys %ignorechr) . "\n";
}
# Read in the sequence file in one chunk.
# And sort it in the sequence hash.
# Yes, this requires a lot of memory for large genomes.
sub readGenome {
my $seqname;
if ($seqfile){
open (SEQ, "<$seqfile") or die ("Could not open sequence file $seqfile\n");
$/=">";
while (<SEQ>){
s/>$//;
next unless /\S+/;
/(.*)\n/;
$seqname = $1;
my $sequencepart = $'; #'
$seqname =~ s/\s.*//; # seqname only up to first white space
$sequencepart =~ s/\s//g;
$sequence{$seqname} = $sequencepart;
}
print "Read in " . (scalar keys %sequence) . " sequence(s) from $seqfile.\n";
$/="\n";
}
}
#
# Read in the GFF file and store the exons by their transcript id
#
sub parseAndStoreGFF{
my ($txid, $type, $chr);
while(<STDIN>){
my @f = split /\t/;
next if (@f<8);
if ($f[8] =~ /(transcript_id|Transcript)."([^"]+)"/){ #"
$txid = $2;
} elsif ($f[8] =~ /gene_id."([^"]+)"/){ #"
$txid = $1;
} elsif ($f[8] =~ /Parent=Transcript:([^;]+)/){
$txid = $1;
} elsif ($f[8] =~ m/Parent=([^;]+)/){
$txid = $1;
} else {
$txid = $f[8];
}
$txid =~ s/\n//;
my ($chr,$type) = ($f[0],$f[2]);
if ($type eq "exon"){
$txs{$chr}{$txid} = [] if (!exists($txs{$chr}{$txid}));
push @{$txs{$chr}{$txid}}, \@f;
}
}
}
sub sortAndCheck {
foreach my $chr (sort keys %txs){
foreach my $txid (sort keys %{$txs{$chr}}){
@{$txs{$chr}{$txid}} = sort {$a->[3] <=> $b->[3]} @{$txs{$chr}{$txid}}; # sort by start position
my $strand;
my $end;
foreach my $exon (@{$txs{$chr}{$txid}}){
if (!defined($strand)){
$strand = $exon->[6];
} else {
die ("Strand not consistent in transcript $txid.") if ($strand ne $exon->[6]);
}
if (defined($end) && $end >= $exon->[3]){
die ("Exons overlapping in $txid.");
}
}
}
}
}
sub printmrnas() {
foreach my $chr (sort keys %txs){
if ($sequence{$chr}){
foreach my $txid (sort keys %{$txs{$chr}}){
my $seq = "";
foreach my $exon (@{$txs{$chr}{$txid}}){
die ("Coordinate of $txid out of sequence range. $exon->[4] >= " . length($sequence{$chr}) . "\n")
if ($exon->[4] >= length($sequence{$chr}));
my $seqpart = lc(substr($sequence{$chr}, $exon->[3]-1, $exon->[4] - $exon->[3] + 1));
$seq .= $seqpart;
}
$seq = rc($seq) if ($txs{$chr}{$txid}->[0]->[6] eq "-");
print MRNA ">$txid\n" . getFa($seq, 100);
}
} else {
$ignorechr{$chr}++;
}
}
}
# reverse complement
sub rc{
my $s = shift;
$s = reverse $s;
$s =~ tr/acgtACGT/tgcaTGCA/;
return $s;
}
sub getFa{
my $seq = shift;
my $cols = 100;
$cols = shift if (@_);
my $start = 0;
my $ret = "";
while (length($seq)-$start >= $cols) {
my $shortline = substr($seq, $start, $cols);
$ret .= "$shortline\n";
$start += $cols;
}
$ret .= substr($seq, $start, $cols) . "\n";
}
|