File: getgene.pl

package info (click to toggle)
murasaki 1.68.6-6
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 1,928 kB
  • ctags: 3,100
  • sloc: cpp: 16,010; perl: 8,365; makefile: 186
file content (118 lines) | stat: -rwxr-xr-x 3,644 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/perl -w

#Copyright (C) 2006-2008 Keio University
#(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)
#
#This file is part of Murasaki.
#
#Murasaki is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#Murasaki is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with Murasaki.  If not, see <http://www.gnu.org/licenses/>.

use File::Basename;
use Getopt::Long qw(:config pass_through);
use Pod::Usage;
#use Data::Dump qw{dump};

use strict;
# Flags filled in by GetOptions below ($opt_prefix is declared but not
# referenced anywhere in this script).
my ($help,$man,$opt_prefix);

# Package globals. $flexible is set from --flexible; $root is used further
# down as the directory holding the helper scripts.
# NOTE(review): $root and $seqhome are presumably populated by the Murasaki
# module loaded below -- confirm against perlmodules/Murasaki.pm.
our ($seqhome,$root,$flexible);
BEGIN {
  # Make the "perlmodules" directory that sits next to this script visible
  # to the "use Murasaki" that follows.
  unshift(@INC,(fileparse($0))[1].'perlmodules');
}
use Murasaki;

my $toRNA;
my ($outprefix,$echo);

# pass_through is configured on import, so unrecognised options are left
# in @ARGV rather than rejected.
GetOptions('help|?' => \$help, man => \$man, flexible => \$flexible,
           'rna' => \$toRNA, 'outprefix=s'=>\$outprefix, echo=>\$echo);
# --man has to be tested before the argument-count check; otherwise
# "getgene.pl --man" with no further arguments only ever printed the short
# usage message and exited with status 1.
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
# Short usage (exit status 1) on --help or when fewer than two positional
# arguments (genome plus at least one gene) remain.
pod2usage(1) if $help or $#ARGV<1;


# Annotation lookup tables. Each maps a key to a list of CDS records
# (hash refs with name, locus, start, stop and strand):
#   %names is keyed by the first column of the .cds file,
#   %locii is keyed by its fifth column.
my (%names,%locii);
my $filename=shift(@ARGV);
die "There's no file $filename" unless -f $filename;
print "Loading annotation...\n";
if(!-f "$filename.cds"){
  print "CDS file not found for $filename. Generating...\n";
  # List-form system() bypasses the shell, so a genome path containing
  # spaces or shell metacharacters reaches getcds.pl intact.
  system("$root/getcds.pl",$filename);
  # Success is judged by whether the .cds file now exists.
  print "Generation of CDS file for $filename failed\n"
    unless -f "$filename.cds";
}

# Three-argument open with a lexical handle: the path can no longer be
# misread as an open mode, and the handle is closed when we are done.
if(open(my $cdsfh,'<',"$filename.cds")){
  print "Loading annotation for $filename...";
 LoadCDS: while(my $line=<$cdsfh>){
    # split ' ' ignores leading whitespace, so an indented line does not
    # shift every column by one.
    my ($name,$start,$stop,$strand,$locus) = split(' ',$line);
    next LoadCDS unless defined $name; # blank line
    my $cd={name => $name,locus=>$locus,start=>$start,stop=>$stop,strand=>$strand};
    push(@{$names{$name}},$cd);
    # Lines without a fifth column are still reachable by name; they just
    # get no locus entry (instead of a bogus one under the undefined key).
    push(@{$locii{$locus}},$cd) if defined $locus;
  }
  close($cdsfh);
  print "Done.\n";
}else{
  print "Note: couldn't load any annotation...\n";
}

print "Loading $filename...\n";
# Capture everything geneparse.pl prints for this file as one string.
# The list-form pipe open runs the helper without a shell, so the file name
# is passed through verbatim, and a failure to start it is reported instead
# of silently yielding an empty genome.
my $genome=do {
  open(my $parsefh,'-|',"$root/geneparse.pl",$filename)
    or die "Couldn't run $root/geneparse.pl on $filename: $!\n";
  local $/; # slurp the whole output in one read
  my $sequence=<$parsefh>;
  close($parsefh)
    or warn "$root/geneparse.pl did not finish cleanly for $filename (status $?)\n";
  defined($sequence) ? $sequence : '';
};
# Handle each remaining argument in turn. An argument is looked up as a
# name and as a locus in the annotation; if it also looks like a pair of
# numbers (e.g. "100..200") it is additionally treated as a literal region.
while(@ARGV){
  my $gene=shift(@ARGV);
  my $region;
  if($gene=~m/^(-?\d+)\D{1,3}?(-?\d+)/){ #fake gene. i want a region!
    # Copy the captures immediately so nothing below depends on $1/$2.
    my ($from,$to)=($1,$2);
    my ($start,$stop)=sort {$a<=>$b} map {abs($_)} ($from,$to);
    # A minus sign on either coordinate selects the reverse strand.
    $region={name => $gene,start=>$start,stop=>$stop,
             strand=>($from<0 || $to<0) ? -1:1};
  }
  # The region is kept local instead of being stored in %names, so naming
  # the same region twice on the command line does not emit it extra times.
  my @cds;
  push(@cds,@{$names{$gene}}) if ref($names{$gene});
  push(@cds,$region) if $region;
  push(@cds,@{$locii{$gene}}) if ref($locii{$gene});
  print "$gene not found in annotation\n" unless scalar(@cds);

  # Output goes to "<outprefix>.<gene>.fa" only when --outprefix was given
  # and is not "-"; otherwise "-" is handed to faformat.pl as the target.
  my $outfile=($outprefix and $outprefix ne "-") ? "$outprefix.$gene.fa":"-";

  foreach my $cds (@cds){
    # Region records have no locus; substitute an empty string so the title
    # is unchanged but no "uninitialized value" warning is raised.
    my $title="$filename: ".join(" ",map {defined($_) ? $_:''}
                                 @{$cds}{qw{name locus start stop strand}});
    my $outfh;
    unless($echo){
      # List-form pipe open: the title is passed as a single argument
      # without going through the shell, so quotes or metacharacters in
      # the file or gene name cannot break (or inject into) the command.
      open($outfh,'|-',"$root/faformat.pl","--title=$title",'-',$outfile)
        or die "Couldn't start $root/faformat.pl: $!\n";
      print STDERR "Writing $gene data to $outfile\n" unless $outfile eq "-";
    }
    # Coordinates are 1-based and inclusive.
    my $dna=uc substr($genome,$cds->{start}-1,$cds->{stop}-$cds->{start}+1);
    if($cds->{strand}<0){
      # Reverse complement for the minus strand.
      $dna=~tr/ACGT/TGCA/;
      $dna=reverse $dna;
    }
    if($toRNA){
      # Maps A->U, C->G, G->C, T->A: the sequence is complemented (not
      # reversed) with U in place of T.
      $dna=~tr/ACGT/UGCA/;
#      $dna=reverse $dna;
    }
    if($echo){
      print "$dna\n";
    }else{
      print $outfh "$dna\n";
      # Closing the pipe waits for faformat.pl and surfaces its failures.
      close($outfh)
        or warn "$root/faformat.pl did not finish cleanly for $gene (status $?)\n";
    }
  }
}

__END__

=head1 NAME

getgene.pl - grab the dna for a gene using annotation

=head1 SYNOPSIS

getgene.pl <genome> <gene1> [<gene2> ...]

=head1 OPTIONS

=over

=item --rna

specifies recoding to rna

=item --outprefix=s

directs output to the file s.<gene>.fa. If --outprefix is omitted or set to -, output is sent to stdout.

=item --echo

sends output directly to stdout without formatting

=back