1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
#!/usr/bin/env perl
# generegion_soi.pl
# --------------------------------
# sshu@fruitfly.org
# --------------------------------
use lib $ENV{GMOD_MODULE_PATH};
use FindBin qw($RealBin);
use lib (($ENV{SOI_ROOT}) ||
(($INC[0]=~/^\./)?"$RealBin/$INC[0]":"$RealBin/.."));
use Carp;
use strict;
use FileHandle;
use SOI::Adapter;
use SOI::Visitor;
use Getopt::Long;
my $argh = {};
GetOptions($argh,
"dbname|d=s",
"outfile|o=s",
"idfile|file=s",
"template=s",
"args|arg=s%",
"featuretype|type=s",
"extend|e=s",
"so_cv_name|so_cv|so=s",
"help",
);
if ($argh->{help}) {
usage();
exit 0;
}
# set up the database adapter
if (!$argh->{dbname}) {
print STDERR "You MUST specify a database name; for example -d gadfly3\n\n";
usage();
exit 1;
}
my $ad;
eval {
$ad =
SOI::Adapter->new($argh->{dbname});
};
if ($@) {
print STDERR "There was an error connecting to $argh->dbname}\n\n";
print STDERR $@;
exit 1;
}
if ($argh->{so_cv_name}) {
$ad->SO_cv_name($argh->{so_cv_name});
}
my @cgs = @ARGV;
if ($argh->{template}) {
my $features = $ad->get_features_by_template($argh->{template}, $argh->{args});
map{push @cgs, $_->uniquename}@{$features || []};
}
if ($argh->{idfile}) {
open(R, "<$argh->{idfile}") or die "can not open file: $argh->{idfile}";
while (<R>) {
chomp;
next unless ($_);
next if (/^\#/);
my @a = split/\s+/;
push @cgs, @a;
}
}
my $out = $argh->{outfile} || ">-";
my $fh = FileHandle->new(">$out");
my $type = $argh->{featuretype} || 'gene';
my $method = "get_features_by_$type";
foreach my $cg (@cgs) {
my ($range,$genes) =
$ad->$method($cg, {extend=>0});
if (!@{$genes || []}) {
print "NO SUCH FEATURE: $cg\n";
next;
}
#test getting CDS seq
my ($gene) = grep{$_->uniquename eq $cg}@$genes;
my @mRNA = grep{$_->type eq 'mRNA'}@{$gene->nodes || []};
map{SOI::Visitor->make_CDS_feature($_)->to_fasta($fh)}@mRNA;
next;
#get around arm residues stored problem for speed
my $GBs = $ad->get_f({range=>$range}, {feature_types=>'golden_path_region',noauxillaries=>1});
my $arm = SOI::Feature->new({type=>'chromosome_arm',name=>$range->{src}});
$arm->nodes($GBs);
my ($fmin, $fmax) = ($range->{fmin},$range->{fmax});
my $new_f = $arm->stitch_child_segments($fmin,$fmax, {name=>"$cg-region"});
$arm->hash->{residues} = "";
$arm->nodes([$new_f,@{$genes}]);
}
print STDERR "Done!\n";
$ad->close_handle;
sub usage {
print <<EOM
dump_generegion_soi.pl [-d|dbname <chado DATABASE NAME>] [-extend|e NO OF BASES] [-template <FILE>] [-outfile|o filename to put xml in] [-format <FORMAT>] [-noresults] cg-list
This script will export per-CG xml files. you can specify a list of
CGs on the command line, or you can give it a file of CG ids (just CG
ids, not gene symbols)
If outfile is specified, only a single CG can be exported to the file.
Default format is xml (soi). you can also get chaos xml:
dump_generegion_soi.pl -d chado -format chaos CG6699
you can als get GAME xml:
dump_generegion_soi.pl -d chado -format game CG6699
To get the gene region without evidence:
dump_generegion_soi.pl -d chado -format chaos -noresults CG6699
to use template, here is one example (get genes whose protein has 2 mature_peptides):
dump_generegion_soi.pl -d chado -format chaos -template ../templates/genes_by_child_count.soi -arg ptype=protein -arg ctype=mature_peptide -arg count=2 -arg operator='='
EOM
}
|