File: dump_fasta.pl

package info (click to toggle)
libchado-perl 1.23-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 23,976 kB
  • ctags: 10,378
  • sloc: xml: 192,540; sql: 165,945; perl: 28,339; sh: 101; python: 73; makefile: 46
file content (146 lines) | stat: -rwxr-xr-x 3,646 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env perl 

# generegion_soi.pl

# --------------------------------
# sshu@fruitfly.org
# --------------------------------

use lib $ENV{GMOD_MODULE_PATH};
use FindBin qw($RealBin);
use lib (($ENV{SOI_ROOT}) ||
                     (($INC[0]=~/^\./)?"$RealBin/$INC[0]":"$RealBin/.."));

use Carp;
use strict;

use FileHandle;
use SOI::Adapter;
use SOI::Visitor;

use Getopt::Long;

my $argh = {};
GetOptions($argh,
           "dbname|d=s",
           "outfile|o=s",
           "idfile|file=s",
           "template=s",
           "args|arg=s%",
           "featuretype|type=s",
           "extend|e=s",
           "so_cv_name|so_cv|so=s",
           "help",
          );

if ($argh->{help}) {
    usage();
    exit 0;
}

# set up the database adapter
if (!$argh->{dbname}) {
    print STDERR "You MUST specify a database name; for example -d gadfly3\n\n";
    usage();
    exit 1;
}
my $ad;
eval {
    $ad  =
      SOI::Adapter->new($argh->{dbname});
};
if ($@) {
    print STDERR "There was an error connecting to $argh->dbname}\n\n";
    print STDERR $@;
    exit 1;
}

if ($argh->{so_cv_name}) {
    $ad->SO_cv_name($argh->{so_cv_name});
}

my @cgs = @ARGV;
if ($argh->{template}) {
    my $features = $ad->get_features_by_template($argh->{template}, $argh->{args});
    map{push @cgs, $_->uniquename}@{$features || []};
}
if ($argh->{idfile}) {
    open(R, "<$argh->{idfile}") or die "can not open file: $argh->{idfile}";
    while (<R>) {
        chomp;
        next unless ($_);
        next if (/^\#/);
        my @a = split/\s+/;
        push @cgs, @a;
    }
}
my $out = $argh->{outfile} || ">-";
my $fh = FileHandle->new(">$out");
my $type = $argh->{featuretype} || 'gene';
my $method = "get_features_by_$type";
foreach my $cg (@cgs) {
    my ($range,$genes) =
      $ad->$method($cg, {extend=>0});
    if (!@{$genes || []}) {
        print "NO SUCH FEATURE: $cg\n";
        next;
    }

    #test getting CDS seq
    my ($gene) = grep{$_->uniquename eq $cg}@$genes;
    my @mRNA = grep{$_->type eq 'mRNA'}@{$gene->nodes || []};
    map{SOI::Visitor->make_CDS_feature($_)->to_fasta($fh)}@mRNA;

    next;

    #get around arm residues stored problem for speed
    my $GBs = $ad->get_f({range=>$range}, {feature_types=>'golden_path_region',noauxillaries=>1});
    my $arm = SOI::Feature->new({type=>'chromosome_arm',name=>$range->{src}});
    $arm->nodes($GBs);
    my ($fmin, $fmax) = ($range->{fmin},$range->{fmax});
    my $new_f = $arm->stitch_child_segments($fmin,$fmax, {name=>"$cg-region"});
    $arm->hash->{residues} = "";

    $arm->nodes([$new_f,@{$genes}]);

}


print STDERR "Done!\n";
$ad->close_handle;

sub usage {
    print <<EOM
dump_generegion_soi.pl [-d|dbname <chado DATABASE NAME>] [-extend|e NO OF BASES] [-template <FILE>] [-outfile|o filename to put xml in] [-format <FORMAT>] [-noresults] cg-list

This script will export per-CG xml files. you can specify a list of
CGs on the command line, or you can give it a file of CG ids (just CG
ids, not gene symbols)

If outfile is specified, only a single CG can be exported to the file.

Default format is xml (soi). you can also get chaos xml:

dump_generegion_soi.pl -d chado -format chaos CG6699

you can als get GAME xml:

dump_generegion_soi.pl -d chado -format game CG6699

To get the gene region without evidence:

dump_generegion_soi.pl -d chado -format chaos -noresults CG6699

to use template, here is one example (get genes whose protein has 2 mature_peptides):

dump_generegion_soi.pl -d chado -format chaos -template ../templates/genes_by_child_count.soi -arg ptype=protein -arg ctype=mature_peptide -arg count=2 -arg operator='='


EOM
}