File: bp_download_query_genbank

package info (click to toggle)
libbio-db-ncbihelper-perl 1.7.8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 332 kB
  • sloc: perl: 1,319; makefile: 5
file content (129 lines) | stat: -rw-r--r-- 3,316 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/perl


=head1 NAME

bp_download_query_genbank - script to query Genbank and retrieve records

=head1 USAGE

 bp_download_query_genbank --query "Neurospora[ORGN]" --db nucest -o Ncrassa_ESTs.fa --format fasta

 bp_download_query_genbank --queryfile 'filewithquery' --db nucest -o Ncrassa_ESTs.fa --format fasta 

=head2 Other options

 Provide ONE of:

  -q --query query string OR
  --queryfile file containing the query OR
  --gi --gis --gifile file with list of GIs to download

 Database type:

 -d --db database (nucleotide [default], nucest, protein)

 -o --out --outfile output file (results are displayed on screen otherwise)
 -f --format sequence file output format (fasta by default)
 -v --verbose debugging output

=head2 Query options

 --maxids maximum number of IDs to retrieve in a set (100 at a time by default)
 --reldate restrict to records dated within the last N days
 --maxdate maximum date for a record
 --mindate minimum date for a record
 --datetype edat or mdat (entered or modified)

=head1 AUTHOR Jason Stajich

Jason Stajich, jason-AT-bioperl.org

=cut

use strict;
use warnings;
use Bio::DB::GenBank;
use Bio::DB::GenPept;
use Bio::DB::Query::GenBank;
use Bio::SeqIO;
use Getopt::Long;

# Command-line settings. %options is passed straight to
# Bio::DB::Query::GenBank->new further down, hence the '-' prefixed keys.
my ($queryfile,$outfile,$format,$debug,%options);

$format = 'fasta';

$options{'-maxids'} = '100';
$options{'-db'} = 'nucleotide'; # can be nucleotide, nucest, protein
my $gifile;

# GetOptions returns false when it meets an unknown or malformed option
# (it has already printed a warning by then); stop instead of running a
# query with settings the user did not intend.
GetOptions(
	'h|help' => sub {
		exec('perldoc', $0);
		# exec only returns if perldoc could not be started, so
		# signal failure rather than exiting with success.
		exit(1);
	},
	'v|verbose'       => \$debug,
	'f|format:s'      => \$format,
	'queryfile:s'     => \$queryfile,
	'o|out|outfile:s' => \$outfile,
	'gi|gifile|gis:s' => \$gifile,
	# DB::Query options
	'd|db:s'     => \$options{'-db'},
	'mindate:s'  => \$options{'-mindate'},
	'maxdate:s'  => \$options{'-maxdate'},
	'reldate:s'  => \$options{'-reldate'},
	'datetype:s' => \$options{'-datetype'}, # edat or mdat
	'maxids:i'   => \$options{'-maxids'},
	'q|query:s'  => \$options{'-query'},
) or die "Invalid command line options; run '$0 --help' for usage.\n";

# Sequence writer in the requested format. The -file argument is only
# supplied when an output file was given; without it the writer goes to
# STDOUT.
my @seqio_args = ( -format => $format );
push @seqio_args, ( -file => ">$outfile" ) if $outfile;
my $out = Bio::SeqIO->new(@seqio_args);

# Pick the retrieval class from the --db setting: GenPept for 'protein',
# GenBank for every other database name.
my $db_class = $options{'-db'} eq 'protein'
	? 'Bio::DB::GenPept'
	: 'Bio::DB::GenBank';
my $dbh = $db_class->new(-verbose => $debug);
# Build the query from whichever input was supplied -- a file of IDs, a
# --query string, or a --queryfile, checked in that order -- then fetch the
# matching records through $dbh and write each one to $out.
my $query;
if( $gifile ) {
	# --maxids takes an optional integer, so it can arrive as 0 or a
	# negative number. splice() would then remove nothing from @ids and
	# the batching loop below would never terminate.
	my $batch_size = $options{'-maxids'};
	die "--maxids must be a positive integer\n"
		unless defined $batch_size && $batch_size > 0;

	my @ids;
	open my $fh, '<', $gifile or die "Could not read file '$gifile': $!\n";
	while( my $line = <$fh> ) {
		# IDs may be separated by any whitespace, several per line
		push @ids, split ' ', $line;
	}
	close $fh;

	# Request the IDs in batches of at most $batch_size
	while( @ids ) {
		my @mini_ids = splice(@ids, 0, $batch_size);
		$query = Bio::DB::Query::GenBank->new(%options,
						      -verbose => $debug,
						      -ids     => \@mini_ids,
						     );
		write_query_results($dbh, $query, $out);
	}
	exit;
} elsif( $options{'-query'} ) {
	$query = Bio::DB::Query::GenBank->new(%options,-verbose => $debug);
} elsif( $queryfile ) {
	open my $fh, '<', $queryfile or die "Could not read file '$queryfile': $!\n";
	chomp( my @query_lines = <$fh> );
	close $fh;

	# Join with a space so that terms on consecutive lines do not run
	# together into a single token.
	$options{'-query'} = join ' ', @query_lines;
	die "Query file '$queryfile' does not contain a query\n"
		unless $options{'-query'} =~ /\S/;

	$query = Bio::DB::Query::GenBank->new(%options,-verbose => $debug);
} else {
	die("no query string or gifile\n");
}
write_query_results($dbh, $query, $out);

# Fetch every record matching $db_query through $db and write each one
# to $seq_out.
sub write_query_results {
	my ($db, $db_query, $seq_out) = @_;
	my $stream = $db->get_Stream_by_query($db_query);
	while( my $seq = $stream->next_seq ) {
		$seq_out->write_seq($seq);
	}
	return;
}