1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
#!/usr/bin/perl
# This script will report the names of the tissues which were seen
# in a BLAST/FASTA report against an EST (or cDNA possibly) library.
# this script assumes you have a directory which you have downloaded
# gbestXX.seq.gz from ncbi genbank release. This will run faster if
# they are uncompressed already, but if will uncompress the files
# on demand. Be sure that there is sufficient space and the uid
# has write permission on the files and in that directory if you
# plan to run this script on compressed files.
# Alternatively you can use this with the -r option and it will
# use the remote sequence databases either genbank or embl to
# retrieve the specific EST (so one can attempt to guess the tissue type)
##
# cmd line options are
# -i/--index=indexname
# -d/--dir=dir where gbest data files are located
# -b/--blast=filename blast filename which compared against an EST db
# -f/--format=(blast|blastxml|fasta) - type of search output either
# from BLAST or FASTA suites
# -c/--cache=filename cache for accession number to tissue
# -p/pvalue=pvalue pvalue to limit search to
# -r/--remote=[GenBank|EMBL] use remote db for searching
use strict;
use DB_File;
use Bio::SeqIO;
use Bio::SearchIO;
use Bio::DB::EMBL;
use Bio::DB::GenBank;
use Bio::Index::GenBank;
use Getopt::Long;
my $GZIP = '/usr/bin/gzip';
my $GUNZIP = '/usr/bin/gunzip';
my $dir = '/home/data/libs/gbest'; # local dir for gbest files
my $index = 'dbest_tissue.idx'; # local index filename
my $cache; # filename to create cache of accession->tissue
my $VERBOSE = 0;# verbosity option
my $blastfile; # blastfile to parse
my $pvalue; # Max P-Value allowed when parsing blastfile
my $remote; # flag for remote database
my $db; # generic database handle
my %accessions; # cache results
my $format = 'blast';
&GetOptions( 'd|dir:s' => \$dir,
'i|index:s' => \$index,
'v|verbose' => \$VERBOSE,
'b|blast:s' => \$blastfile,
'f|format:s' => \$format,
'c|cache:s' => \$cache,
'p|pvalue:s' => \$pvalue,
'r|remote:s'=> \$remote);
if( $cache && -w $cache ) {
print "creating cache file\n";
tie %accessions, "DB_File", $cache, O_RDWR|O_CREAT,0660, $DB_HASH;
}
if( ! $remote ) {
opendir(GBEST, $dir) or die("cannot open $dir");
my $indexfile = new Bio::Index::GenBank(-filename => $index,
-write_flag => 'WRITE');
foreach my $file ( readdir(GBEST) ) {
# print "file is $file\n";
next unless ( $file =~ /(gbest\d+\.seq)(.gz)?$/ );
if( $2 ) {
`$GUNZIP $dir/$file`;
}
$indexfile->make_index("$dir/$1");
}
$indexfile = undef;
$db = new Bio::Index::GenBank(-filename => $index);
} else {
if( $remote =~ /(ncbi)|(genbank)/i ) {
$db = new Bio::DB::GenBank;
} elsif( $remote =~ /embl/i ) {
$db = new Bio::DB::EMBL;
} else {
die("remote must be either 'NCBI' or 'EMBL'");
}
# would need to add code to set proxy info for those who need it
}
if(! $blastfile || ! -r $blastfile ) {
die("Must specify a valid blastfile");
}
my $parser = new Bio::SearchIO(-format => $format,
-file => $blastfile);
my %tissues_seen = ();
my ($result,$hit,$hsp);
while( my $result = $parser->next_result ) {
HIT: while( my $hit = $result->next_hit ) {
if( defined $pvalue ) {
while( my $hsp = $hit->next_hsp ) {
if( $hsp->evalue > $pvalue ) {
print "skipping ", $hit->name, " because of low evalue \n";
# skip this Subject if it contains a pvalue of > $pvalue
next HIT;
}
}
}
my ($id) = split(/\s+/, $hit->name);
# get the last value
my @ids = split(/\|/, $id);
$id = pop @ids;
my ($tissuetype) = get_tissue($id);
if( defined $tissuetype ) {
push @{$tissues_seen{$tissuetype}}, $hit->name;
} else {
print STDERR "could not find tissue for $id\n" if( $VERBOSE);
}
}
print "tissues seen for: ", $result->query_name, "\n";
foreach my $tissue ( sort keys %tissues_seen ) {
print "* $tissue\n-----------\n\t",
join("\n\t",@{$tissues_seen{$tissue}}), "\n\n";
}
}
# cleanup -- avoid segfault here
$db = undef;
# subroutines
sub get_tissue {
my ($id) = @_;
my $tissue;
if( $tissue = $accessions{$id} ) {
return $tissue;
}
my $seq = $db->get_Seq_by_acc($id);
return unless( $seq );
foreach my $feature ( $seq->all_SeqFeatures ) {
if( $feature->primary_tag eq 'source' ) {
foreach my $tag ( sort { $b cmp $a }
$feature->all_tags ) {
if( $tag =~ /tissue/i ||
( ! $tissue &&
$tag =~ /clone_lib/i ) ){
($tissue) = $feature->each_tag_value($tag);
$accessions{$seq->display_id} = $tissue;
return $tissue;
}
}
}
}
return;
}
|