File: est_tissue_query.pl

package info (click to toggle)
bioperl 1.7.8-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 35,964 kB
  • sloc: perl: 94,019; xml: 14,811; makefile: 15
file content (162 lines) | stat: -rw-r--r-- 4,838 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/perl

# This script will report the names of the tissues which were seen
# in a BLAST/FASTA report against an EST (or cDNA possibly) library.

# this script assumes you have a directory which you have downloaded
# gbestXX.seq.gz from ncbi genbank release.  This will run faster if 
# they are uncompressed already, but if will uncompress the files 
# on demand.  Be sure that there is sufficient space and the uid
# has write permission on the files and in that directory if you
# plan to run this script on compressed files.

# Alternatively you can use this with the -r option and it will 
# use the remote sequence databases either genbank or embl to 
# retrieve the specific EST (so one can attempt to guess the tissue type)
##
# cmd line options are
# -i/--index=indexname
# -d/--dir=dir where gbest data files are located
# -b/--blast=filename blast filename which compared against an EST db
# -f/--format=(blast|blastxml|fasta) - type of search output either 
#                                      from BLAST or FASTA suites 
# -c/--cache=filename cache for accession number to tissue
# -p/pvalue=pvalue pvalue to limit search to
# -r/--remote=[GenBank|EMBL] use remote db for searching

use strict;
use DB_File;
use Bio::SeqIO;
use Bio::SearchIO;
use Bio::DB::EMBL;
use Bio::DB::GenBank;
use Bio::Index::GenBank;

use Getopt::Long;
my $GZIP = '/usr/bin/gzip';
my $GUNZIP = '/usr/bin/gunzip';

my $dir = '/home/data/libs/gbest'; # local dir for gbest files
my $index = 'dbest_tissue.idx';    # local index filename
my $cache;      # filename to create cache of accession->tissue
my $VERBOSE = 0;# verbosity option
my $blastfile;  # blastfile to parse
my $pvalue;     # Max P-Value allowed when parsing blastfile
my $remote;     # flag for remote database 
my $db;         # generic database handle
my %accessions; # cache results
my $format = 'blast';

&GetOptions( 'd|dir:s'   => \$dir,
	     'i|index:s' => \$index,
	     'v|verbose' => \$VERBOSE,
	     'b|blast:s' => \$blastfile,
	     'f|format:s' => \$format,
	     'c|cache:s' => \$cache,
	     'p|pvalue:s'       => \$pvalue,
	     'r|remote:s'=> \$remote);

if( $cache && -w $cache ) {
    print "creating cache file\n";
    tie %accessions, "DB_File", $cache,  O_RDWR|O_CREAT,0660, $DB_HASH;
}

if( ! $remote ) {
    opendir(GBEST, $dir) or die("cannot open $dir");
    
    my $indexfile = new Bio::Index::GenBank(-filename   => $index,
					    -write_flag => 'WRITE');
    foreach my $file  ( readdir(GBEST) ) {
#	print "file is $file\n";
	    next unless ( $file =~ /(gbest\d+\.seq)(.gz)?$/ );
	    if( $2 ) {		
		`$GUNZIP $dir/$file`;
	    }
	    $indexfile->make_index("$dir/$1");
    }

    $indexfile = undef;
    $db = new Bio::Index::GenBank(-filename => $index);
    
} else { 
    if( $remote =~ /(ncbi)|(genbank)/i ) {

	$db = new Bio::DB::GenBank;
    } elsif( $remote =~ /embl/i ) {
	$db = new Bio::DB::EMBL;
    } else { 
	die("remote must be either 'NCBI' or 'EMBL'");
    }
    # would need to add code to set proxy info for those who need it
}

if(! $blastfile || ! -r $blastfile ) {
    die("Must specify a valid blastfile");
}

my $parser = new Bio::SearchIO(-format => $format,
			       -file => $blastfile);

my %tissues_seen = ();
my ($result,$hit,$hsp);
while( my $result = $parser->next_result )  {
  HIT: while( my $hit = $result->next_hit ) {
      if( defined $pvalue ) {
	  while( my $hsp = $hit->next_hsp ) {
	      if( $hsp->evalue > $pvalue ) {
		  print "skipping ", $hit->name, " because of low evalue \n";
		  # skip this Subject if it contains a pvalue of > $pvalue
		  next HIT;
	      }
	  }
      }
      my  ($id) = split(/\s+/, $hit->name);
      # get the last value
      my @ids = split(/\|/, $id);
      $id = pop @ids;
      my ($tissuetype) = get_tissue($id);
      if( defined $tissuetype ) {
	  push @{$tissues_seen{$tissuetype}}, $hit->name;
      } else { 
	  print STDERR "could not find tissue for $id\n" if( $VERBOSE);
      }
  }
  print "tissues seen for: ", $result->query_name, "\n";

  foreach my $tissue ( sort keys %tissues_seen ) {
      print "* $tissue\n-----------\n\t", 
      join("\n\t",@{$tissues_seen{$tissue}}), "\n\n";
  }
}

# cleanup -- avoid segfault here
$db = undef;

# subroutines

sub get_tissue {
    my ($id) = @_;
    my $tissue;
    if( $tissue = $accessions{$id} ) {
	return $tissue;
    }

    my $seq = $db->get_Seq_by_acc($id);
    return  unless(  $seq );

    foreach my $feature ( $seq->all_SeqFeatures ) {
	if( $feature->primary_tag eq 'source' ) {
	    foreach my $tag ( sort { $b cmp $a }
			      $feature->all_tags ) {
		if( $tag =~ /tissue/i  || 
		    ( ! $tissue && 
		      $tag =~ /clone_lib/i ) ){
		    ($tissue) = $feature->each_tag_value($tag);
		    $accessions{$seq->display_id} = $tissue;
		    return $tissue;
		}
	    }
	}
    }	    
    return;
}