1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
|
#!/usr/bin/perl
#use strict;
use LWP::Simple;
use LWP::UserAgent;
use strict;
# This program is licensed to you under the Fred
# Hutchinos Cancer Research Center (FHCRC)
# NONCOMMERICAL LICENSE. A copy of the license may be found at
# http://blocks.fhcrc.org/sift/license.html and should be attached
# to this software
# retrieves sequence from NCBI database
my $gi_number = $ARGV[0];
my $outfile = $ARGV[1];
my $best_hits_or_all_hits_option = "";
if (@ARGV >= 3) {
$best_hits_or_all_hits_option = $ARGV[2];
}
my $URL = "https://www.ncbi.nlm.nih.gov/sutils/blink.cgi?pid=" . $gi_number;
#Dec 2016 - changed to 399 because original gi# will be added to 400
if ($best_hits_or_all_hits_option eq "BEST") {
# $URL .= "&cut=100&org=1";
$URL .= "&org=1&per_page=399";
##25/6/2013 HueyTyng modified - to recognise another option CG_ONLY
##CG_ONLY will take sequences from complete genomes db only + hide identical hits
} elsif ($best_hits_or_all_hits_option eq "CG_ONLY" ) {
$URL .= "&org=3&set=4&per_page=399";
} else {
$URL .= "&per_page=399";
}
## December 24,2009 edits to make timeout longer
my $browser = LWP::UserAgent->new();
$browser->timeout (60);
my $request = HTTP::Request->new (GET => $URL);
my $response = $browser->request ($request);
#my $NCBI_content = get ($URL);
#if (!defined ($NCBI_content) || $NCBI_content eq "") {
# print "ERROR! Unable to contact NCBI right now for $gi_number . Please try later.\n";
# exit (-1);
#}
if ($response->is_error()) {
print "ERROR! %s ERROR267\n", $response->status_line;
}
my $NCBI_content = $response->content();
if ($NCBI_content =~ /Could not retrieve/) {
print "ERROR! NCBI could not retrieve the required information. Please make sure your sequence is a PROTEIN sequence and then try <A HREF=\"http://sift-dna.org/www/SIFT_seq_submit2.html\">submitting the sequence</A> to SIFT.\n";
exit (-1);
}
if ($NCBI_content =~ /No hits found/ ) {
# ERROR
print "ERROR! No BLAST hits were found for gi:$gi_number by NCBI BLink . Please check that this sequence is a protein.<BR>You can also submit your protein sequence directly to SIFT and have SIFT choose the protein sequences <A HREF=\"http://sift-dna.org/www/SIFT_seq_submit2.html\">here.</A>\n";
exit (-1);
}
if ($NCBI_content =~ /ERROR/) {
print "ERROR trying to retrieve sequences from NCBI! Please follow this <A HREF=$URL>link</A HREF> to NCBI to see the error. \n";
exit (-1);
}
my @lines = split ("\n", $NCBI_content);
if (@lines == 0) {
print "ERROR! Unable to retrieve sequences from NCBI. Please try later.\n";
exit (-1);
}
my $lineindex = 0;
my %seen;
my $gi_hits = $gi_number . ",";
while ($lineindex < @lines ) {
my $loc = index ($lines[$lineindex], "name='gi");
# print "location is " . $loc;
while ($loc >= 0) {
my $gi = substr ($lines[$lineindex], $loc + 8);
$loc = index ($gi, "'");
$gi = substr ($gi, 0, $loc);
if (!exists ($seen{$gi})) {
$gi_hits .= $gi . "," ;
$seen{$gi} = 1;
}
$lines[$lineindex] = substr ($lines[$lineindex], $loc+1, 100000);
$loc = index ($lines[$lineindex], "name='gi");
}
$lineindex++;
}
$gi_hits =~ s/\,$//;
print "GI hits" . $gi_hits;
#old
#my $seq_URL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&email=sift\@fhcrc.org&id=$acc_hits&rettype=fasta&retmode=text";
#new
my $seq_URL = "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=protein&qty=1&c_start=1&list_uids=$gi_hits&uids=&dopt=fasta&dispmax=400&sendto=t&from=begin&to=end";
my $seq = get ($seq_URL);
# Pauline Sept 10,2010
# format seq because the NCBI name is gi|###|#### and psiblast will
# format it to gi|####
my @seq_lines = split (/\n/, $seq);
for (my $i= 0; $i < @seq_lines; $i++) {
if ($seq_lines[$i] =~ /^\>gi\|/) {
$_ = $seq_lines[$i];
/^\>gi\|([0-9]+)\|(.*)/;
$seq_lines[$i] = ">gi" . $1 . " " . $2;
#print "replacing with $seq_lines[$i]\n";
}
}
$seq = join ("\n", @seq_lines);
open (OUT, ">$outfile") || die "can't open $outfile";
print OUT $seq ;
close (OUT);
exit (0);
|