1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
|
#!/usr/bin/python3
## get_protein_www.py --
## get a protein sequence from the Uniprot or NCBI/Refseq web sites using the accession
##
import sys
import re
import textwrap
import time
from urllib.request import Request, urlopen
from urllib.error import URLError
ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
uniprot_url = "https://www.uniprot.org/uniprot/"
sub_range = ''
for acc in sys.argv[1:]:
if (re.search(r':',acc)):
(acc, sub_range) = acc.split(':')
if (re.match(r'^(sp|tr|iso|ref)\|',acc)):
acc=acc.split('|')[1]
if (re.match(r'[A-Z]P_\d+',acc)): # get refseq
db_type="protein"
seq_args = "db=%s&id=" % (db_type) + ",".join(sys.argv[1:]) + "&rettype=fasta"
url_string = ncbi_url + seq_args
else: # get uniprot
url_string = uniprot_url + acc + ".fasta"
req = Request(url_string)
try:
resp = urlopen(req)
except URLError as e:
seq_html = ''
if hasattr(e, 'reason'):
sys.stderr.write("Sequence [%s] not found: %s\n"%(acc,str(e.reason)))
elif hasattr(e, 'code'):
sys.stderr.write("Sequence [%s] not found: %s\n"%(acc,str(e.ecode)))
else:
sys.stderr.write("Sequence [%s] not found\n"%(acc))
exit(0)
else:
seq_html=resp.read().decode('utf-8')
time.sleep(0.3)
header=''
seq = ''
for line in seq_html.split('\n'):
if (line and line[0]=='>'):
# print out old one if there
if (header):
if (sub_range):
start, stop = sub_range.split('-')
start, stop = int(start), int(stop)
if (start > 0):
start -= 1
new_seq = seq[start:stop]
else:
start = 0
new_seq = seq
if (start > 0):
print("%s @C%d" %(header, start+1))
else:
print(header)
print('\n'.join(textwrap.wrap(new_seq)))
header = line;
seq = ''
else:
seq += line
start=0
if (sub_range):
start, stop = sub_range.split('-')
start, stop = int(start), int(stop)
if (start > 0):
start -= 1
new_seq = seq[start:stop]
else:
new_seq = seq
if (start > 0):
print("%s @C:%d" %(header, start+1))
else:
print(header)
print('\n'.join(textwrap.wrap(new_seq)))
|