#!/usr/bin/python3
## get_protein_www.py --
## get a protein sequence from the Uniprot or NCBI/Refseq web sites using the accession
##
## modified to work with urllib.request 7-Nov-2022
import sys
import re
import textwrap
import time
import urllib.request
import urllib.error
ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
uniprot_url = "https://rest.uniprot.org/uniprotkb/"


def parse_accession(arg):
    """Split a command-line argument into (accession, sub_range).

    Accepts 'ACC', 'ACC:start-stop', and db-prefixed forms such as
    'sp|P09488|GSTM1_HUMAN' (prefixes sp/tr/iso/ref are stripped).
    sub_range is '' when no ':start-stop' suffix is present.
    """
    # reset per accession -- previously a module-level sub_range leaked a
    # range given for one accession into every later plain accession
    sub_range = ''
    acc = arg
    if ':' in acc:
        (acc, sub_range) = acc.split(':', 1)
    if re.match(r'^(sp|tr|iso|ref)\|', acc):
        acc = acc.split('|')[1]
    return (acc, sub_range)


def build_url(acc):
    """Return the FASTA-download URL for an accession.

    RefSeq protein accessions ([A-Z]P_nnnnnn, e.g. NP_/XP_/YP_) go to the
    NCBI eutils efetch service; everything else goes to UniProt REST.
    """
    if re.match(r'[A-Z]P_\d+', acc):  # get refseq
        return ncbi_url + "db=protein&id=" + acc + "&rettype=fasta"
    # get uniprot; keep only the accession field of a '|'-separated id
    # (the original's len==1/else branches both reduced to fields[0])
    return uniprot_url + acc.split('|')[0] + ".fasta"


def fetch_fasta(url_string):
    """Fetch url_string and return the response body as text; '' on failure.

    HTTP errors report the server's error body to stderr; other URL errors
    (DNS failure, refused connection) report the exception itself --
    plain URLError has no .read(), so calling it there would crash.
    The 'with' block guarantees the connection is closed.
    """
    try:
        with urllib.request.urlopen(url_string) as req:
            return req.read().decode('utf-8')
    except urllib.error.HTTPError as e:  # must come first: HTTPError is a URLError
        sys.stderr.write(e.read().decode('utf-8') + '\n')
    except urllib.error.URLError as e:
        sys.stderr.write(str(e) + '\n')
    return ''


def print_sub_range(seq_html, sub_range):
    """Print the FASTA header plus the start-stop slice of the sequence.

    start/stop are 1-based inclusive residue coordinates; the header gets
    an '@C:start' annotation when the slice does not begin at residue 1.
    """
    (start, stop) = sub_range.split('-')
    (start, stop) = (int(start), int(stop))
    lines = seq_html.split('\n')
    header = lines[0]
    seq = ''.join(lines[1:])
    if start > 0:
        start -= 1  # convert 1-based coordinate to 0-based slice index
    new_seq = seq[start:stop]
    ## print the header
    if start > 0:
        print("%s @C:%d" % (header, start + 1))
    else:
        print(header)
    print('\n'.join(textwrap.wrap(new_seq)))


def main():
    """Fetch and print one FASTA entry per command-line accession."""
    for arg in sys.argv[1:]:
        (acc, sub_range) = parse_accession(arg)
        seq_html = fetch_fasta(build_url(acc))
        if not seq_html:
            continue  # error already reported on stderr
        time.sleep(0.3)  # be polite to the servers between requests
        if not sub_range:
            print(seq_html)
        else:
            print_sub_range(seq_html, sub_range)


if __name__ == "__main__":
    main()