File: get_protein.py

package info (click to toggle)
fasta3 36.3.8h.2020-02-11-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,048 kB
  • sloc: ansic: 56,138; perl: 10,192; python: 2,205; sh: 416; csh: 85; sql: 55; makefile: 38
file content (97 lines) | stat: -rwxr-xr-x 2,234 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/python3

## get_protein_www.py -- 
## get a protein sequence from the Uniprot or NCBI/Refseq web sites using the accession
##

import sys
import re
import textwrap
import time
from urllib.request import Request, urlopen
from urllib.error import URLError

ncbi_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" 
uniprot_url = "https://www.uniprot.org/uniprot/"

sub_range = ''
for acc in sys.argv[1:]:

  if (re.search(r':',acc)):
    (acc, sub_range) = acc.split(':')

  if (re.match(r'^(sp|tr|iso|ref)\|',acc)):
      acc=acc.split('|')[1]

  if (re.match(r'[A-Z]P_\d+',acc)):   # get refseq
    db_type="protein"

    seq_args = "db=%s&id=" % (db_type) + ",".join(sys.argv[1:])  + "&rettype=fasta"

    url_string = ncbi_url + seq_args

  else:				# get uniprot
    url_string = uniprot_url + acc + ".fasta"


  req = Request(url_string)

  try: 
    resp = urlopen(req)
  except URLError as e:
    seq_html = ''
    if hasattr(e, 'reason'):
      sys.stderr.write("Sequence [%s] not found: %s\n"%(acc,str(e.reason)))
    elif hasattr(e, 'code'):
      sys.stderr.write("Sequence [%s] not found: %s\n"%(acc,str(e.ecode)))
    else:
      sys.stderr.write("Sequence [%s] not found\n"%(acc))
    exit(0)

  else:
    seq_html=resp.read().decode('utf-8')
    time.sleep(0.3)

  header=''
  seq = ''
  for line in seq_html.split('\n'):
    if (line and line[0]=='>'):
      # print out old one if there
      if (header):
        if (sub_range):
          start, stop = sub_range.split('-')
          start, stop = int(start), int(stop)
          if (start > 0):
            start -= 1
          new_seq = seq[start:stop]
        else:
          start = 0
          new_seq = seq

        if (start > 0):
          print("%s @C%d" %(header, start+1))
        else:
          print(header)
        print('\n'.join(textwrap.wrap(new_seq)))

      header = line;
      seq = ''
    else:
      seq += line

start=0
if (sub_range):
  start, stop = sub_range.split('-')
  start, stop = int(start), int(stop)
  if (start > 0):
    start -= 1
    new_seq = seq[start:stop]
else:
  new_seq = seq

if (start > 0):
  print("%s @C:%d" %(header, start+1))
else:
  print(header)

print('\n'.join(textwrap.wrap(new_seq)))