File: get_protein_sql.py

package info (click to toggle)
fasta3 36.3.8i.14-Nov-2020-3
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 7,016 kB
  • sloc: ansic: 77,269; perl: 10,677; python: 2,461; sh: 428; csh: 86; sql: 55; makefile: 40
file content (65 lines) | stat: -rwxr-xr-x 1,722 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python3

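## get_protein_sql.py --
## get protein sequences from a local UniProt or NCBI/RefSeq MySQL database using their accessions
##
## usage: get_protein_sql.py acc[:start-stop] [acc[:start-stop] ...]
## each sequence is written to stdout in FASTA format
##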

## modified to work with mysql.connector 7-Nov-2022

import sys
import re
import textwrap
import mysql.connector

## One connection per database: seqdb_demox is queried for RefSeq-style
## accessions, uniprot for everything else.
db_r = mysql.connector.connect(db='seqdb_demox', host='wrpxdb.bioch.virginia.edu', user='web_user', passwd='fasta_www')
db_u = mysql.connector.connect(db='uniprot', host='wrpxdb.bioch.virginia.edu', user='web_user', passwd='fasta_www')

## dictionary=True lets the code below read columns by name (row['seq'], ...)
cur1_r = db_r.cursor(dictionary=True, buffered=True)
cur1_u = db_u.cursor(dictionary=True, buffered=True)

## The accession is passed to execute() as a bound parameter, so the
## placeholder is a bare %s: the driver does the quoting/escaping.  Do NOT
## fill these templates with the Python % operator.
sql_get_uniprot='select db, acc, id, descr, seq from annot2 join protein using(acc) where acc=%s'
sql_get_refseq ='select db, acc, "" as id, descr, seq from seqdb_demox.annot join seqdb_demox.protein using(prot_id) where acc=%s'

try:
  ## Each command-line argument is "acc" or "acc:start-stop" (1-based,
  ## inclusive residue range); acc may carry a "sp|", "tr|", "iso|" or "ref|"
  ## database prefix.
  for arg in sys.argv[1:]:

    ## partition() leaves sub_range empty when there is no ':', so a range
    ## given for one accession never carries over to the next argument
    acc, _, sub_range = arg.partition(':')

    ## "db|acc|id" style identifier: keep only the accession field
    if (re.match(r'^(sp|tr|iso|ref)\|',acc)):
      acc=acc.split('|')[1]

    ## accessions shaped like NP_123456 / XP_123456 go to the RefSeq tables
    if (re.match(r'[A-Z]P_\d+',acc)):
      sql_get_prot=sql_get_refseq
      cur1 = cur1_r
    else:
      sql_get_prot=sql_get_uniprot
      cur1 = cur1_u

    ## bound parameter rather than string formatting: acc comes straight
    ## from the command line
    cur1.execute(sql_get_prot, (acc,))

    row = cur1.fetchone()

    ## a missing accession aborts the whole run with exit status 1
    if (not row):
      sys.stderr.write("*** %s *** not found\n"%(acc))
      sys.exit(1)

    header = ">%s|%s"%(row['db'],row['acc'])
    if (row['id']):
      header += "|%s"%(row['id'])

    header += " "+row['descr']

    ## start == 0 means "no sub-range": the full sequence is written
    start, stop = 0, None
    if (sub_range):
      start, stop = sub_range.split('-')
      start, stop = int(start), int(stop)

    if (start > 0):
      seq = row['seq'][start-1:stop]
      ## annotate the header instead of printing a second header line
      ## NOTE(review): the "@C%d" text and the start+1 value are kept exactly
      ## as originally written -- confirm against the FASTA @C offset convention
      header += " @C%d" %(start+1)
    else:
      seq = row['seq']

    ## exactly one header line per sequence, then the residues wrapped at
    ## textwrap's default width
    print(header)
    print('\n'.join(textwrap.wrap(seq)))

finally:
  ## release cursors and connections on every exit path (including sys.exit)
  for handle in (cur1_r, cur1_u, db_r, db_u):
    handle.close()