File: generate_three_to_one_dict.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 46,860 kB
  • ctags: 13,237
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (124 lines) | stat: -rw-r--r-- 3,788 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# File download/unzip written 2012 by Lenna X. Peterson (arklenna@gmail.com)
# Dictionary extraction written 2011 by Hongbo Zhu
#
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Download PDB Chemical Component Dictionary and generate dict.

Download and parse PDB Chemical Component Dictionary,
then write out dict for to_one_letter_code.
"""

from __future__ import print_function

import gzip
import inspect
import os
import warnings

from Bio._py3k import urlopen

# Location of the gzip-compressed PDB Chemical Component Dictionary.
url = "ftp://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"

# Local name of the downloaded archive: last path component of the URL.
gzname = os.path.basename(url)
# Name of the uncompressed cif file: the archive name with its final
# extension removed (split by extsep, drop the last piece, rejoin).
cifname = os.extsep.join(gzname.split(os.extsep)[:-1])

url_handle = urlopen(url)
try:
    with open(gzname, 'wb') as gzh:
        print("Downloading file... (approx. 29 MB)")
        # Copy the remote file to disk chunk by chunk so the whole archive
        # is never held in memory; an empty read marks the end of the stream.
        while True:
            data = url_handle.read(1024)
            if not data:
                break
            gzh.write(data)
finally:
    # Release the connection even when the download or the write fails.
    url_handle.close()

# Reference size is that of the archive as of 13 April 2012; a download
# smaller than that is reported with a RuntimeWarning (the script goes on).
if 29944258 > os.path.getsize(gzname):
    warnings.warn("ERROR: Downloaded file is too small",
                  category=RuntimeWarning)

# Read handle on the downloaded archive, consumed by the parsing code below.
fh = gzip.open(gzname, mode='rb')

# write extracted file to disk (not necessary)
# with open(cifname, 'wb') as cifh:
#     print("Extracting file...")
#     cifh.write(fh.read())

# The following code written by Hongbo Zhu
# generate three_to_one_dict
# two records in PDB Chemical Component Dictionary are parsed to
# generate the dictionary:
# _chem_comp.one_letter_code
# _chem_comp.three_letter_code

# Finished output lines (each ends with a newline).
three_to_one_buf = []      # all three-letter codes
three_to_one_buf_noq = []  # only those with non-'?' one-letter codes

# Output line currently being filled for each buffer. It starts as the
# opening of the dict literal and is moved into the matching buffer each
# time five entries have been written to it.
current_line = 'to_one_letter_code = {'
current_line_noq = 'to_one_letter_code = {'

found_one = False    # found one-letter code
found_three = False  # found three-letter code

# Number of entries emitted so far into each dict.
counter = 0
counter_noq = 0


def _add_entry(buf, pending, count, three_code, one_code):
    """Append one 'XXX':'Y', entry and return the updated pending line.

    buf is the list of finished output lines, pending the line being
    filled and count the number of entries emitted so far.  Whenever
    count is a multiple of five the pending line is flushed to buf and
    a fresh, four-space indented line is started first.
    """
    if count % 5 == 0:
        buf.append('%s\n' % (pending,))
        pending = '    '
    return '%s\'%s\':\'%s\',' % (pending, three_code, one_code)


try:
    for line in fh:
        # A gzip handle opened in binary mode yields bytes on Python 3, and
        # bytes.startswith() rejects a str prefix, so decode first.  On
        # Python 2 the lines are already str and are used unchanged.
        # NOTE(review): assumes the dictionary file is ASCII - confirm.
        if not isinstance(line, str):
            line = line.decode('ascii', 'replace')

        # The value is taken to be the last whitespace-separated token.
        if line.startswith('_chem_comp.one_letter_code'):
            one = line.strip().split()[-1]
            found_one = True
        if line.startswith('_chem_comp.three_letter_code'):
            # left-justify / pad so the key is at least three characters
            three = '%-3s' % (line.strip().split()[-1],)
            found_three = True

        # Emit an entry only once both codes of a record have been seen.
        if found_one and found_three:
            current_line = _add_entry(three_to_one_buf, current_line,
                                      counter, three, one)
            counter += 1

            if one != '?':
                current_line_noq = _add_entry(three_to_one_buf_noq,
                                              current_line_noq,
                                              counter_noq, three, one)
                counter_noq += 1

            found_one = False
            found_three = False
finally:
    # The archive has been fully consumed; do not leak the gzip handle.
    fh.close()

def _finish_dict(buf, pending):
    """Close the dict literal whose finished lines are collected in buf.

    pending is the output line that was still being filled when parsing
    stopped.  The comma after the last entry is removed and the closing
    brace is appended to buf.
    """
    if len(pending) < 5:
        # Nothing but indentation is pending, so the last entry (and its
        # trailing comma) sits at the end of the last finished line.
        if buf:
            last = buf[-1].rstrip('\n')
            if last.endswith(','):
                last = last[:-1]  # remove the last comma
            buf[-1] = '%s\n' % (last,)
        buf.append('}')
    else:
        # Only drop a trailing comma: when no entry was found at all the
        # pending line is still the opening 'to_one_letter_code = {' and
        # its brace must be kept for the output to be valid Python.
        if pending.endswith(','):
            pending = pending[:-1]
        buf.append('%s }' % (pending,))


_finish_dict(three_to_one_buf, current_line)
_finish_dict(three_to_one_buf_noq, current_line_noq)

# Directory containing this script, as an absolute path
_scriptFile = inspect.getfile(inspect.currentframe())
_scriptPath = os.path.abspath(os.path.dirname(_scriptFile))
# Bio/SCOP directory, two levels above the script directory
_rafPath = os.path.normpath(
    os.path.join(_scriptPath, os.pardir, os.pardir, "Bio", "SCOP"))
# Output modules: the full dict (currently not written) and the dict
# restricted to codes that have a one-letter equivalent
_threeAllPath = os.path.join(_rafPath, 'three_to_one_all.py')
_threePath = os.path.join(_rafPath, 'three_to_one_dict.py')

# with open(_threeAllPath, 'w') as fh:
#     fh.writelines(three_to_one_buf)
with open(_threePath, 'w') as fh:
    fh.write(''.join(three_to_one_buf_noq))