1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
# File download/unzip written 2012 by Lenna X. Peterson (arklenna@gmail.com)
# Dictionary extraction written 2011 by Hongbo Zhu
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Download PDB Chemical Component Dictionary and generate dict.
Download and parse PDB Chemical Component Dictionary,
then write out dict for to_one_letter_code.
"""
import gzip
import inspect
import os
import warnings
from urllib.request import urlopen
# Location of the gzip-compressed PDB Chemical Component Dictionary.
url = "ftp://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
# extract name of gzip file
gzname = os.path.basename(url)
# extract name of cif file (split by sep, remove last, rejoin)
cifname = os.extsep.join(gzname.split(os.extsep)[:-1])

# Stream the archive to disk. The context managers close both the network
# handle and the local file, even if the transfer fails part-way through.
with urlopen(url) as url_handle, open(gzname, "wb") as gzh:
    print("Downloading file... (approx. 29 MB)")
    # iter() with a sentinel stops at the first empty read (end of stream).
    for data in iter(lambda: url_handle.read(64 * 1024), b""):
        gzh.write(data)

# size as of 13 April 2012
if os.path.getsize(gzname) < 29944258:
    warnings.warn("ERROR: Downloaded file is too small", RuntimeWarning)

# Open in text mode: the parsing code below compares every line against str
# prefixes, which raises TypeError on the bytes produced by "rb" mode.
# Undecodable bytes are replaced rather than aborting the run, since only
# the two _chem_comp code tags are ever inspected.
fh = gzip.open(gzname, "rt", encoding="utf-8", errors="replace")
# write extracted file to disk (not necessary)
# with open(cifname, 'wb') as cifh:
# print("Extracting file...")
# cifh.write(fh.read())
# The following code written by Hongbo Zhu
# generate three_to_one_dict
# two records in PDB Chemical Component Dictionary are parsed to
# generate the dictionary:
# _chem_comp.one_letter_code
# _chem_comp.three_letter_code
def _read_code_pairs(handle):
    """Collect (three_letter, one_letter) code pairs from a CIF line stream.

    ``handle`` is any iterable of lines, str or bytes. A pair is emitted as
    soon as both ``_chem_comp.one_letter_code`` and
    ``_chem_comp.three_letter_code`` have been seen. The three-letter code
    is left-justified and padded to a width of three characters.
    """
    pairs = []
    one = three = None
    for line in handle:
        if isinstance(line, bytes):
            # Handle opened in binary mode: decode so the str prefixes match.
            line = line.decode("utf-8", errors="replace")
        if line.startswith("data_"):
            # A new data block begins; drop any half-read pair so that a
            # component missing one tag cannot be paired with its neighbour.
            one = three = None
        elif line.startswith("_chem_comp.one_letter_code"):
            one = line.split()[-1]
        elif line.startswith("_chem_comp.three_letter_code"):
            three = "%-3s" % (line.split()[-1],)  # make it three-letter
        if one is not None and three is not None:
            pairs.append((three, one))
            one = three = None
    return pairs


def _format_dict_source(pairs, per_line=5):
    """Render pairs as the source lines of a ``to_one_letter_code`` dict.

    Returns a list of strings suitable for ``writelines``: the opening line,
    then rows of ``per_line`` entries each, with the closing brace on the
    last row. An empty ``pairs`` yields a valid empty dict literal.
    """
    lines = ["to_one_letter_code = {\n"]
    if not pairs:
        lines.append("}")
        return lines
    # repr() quotes and escapes each value, so the generated file stays
    # valid Python even if a code contains a quote or a backslash.
    entries = [f"{three!r}:{one!r}," for three, one in pairs]
    rows = [
        "    " + "".join(entries[start:start + per_line])
        for start in range(0, len(entries), per_line)
    ]
    lines.extend(f"{row}\n" for row in rows[:-1])
    # Drop the trailing comma of the final entry before closing the dict.
    lines.append("%s }" % (rows[-1][:-1]))
    return lines


try:
    _code_pairs = _read_code_pairs(fh)
finally:
    # The dictionary has been fully read (or reading failed); release it.
    fh.close()

# all three-letter codes
three_to_one_buf = _format_dict_source(_code_pairs)
# only those with non-'?' one-letter codes
three_to_one_buf_noq = _format_dict_source(
    [pair for pair in _code_pairs if pair[1] != "?"]
)
# Directory that contains this script, as an absolute path.
_scriptPath = os.path.abspath(
    os.path.dirname(inspect.getfile(inspect.currentframe()))
)
# The SCOP module directory sits two levels up, under Bio/SCOP.
_rafPath = os.path.normpath(
    os.path.join(_scriptPath, os.pardir, os.pardir, "Bio", "SCOP")
)
# Output file names inside the SCOP module directory.
_threeAllPath = os.path.join(_rafPath, "three_to_one_all.py")
_threePath = os.path.join(_rafPath, "three_to_one_dict.py")

# Only the dictionary restricted to known one-letter codes is written out;
# the complete buffer (three_to_one_buf -> _threeAllPath) is not written.
with open(_threePath, "w") as fh:
    fh.write("".join(three_to_one_buf_noq))
|