1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
|
# File download/unzip written 2012 by Lenna X. Peterson (arklenna@gmail.com)
# Dictionary extraction written 2011 by Hongbo Zhu
#
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Download PDB Chemical Component Dictionary and generate dict.
Download and parse PDB Chemical Component Dictionary,
then write out dict for to_one_letter_code.
"""
from __future__ import print_function
import gzip
import inspect
import os
import warnings
from Bio._py3k import urlopen
url = "ftp://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
# extract name of gzip file
gzname = os.path.basename(url)
# extract name of cif file (split by sep, remove last, rejoin)
cifname = os.extsep.join(gzname.split(os.extsep)[:-1])
url_handle = urlopen(url)
with open(gzname, 'wb') as gzh:
print("Downloading file... (approx. 29 MB)")
while True:
data = url_handle.read(1024)
if len(data) == 0:
break
gzh.write(data)
# size as of 13 April 2012
if os.path.getsize(gzname) < 29944258:
warnings.warn("ERROR: Downloaded file is too small",
RuntimeWarning)
fh = gzip.open(gzname, 'rb')
# write extracted file to disk (not necessary)
# with open(cifname, 'wb') as cifh:
# print("Extracting file...")
# cifh.write(fh.read())
# The following code written by Hongbo Zhu
# generate three_to_one_dict
# two records in PDB Chemical Component Dictionary are parsed to
# generate the dictionary:
# _chem_comp.one_letter_code
# _chem_comp.three_letter_code
three_to_one_buf = [] # all three-letter codes
three_to_one_buf_noq = [] # only those with non-'?' one-letter codes
current_line = 'to_one_letter_code = {'
current_line_noq = 'to_one_letter_code = {'
found_one = False # found one-letter code
found_three = False # found three-letter code
counter = 0
counter_noq = 0
line = fh.readline()
while line:
if line.startswith('_chem_comp.one_letter_code'):
one = line.strip().split()[-1]
found_one = True
if line.startswith('_chem_comp.three_letter_code'):
three = '%-3s' % (line.strip().split()[-1],) # make it three-letter
found_three = True
if found_one and found_three:
if counter % 5 == 0:
three_to_one_buf.append('%s\n' % (current_line,))
current_line = ' '
current_line = '%s\'%s\':\'%s\',' % (current_line, three, one)
counter += 1
if one != '?':
if counter_noq % 5 == 0:
three_to_one_buf_noq.append('%s\n' % (current_line_noq,))
current_line_noq = ' '
current_line_noq = '%s\'%s\':\'%s\',' % (current_line_noq, three, one)
counter_noq += 1
found_one = False
found_three = False
line = fh.readline()
if len(current_line) < 5:
three_to_one_buf[-1] = three_to_one_buf[:-1] # remove the last comma
three_to_one_buf.append('}')
else:
three_to_one_buf.append('%s }' % (current_line[:-1]))
if len(current_line_noq) < 5:
three_to_one_buf_noq[-1] = three_to_one_buf_noq[:-1]
three_to_one_buf_noq.append('}')
else:
three_to_one_buf_noq.append('%s }' % (current_line_noq[:-1]))
# Find path of current script
_scriptPath = os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0])
# Path to SCOP module
_rafPath = os.path.normpath(os.path.join(_scriptPath, "..", "..", "Bio", "SCOP"))
_threeAllPath = os.path.join(_rafPath, 'three_to_one_all.py')
_threePath = os.path.join(_rafPath, 'three_to_one_dict.py')
# with open(_threeAllPath, 'w') as fh:
# fh.writelines(three_to_one_buf)
with open(_threePath, 'w') as fh:
fh.writelines(three_to_one_buf_noq)
|