1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149
|
#!/usr/bin/env python
import os
import sys
import argparse
from collections import Counter
from gemmi import cif
def to_formula(cnt):
'{C:12, O:8, N:1} -> "C12 N O8"'
return ' '.join(k + (str(v) if v != 1 else '')
for k, v in sorted(cnt.items()))
def formula_to_dict(formula):
'"O4 P -3" -> {O:4, P:1}'
fdict = {}
for elnum in formula.split():
na = sum(e.isalpha() for e in elnum)
if na == len(elnum):
fdict[elnum] = 1
elif na != 0:
fdict[elnum[:na]] = int(elnum[na:])
return fdict
def get_monomer_cifs(mon_path):
'yield all files mon_path/?/*.cif in alphabetic order'
for root, dirs, files in os.walk(mon_path):
dirs.sort()
for name in sorted(files):
# in $CLIBD_MON, files with '_' in the name are not monomers
if name.endswith('.cif') and '_' not in name:
yield os.path.join(root, name)
elif name.endswith('.cif.gz'):
yield os.path.join(root, name)
def check_formulas(ccd):
'''\
Print the chemical components in CCD (components.cif) that have
formula not consistent with the list of atoms.
'''
for b in ccd:
atoms = Counter(a.upper()
for a in b.find_values('_chem_comp_atom.type_symbol'))
formula = cif.as_string(b.find_value('_chem_comp.formula')).upper()
fdict = formula_to_dict(formula)
if fdict != atoms:
print('[%s]' % b.name, formula, '<>', to_formula(atoms))
def compare_monlib_with_ccd(mon_path, ccd, verbose=False):
'compare monomers from monomer library and CCD that have the same names'
PRINT_MISSING_ENTRIES = False
cnt = 0
for path in get_monomer_cifs(mon_path):
mon = cif.read(path)
for mb in mon:
if mb.name in ('', 'comp_list'):
continue
assert mb.name.startswith('comp_')
name = mb.name[5:]
cb = ccd.find_block(name)
if cb:
if verbose:
print('Comparing', cb.name)
compare_chem_comp(mb, cb)
cnt += 1
elif PRINT_MISSING_ENTRIES:
print('Not in CCD:', name)
print('Compared', cnt, 'monomers.')
def get_heavy_atom_names(block):
cca = block.find('_chem_comp_atom.', ['atom_id', 'type_symbol'])
d = {a.str(0).upper(): a.str(1).upper() for a in cca if a[1] != 'H'}
assert len(d) == sum(a[1] != 'H' for a in cca), (d, cca)
return d
def bond_info(id1, id2, order, aromatic):
if id1 > id2:
id1, id2 = id2, id1
order = order[:4].lower()
assert order in ['sing', 'doub', 'trip', 'arom', 'delo', '1.5'], order
aromatic = aromatic.upper()
assert aromatic in ('Y', 'N'), aromatic
if order in ('sing', 'doub') and aromatic == 'Y':
order = 'arom'
return ((id1, id2), (order, aromatic))
def bond_dict(block, ccb_names, atom_names):
table = block.find('_chem_comp_bond.', ccb_names)
return dict(bond_info(r.str(0), r.str(1), r.str(2), r.str(3))
for r in table
if r.str(0) in atom_names and r.str(1) in atom_names)
def compare_chem_comp(mb, cb):
mon_names = get_heavy_atom_names(mb)
ccd_names = get_heavy_atom_names(cb)
# compare atom names (and counts)
if mon_names != ccd_names:
print('atom names differ in', cb.name)
mon_atoms = Counter(mon_names.values())
ccd_atoms = Counter(ccd_names.values())
if mon_atoms != ccd_atoms:
# can be relaxed by: and mon_atoms + Counter('O') != ccd_atoms
print(cb.name, to_formula(mon_atoms), '<>', to_formula(ccd_atoms))
# compare bonds
mbonds = bond_dict(mb, ['atom_id_1', 'atom_id_2', 'type', 'aromatic'],
mon_names)
cbonds = bond_dict(cb, ['atom_id_1', 'atom_id_2', 'value_order',
'pdbx_aromatic_flag'],
ccd_names)
if mbonds != cbonds:
print('Bonds differ in', cb.name)
if set(mbonds) != set(cbonds):
print(' extra bonds in mon.lib.:',
' '.join('%s-%s' % p for p in set(mbonds) - set(cbonds)))
print(' extra bonds in CCD:',
' '.join('%s-%s' % p for p in set(cbonds) - set(mbonds)))
for apair in sorted(set(mbonds) & set(cbonds)):
if mbonds[apair] != cbonds[apair]:
print(' %s-%s: %s/%s <> CCD %s/%s' %
(apair + mbonds[apair] + cbonds[apair]))
def main():
parser = argparse.ArgumentParser()
parser.add_argument('ccd_path', metavar='/path/to/components.cif[.gz]')
parser.add_argument('-m', metavar='DIR',
help='monomer library path (default: $CLIBD_MON)')
parser.add_argument('-f', action='store_true', help='check CCD formulas')
parser.add_argument('-v', action='store_true', help='verbose')
args = parser.parse_args()
mon_path = args.m or os.getenv('CLIBD_MON')
if not mon_path and not args.f:
sys.exit('Unknown monomer library path: use -m or set $CLIBD_MON.')
ccd = cif.read(args.ccd_path)
if args.f:
check_formulas(ccd)
if mon_path:
compare_monlib_with_ccd(mon_path, ccd, verbose=args.v)
if __name__ == '__main__':
main()
|