File: monomers.py

package info (click to toggle)
gemmi 0.6.5%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 5,836 kB
  • sloc: cpp: 54,719; python: 4,743; ansic: 3,972; sh: 384; makefile: 73; f90: 42; javascript: 12
file content (149 lines) | stat: -rwxr-xr-x 5,359 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python
import os
import sys
import argparse
from collections import Counter
from gemmi import cif


def to_formula(cnt):
    '{C:12, O:8, N:1} -> "C12 N O8"'
    return ' '.join(k + (str(v) if v != 1 else '')
                    for k, v in sorted(cnt.items()))

def formula_to_dict(formula):
    '"O4 P -3" -> {O:4, P:1}'
    fdict = {}
    for elnum in formula.split():
        na = sum(e.isalpha() for e in elnum)
        if na == len(elnum):
            fdict[elnum] = 1
        elif na != 0:
            fdict[elnum[:na]] = int(elnum[na:])
    return fdict

def get_monomer_cifs(mon_path):
    'yield all files mon_path/?/*.cif in alphabetic order'
    for root, dirs, files in os.walk(mon_path):
        dirs.sort()
        for name in sorted(files):
            # in $CLIBD_MON, files with '_' in the name are not monomers
            if name.endswith('.cif') and '_' not in name:
                yield os.path.join(root, name)
            elif name.endswith('.cif.gz'):
                yield os.path.join(root, name)

def check_formulas(ccd):
    '''\
    Print the chemical components in CCD (components.cif) that have
    formula not consistent with the list of atoms.
    '''
    for b in ccd:
        atoms = Counter(a.upper()
                        for a in b.find_values('_chem_comp_atom.type_symbol'))
        formula = cif.as_string(b.find_value('_chem_comp.formula')).upper()
        fdict = formula_to_dict(formula)
        if fdict != atoms:
            print('[%s]' % b.name, formula, '<>', to_formula(atoms))


def compare_monlib_with_ccd(mon_path, ccd, verbose=False):
    'compare monomers from monomer library and CCD that have the same names'
    PRINT_MISSING_ENTRIES = False
    cnt = 0
    for path in get_monomer_cifs(mon_path):
        mon = cif.read(path)
        for mb in mon:
            if mb.name in ('', 'comp_list'):
                continue
            assert mb.name.startswith('comp_')
            name = mb.name[5:]
            cb = ccd.find_block(name)
            if cb:
                if verbose:
                    print('Comparing', cb.name)
                compare_chem_comp(mb, cb)
                cnt += 1
            elif PRINT_MISSING_ENTRIES:
                print('Not in CCD:', name)
    print('Compared', cnt, 'monomers.')


def get_heavy_atom_names(block):
    cca = block.find('_chem_comp_atom.', ['atom_id', 'type_symbol'])
    d = {a.str(0).upper(): a.str(1).upper() for a in cca if a[1] != 'H'}
    assert len(d) == sum(a[1] != 'H' for a in cca), (d, cca)
    return d


def bond_info(id1, id2, order, aromatic):
    if id1 > id2:
        id1, id2 = id2, id1
    order = order[:4].lower()
    assert order in ['sing', 'doub', 'trip', 'arom', 'delo', '1.5'], order
    aromatic = aromatic.upper()
    assert aromatic in ('Y', 'N'), aromatic
    if order in ('sing', 'doub') and aromatic == 'Y':
        order = 'arom'
    return ((id1, id2), (order, aromatic))


def bond_dict(block, ccb_names, atom_names):
    table = block.find('_chem_comp_bond.', ccb_names)
    return dict(bond_info(r.str(0), r.str(1), r.str(2), r.str(3))
                for r in table
                if r.str(0) in atom_names and r.str(1) in atom_names)


def compare_chem_comp(mb, cb):
    mon_names = get_heavy_atom_names(mb)
    ccd_names = get_heavy_atom_names(cb)

    # compare atom names (and counts)
    if mon_names != ccd_names:
        print('atom names differ in', cb.name)
        mon_atoms = Counter(mon_names.values())
        ccd_atoms = Counter(ccd_names.values())
        if mon_atoms != ccd_atoms:
            # can be relaxed by: and mon_atoms + Counter('O') != ccd_atoms
            print(cb.name, to_formula(mon_atoms), '<>', to_formula(ccd_atoms))

    # compare bonds
    mbonds = bond_dict(mb, ['atom_id_1', 'atom_id_2', 'type', 'aromatic'],
                       mon_names)
    cbonds = bond_dict(cb, ['atom_id_1', 'atom_id_2', 'value_order',
                            'pdbx_aromatic_flag'],
                       ccd_names)
    if mbonds != cbonds:
        print('Bonds differ in', cb.name)
        if set(mbonds) != set(cbonds):
            print('  extra bonds in mon.lib.:',
                  ' '.join('%s-%s' % p for p in set(mbonds) - set(cbonds)))
            print('  extra bonds in CCD:',
                  ' '.join('%s-%s' % p for p in set(cbonds) - set(mbonds)))
        for apair in sorted(set(mbonds) & set(cbonds)):
            if mbonds[apair] != cbonds[apair]:
                print('  %s-%s: %s/%s <> CCD %s/%s' %
                      (apair + mbonds[apair] + cbonds[apair]))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('ccd_path', metavar='/path/to/components.cif[.gz]')
    parser.add_argument('-m', metavar='DIR',
                        help='monomer library path (default: $CLIBD_MON)')
    parser.add_argument('-f', action='store_true', help='check CCD formulas')
    parser.add_argument('-v', action='store_true', help='verbose')
    args = parser.parse_args()
    mon_path = args.m or os.getenv('CLIBD_MON')
    if not mon_path and not args.f:
        sys.exit('Unknown monomer library path: use -m or set $CLIBD_MON.')
    ccd = cif.read(args.ccd_path)
    if args.f:
        check_formulas(ccd)
    if mon_path:
        compare_monlib_with_ccd(mon_path, ccd, verbose=args.v)


if __name__ == '__main__':
    main()