File: weight.py

package info (click to toggle)
gemmi 0.6.5%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 5,836 kB
  • sloc: cpp: 54,719; python: 4,743; ansic: 3,972; sh: 384; makefile: 73; f90: 42; javascript: 12
file content (121 lines) | stat: -rwxr-xr-x 4,952 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# This script reads mmCIF file(s) and compares weights:
# * _entity.formula_weight with _entity_poly_seq and _chem_comp.formula_weight,
# * _chem_comp.formula_weight with _chem_comp.formula and atomic weights.

import os
import argparse
import collections
import itertools
from gemmi import cif, Element, expand_if_pdb_code


H2O_MASS = Element('O').weight + 2 * Element('H').weight
# but why PO2 and not PO3Hx?
PO2_MASS = Element('P').weight + 2 * Element('O').weight


# this function is also used in examples/monomers.py
def formula_to_dict(formula):
    '"O4 P -3" -> {O:4, P:1}'
    fdict = {}
    for elnum in formula.split():
        na = sum(e.isalpha() for e in elnum)
        if na == len(elnum):
            fdict[elnum] = 1
        elif na != 0:
            fdict[elnum[:na]] = int(elnum[na:])
    return fdict

def check_chem_comp_formula_weight(block):
    for cc in block.find('_chem_comp.', ['id', 'formula', 'formula_weight']):
        if cc.str(0) in ('UNX', 'UNL'):  # unknown residue or ligand
            continue
        fdict = formula_to_dict(cc.str(1).title())
        calc_weight = sum(n * Element(e).weight for (e, n) in fdict.items())
        diff = calc_weight - cif.as_number(cc[2])
        if not abs(diff) < 0.1:  # also true if diff is NaN
            print('%s %s  %-16s % 9.3f - %9s = %+.3f' %
                  (block.name, cc[0], cc.str(1), calc_weight, cc[2], diff))


def check_entity_formula_weight(block):
    # read chem_comp weights from the file, e.g. THR: 119.120
    cc_weights = {cc.str(0): cif.as_number(cc[1]) for cc in
                  block.find('_chem_comp.', ['id', 'formula_weight'])}

    # read polymer types from _entity_poly.type
    poly_types = {ep.str(0): ep.str(1) for ep in
                  block.find('_entity_poly.', ['entity_id', 'type'])}

    # read and sort sequences from _entity_poly_seq
    entity_seq = collections.defaultdict(list)
    for m in block.find('_entity_poly_seq.', ['entity_id', 'num', 'mon_id']):
        entity_seq[m.str(0)].append((cif.as_int(m[1]), cif.as_string(m[2])))
    for seq in entity_seq.values():
        seq.sort(key=lambda p: p[0])

    # read _entity.formula_weight values
    f_weights = block.find('_entity.', ['id', 'formula_weight'])
    entity_weights = {ent.str(0): cif.as_number(ent[1]) for ent in f_weights}

    # calculate weight from sequences and compare with _entity.formula_weight
    for ent, seq in entity_seq.items():
        # in case of microheterogeneity take the first conformer
        main_seq = [next(g) for _, g in itertools.groupby(seq, lambda p: p[0])]
        weight = sum(cc_weights[mon_id] for (num, mon_id) in main_seq)
        weight -= (len(main_seq) - 1) * H2O_MASS
        if 'ribonucleotide' in poly_types[ent]:  # DNA or RNA
            weight -= PO2_MASS
        diff = weight - entity_weights[ent]
        if abs(diff) > max(0.1, 3e-5 * weight):
            print('%4s entity_id: %2s %10.2f - %10.2f = %+8.3f' %
                  (block.name, ent, weight, entity_weights[ent], diff))

# the same function as in examples/matthews.py
def get_file_paths_from_args():
    """\
    Process arguments as filenames or directories with .cif(.gz) files,
    and yield the file paths.
    Normally we first test our scripts on a few files:
      ./myscript 1mru.cif another.cif
    and then do pdb-wide analysis:
      ./myscript $PDB_DIR/structures/divided/mmCIF
    If $PDB_DIR is set you can check the specifies PDB entries:
      ./myscript 1ABC 2def
    If you have a list of PDB codes to analyze (one code per line, the code
    must be the first word, but may be followed by others), do:
      ./myscript --only=my-list.txt $PDB_DIR/structures/divided/mmCIF
    """
    parser = argparse.ArgumentParser(usage='%(prog)s [options] path [...]')
    parser.add_argument('path', nargs='+', help=argparse.SUPPRESS)
    parser.add_argument('--only', metavar='LIST',
                        help='Use only files that match names in this file')
    args = parser.parse_args()
    only = None
    if args.only:
        with open(args.only) as list_file:
            only = {line.split()[0].lower() for line in list_file
                    if line.strip()}
    for arg in args.path:
        if os.path.isdir(arg):
            for root, dirs, files in os.walk(arg):
                dirs.sort()
                for name in sorted(files):
                    for ext in ['.cif', '.cif.gz']:
                        if name.endswith(ext):
                            if not only or name[:-len(ext)].lower() in only:
                                yield os.path.join(root, name)
                                break
        else:
            yield expand_if_pdb_code(arg)


def main():
    for path in get_file_paths_from_args():
        block = cif.read(path).sole_block()
        check_chem_comp_formula_weight(block)
        check_entity_formula_weight(block)

if __name__ == '__main__':
    main()