File: matthews.py (from the gemmi 0.5.7 source package)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Usage:
#  ./matthews.py $PDB_DIR/structures/divided/mmCIF | tee data.tsv
#  ./matthews.py plot data.tsv
# Plotting uses numpy, statsmodels, pandas, matplotlib, seaborn.
# There is also a "check" command that was used to estimate data quality.

from __future__ import print_function
import datetime
import sys
import os
import csv
import argparse
from gemmi import cif, expand_if_pdb_code


# the same function as in examples/weight.py
def get_file_paths_from_args():
    """\
    Process arguments as filenames or directories with .cif(.gz) files,
    and yield the file paths.
    Normally we first test our scripts on a few files:
      ./myscript 1mru.cif another.cif
    and then do pdb-wide analysis:
      ./myscript $PDB_DIR/structures/divided/mmCIF
    If $PDB_DIR is set, you can check the specified PDB entries:
      ./myscript 1ABC 2def
    If you have a list of PDB codes to analyze (one code per line, the code
    must be the first word, but may be followed by others), do:
      ./myscript --only=my-list.txt $PDB_DIR/structures/divided/mmCIF
    """
    parser = argparse.ArgumentParser(usage='%(prog)s [options] path [...]')
    parser.add_argument('path', nargs='+', help=argparse.SUPPRESS)
    parser.add_argument('--only', metavar='LIST',
                        help='Use only files that match names in this file')
    args = parser.parse_args()
    only = None
    if args.only:
        with open(args.only) as list_file:
            only = set(line.split()[0].lower() for line in list_file
                       if line.strip())
    for arg in args.path:
        if os.path.isdir(arg):
            for root, dirs, files in os.walk(arg):
                dirs.sort()
                for name in sorted(files):
                    for ext in ['.cif', '.cif.gz']:
                        if name.endswith(ext):
                            if not only or name[:-len(ext)].lower() in only:
                                yield os.path.join(root, name)
                                break
        else:
            yield expand_if_pdb_code(arg)
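
# A minimal sketch of using the generator above on its own (hypothetical;
# in this script gather_data() is the real consumer):
#
#     for path in get_file_paths_from_args():
#         print(path)  # e.g. .../divided/mmCIF/mr/1mru.cif.gz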


def parse_date(date_str):
    year, month, day = date_str.split('-')
    return datetime.date(int(year), int(month), int(day))
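# e.g. parse_date('2014-07-31') -> datetime.date(2014, 7, 31)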


def gather_data():
    "read mmCIF files and write down a few numbers (one file -> one line)"
    writer = csv.writer(sys.stdout, dialect='excel-tab')
    writer.writerow(['code', 'na_chains', 'vs', 'vm', 'd_min', 'date', 'group'])
    for path in get_file_paths_from_args():
        block = cif.read(path).sole_block()
        code = cif.as_string(block.find_value('_entry.id'))
        # count nucleic-acid polymer entities ('nucleotide' in the type);
        # find_values() yields the type strings themselves
        na = sum('nucleotide' in t
                 for t in block.find_values('_entity_poly.type'))
        vs = block.find_value('_exptl_crystal.density_percent_sol')
        vm = block.find_value('_exptl_crystal.density_Matthews')
        d_min = block.find_value('_refine.ls_d_res_high')
        dep_date_tag = '_pdbx_database_status.recvd_initial_deposition_date'
        dep_date = parse_date(block.find_values(dep_date_tag).str(0))
        group = block.find_value('_pdbx_deposit_group.group_id')
        writer.writerow([code, na, vs, vm, d_min, dep_date, group])
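# Example of the emitted TSV (the data row is illustrative, not a real entry):
#   code  na_chains  vs     vm    d_min  date        group
#   1ABC  0          46.1   2.30  1.80   2015-06-01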


def plot(our_csv):
    from numpy import array
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Note: this targets the older seaborn API (~0.8); later releases
    # changed the JointGrid and kdeplot arguments used below.
    x, y = [], []
    with open(our_csv) as csvfile:
        for row in csv.DictReader(csvfile, dialect='excel-tab'):
            try:
                vs = float(row['vs'])
                vm = float(row['vm'])
                d_min = float(row['d_min'])
            except ValueError:
                continue
            # skip entries with inconsistent Vs and Vm
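            # (Vs and Vm are tied by Vs = 100 * (1 - 1.23 / Vm): Vm is the
            # Matthews coefficient in A^3/Da, and 1.23 ~= 0.74 cm^3/g (protein
            # partial specific volume) * 1.66, the cm^3/g -> A^3/Da conversion.)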
            if vm == 0 or abs(vs - 100 * (1 - 1.23 / vm)) > 1.0:
                continue
            if row['date'] >= '2015-01-01':  # and not row['group']:
                x.append(d_min)
                y.append(vs)
    print('Plotting kernel density estimation from', len(x), 'points.')
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.5})
    g = sns.JointGrid(array(x), array(y), space=0, xlim=(0, 4.5), ylim=(20, 90))
    # calculation time is proportional to gridsize^2; gridsize=100 is default
    g = g.plot_joint(sns.kdeplot, n_levels=30, bw=(0.05, 0.3), gridsize=300,
                     shade=True, shade_lowest=False, cmap="gnuplot2_r")
    plt.sca(g.ax_marg_x)
    sns.kdeplot(g.x, shade=True, bw=0.03, gridsize=10000, vertical=False)
    plt.sca(g.ax_marg_y)
    sns.kdeplot(g.y, shade=True, bw=0.2, gridsize=5000, vertical=True)
    g.ax_joint.set(xlabel=u'$d_{min}$ [Å]', ylabel='solvent volume [%]')
    # (ab)use annotate() to place a plain text label on the joint plot
    #g.annotate(lambda a, b: 0, template="1970s - 2014")
    g.annotate(lambda a, b: 0, template="2015 - 4/2017")
    plt.show()


# This function compares our data with the CSV file available from B. Rupp's
# website. It also does some other checks on both our and RB's CSV.
# Spoilers:
# The reso column in pdb_02_06_2013_pro_sorted_flagged_highest_cs.csv
# differs from _refine.ls_d_res_high in about 50 cases.
# RB's data has entries with a negative solvent volume.
# PDB data has about 5000 entries with inconsistent Vm and Vs.
def check_with_rupps_data(our_csv, rupps_csv):
    with open(rupps_csv) as csvfile:
        rb_data = {row['code']: row for row in csv.DictReader(csvfile)}
    # check for suspiciously low Vs
    for r in rb_data.values():
        if float(r['vs']) < 5:
            print('Suspicious RB data: %s has Vs: %s' % (r['code'], r['vs']))

    with open(our_csv) as csvfile:
        for row in csv.DictReader(csvfile, dialect='excel-tab'):
            rb = rb_data.get(row['code'])
            try:
                vs = float(row['vs'])
                vm = float(row['vm'])
                if vm == 0:
                    print('Bad PDB data: Vm=0 in', row['code'])
                vs_calc = 100 * (1 - 1.23 / vm)
                if abs(vs - vs_calc) > 1.0:
                    print('Inconsistent PDB Vs/Vm: %s %.1f (%s->)%.1f Δ=%+.1f'
                          % (row['code'], vs, row['vm'], vs_calc, vs_calc-vs),
                          end='')
                    print('  (RB: %s->%s)' % (rb['vm'], rb['vs']) if rb else '')
            except (ValueError, ZeroDivisionError):
                pass
            if rb:
                try:  # d_min can be absent in pdb data (NMR entries)
                    d_min = float(row['d_min'])
                except ValueError:
                    continue
                diff = d_min - float(rb['reso'])
                if abs(diff) > 0.01:
                    print('PDB and RB differ: ', row['code'], row['d_min'],
                          rb['reso'], '%+.2f' % diff)


if len(sys.argv) > 1 and sys.argv[1] == 'check':
    check_with_rupps_data(our_csv=sys.argv[2], rupps_csv=sys.argv[3])
elif len(sys.argv) > 1 and sys.argv[1] == 'plot':
    plot(sys.argv[2])
else:
    gather_data()