File: matthews.py (from the gemmi 0.5.7 source package)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Usage:
#  ./matthews.py $PDB_DIR/structures/divided/mmCIF | tee data.tsv
#  ./matthews.py plot data.tsv
# Plotting uses numpy, statsmodels, pandas, matplotlib, seaborn.
# There is also a "check" command that was used to estimate data quality.

from __future__ import print_function
import datetime
import sys
import os
import csv
import argparse
from gemmi import cif, expand_if_pdb_code


# the same function as in examples/weight.py
def get_file_paths_from_args():
    """\
    Process arguments as filenames or directories with .cif(.gz) files,
    and yield the file paths.
    Normally we first test our scripts on a few files:
      ./myscript 1mru.cif another.cif
    and then do pdb-wide analysis:
      ./myscript $PDB_DIR/structures/divided/mmCIF
    If $PDB_DIR is set, you can check the specified PDB entries:
      ./myscript 1ABC 2def
    If you have a list of PDB codes to analyze (one code per line, the code
    must be the first word, but may be followed by others), do:
      ./myscript --only=my-list.txt $PDB_DIR/structures/divided/mmCIF
    """
    parser = argparse.ArgumentParser(usage='%(prog)s [options] path [...]')
    parser.add_argument('path', nargs='+', help=argparse.SUPPRESS)
    parser.add_argument('--only', metavar='LIST',
                        help='Use only files that match names in this file')
    args = parser.parse_args()
    only = None
    if args.only:
        with open(args.only) as list_file:
            only = set(line.split()[0].lower() for line in list_file
                       if line.strip())
    for arg in args.path:
        if os.path.isdir(arg):
            for root, dirs, files in os.walk(arg):
                dirs.sort()
                for name in sorted(files):
                    for ext in ['.cif', '.cif.gz']:
                        if name.endswith(ext):
                            if not only or name[:-len(ext)].lower() in only:
                                yield os.path.join(root, name)
                                break
        else:
            yield expand_if_pdb_code(arg)
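
# A minimal sketch of using the generator above on its own (hypothetical;
# in this script gather_data() is the real consumer):
#
#     for path in get_file_paths_from_args():
#         print(path)  # e.g. .../divided/mmCIF/mr/1mru.cif.gz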


def parse_date(date_str):
    year, month, day = date_str.split('-')
    return datetime.date(int(year), int(month), int(day))
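# e.g. parse_date('2014-07-31') -> datetime.date(2014, 7, 31)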


def gather_data():
    "read mmCIF files and write down a few numbers (one file -> one line)"
    writer = csv.writer(sys.stdout, dialect='excel-tab')
    writer.writerow(['code', 'na_chains', 'vs', 'vm', 'd_min', 'date', 'group'])
    for path in get_file_paths_from_args():
        block = cif.read(path).sole_block()
        code = cif.as_string(block.find_value('_entry.id'))
        # count nucleic-acid polymer entities ('nucleotide' in the type);
        # find_values() yields the type strings themselves
        na = sum('nucleotide' in t
                 for t in block.find_values('_entity_poly.type'))
        vs = block.find_value('_exptl_crystal.density_percent_sol')
        vm = block.find_value('_exptl_crystal.density_Matthews')
        d_min = block.find_value('_refine.ls_d_res_high')
        dep_date_tag = '_pdbx_database_status.recvd_initial_deposition_date'
        dep_date = parse_date(block.find_values(dep_date_tag).str(0))
        group = block.find_value('_pdbx_deposit_group.group_id')
        writer.writerow([code, na, vs, vm, d_min, dep_date, group])
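# Example of the emitted TSV (the data row is illustrative, not a real entry):
#   code  na_chains  vs     vm    d_min  date        group
#   1ABC  0          46.1   2.30  1.80   2015-06-01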


def plot(our_csv):
    from numpy import array
    import matplotlib.pyplot as plt
    import seaborn as sns
    # Note: this targets the older seaborn API (~0.8); later releases
    # changed the JointGrid and kdeplot arguments used below.
    x, y = [], []
    with open(our_csv) as csvfile:
        for row in csv.DictReader(csvfile, dialect='excel-tab'):
            try:
                vs = float(row['vs'])
                vm = float(row['vm'])
                d_min = float(row['d_min'])
            except ValueError:
                continue
            # skip entries with inconsistent Vs and Vm
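            # (Vs and Vm are tied by Vs = 100 * (1 - 1.23 / Vm): Vm is the
            # Matthews coefficient in A^3/Da, and 1.23 ~= 0.74 cm^3/g (protein
            # partial specific volume) * 1.66, the cm^3/g -> A^3/Da conversion.)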
            if vm == 0 or abs(vs - 100 * (1 - 1.23 / vm)) > 1.0:
                continue
            if row['date'] >= '2015-01-01':  # and not row['group']:
                x.append(d_min)
                y.append(vs)
    print('Plotting kernel density estimation from', len(x), 'points.')
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.5})
    g = sns.JointGrid(array(x), array(y), space=0, xlim=(0, 4.5), ylim=(20, 90))
    # calculation time is proportional to gridsize^2; gridsize=100 is default
    g = g.plot_joint(sns.kdeplot, n_levels=30, bw=(0.05, 0.3), gridsize=300,
                     shade=True, shade_lowest=False, cmap="gnuplot2_r")
    plt.sca(g.ax_marg_x)
    sns.kdeplot(g.x, shade=True, bw=0.03, gridsize=10000, vertical=False)
    plt.sca(g.ax_marg_y)
    sns.kdeplot(g.y, shade=True, bw=0.2, gridsize=5000, vertical=True)
    g.ax_joint.set(xlabel=u'$d_{min}$ [Å]', ylabel='solvent volume [%]')
    # (ab)use annotate() to place a plain text label on the joint plot
    #g.annotate(lambda a, b: 0, template="1970s - 2014")
    g.annotate(lambda a, b: 0, template="2015 - 4/2017")
    plt.show()


# This function compares our data with the CSV file available from B. Rupp's
# website. It also does some other checks on both our and RB's CSV.
# Spoilers:
# The reso column in pdb_02_06_2013_pro_sorted_flagged_highest_cs.csv
# differs from _refine.ls_d_res_high in about 50 cases.
# RB's data has entries with a negative solvent volume.
# PDB data has about 5000 entries with inconsistent Vm and Vs.
def check_with_rupps_data(our_csv, rupps_csv):
    with open(rupps_csv) as csvfile:
        rb_data = {row['code']: row for row in csv.DictReader(csvfile)}
    # check for suspiciously low Vs
    for r in rb_data.values():
        if float(r['vs']) < 5:
            print('Suspicious RB data: %s has Vs: %s' % (r['code'], r['vs']))

    with open(our_csv) as csvfile:
        for row in csv.DictReader(csvfile, dialect='excel-tab'):
            rb = rb_data.get(row['code'])
            try:
                vs = float(row['vs'])
                vm = float(row['vm'])
                if vm == 0:
                    print('Bad PDB data: Vm=0 in', row['code'])
                vs_calc = 100 * (1 - 1.23 / vm)
                if abs(vs - vs_calc) > 1.0:
                    print('Inconsistent PDB Vs/Vm: %s %.1f (%s->)%.1f Δ=%+.1f'
                          % (row['code'], vs, row['vm'], vs_calc, vs_calc-vs),
                          end='')
                    print('  (RB: %s->%s)' % (rb['vm'], rb['vs']) if rb else '')
            except (ValueError, ZeroDivisionError):
                pass
            if rb:
                try:  # d_min can be absent in pdb data (NMR entries)
                    d_min = float(row['d_min'])
                except ValueError:
                    continue
                diff = d_min - float(rb['reso'])
                if abs(diff) > 0.01:
                    print('PDB and RB differ: ', row['code'], row['d_min'],
                          rb['reso'], '%+.2f' % diff)


if len(sys.argv) > 1 and sys.argv[1] == 'check':
    check_with_rupps_data(our_csv=sys.argv[2], rupps_csv=sys.argv[3])
elif len(sys.argv) > 1 and sys.argv[1] == 'plot':
    plot(sys.argv[2])
else:
    gather_data()