File: aafreq.py

package info (click to toggle)

gemmi 0.6.5%2Bds-3

links: PTS, VCS
area: main
in suites: trixie
size: 5,836 kB
sloc: cpp: 54,719; python: 4,743; ansic: 3,972; sh: 384; makefile: 73; f90: 42; javascript: 12

file content (26 lines) | stat: -rwxr-xr-x 1,175 bytes

parent folder | download | duplicates (2)

#!/usr/bin/env python
# Check amino-acid frequency in the PDB database (or its subset)
# by reading meta-data from mmCIF files.

import sys
from collections import Counter
from gemmi import cif, CifWalk

# Residue counts accumulated over all files from all command-line arguments.
totals = Counter()
for arg in sys.argv[1:]:
    for path in CifWalk(arg, try_pdbid='M'):
        # read file (uncompressing on the fly) and get the only block
        block = cif.read(path).sole_block()
        # find table with the sequence
        seq = block.find('_entity_poly_seq.', ['entity_id', 'mon_id'])
        # convert table with chain types (protein/DNA/RNA) to dict
        entity_types = dict(block.find('_entity_poly.', ['entity_id', 'type']))
        # and count these monomers that correspond to a protein chain
        aa_counter = Counter(row.str(1) for row in seq
                             if 'polypeptide' in entity_types[row.str(0)])
        totals += aa_counter
        # print residue counts for each file
        print(block.name, *(f'{m}:{c}' for (m, c) in aa_counter.most_common()))

# finally, print the total counts as percentages
grand_total = sum(totals.values())
if grand_total == 0:
    # No arguments were given, or none of the files had polypeptide residues.
    # Report that instead of failing with ZeroDivisionError below.
    sys.exit('aafreq: no polypeptide residues found; nothing to report')
# factor that converts an absolute count into a percentage of all residues
scale = 100.0 / grand_total
print('TOTAL',
      *(f'{m}:{c*scale:.2f}%' for (m, c) in totals.most_common(20)))