1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
#!/usr/bin/python3
"""
For every column that occurs in a multiple alignment print the column
and the number of times it occurs (one column/count per line, tab
separated), sorted by count descending.
This version allows special handling of the 'wildcard' symbol in alignments.
Note: all blocks must have exactly the same number of species.
usage: %prog [options] < maf > column_counts
-w, --wildcard: include wildcards
-m, --maxwildcards=N: only allow N missing species
"""
import sys
import bx.align.maf
from bx.cookbook import (
cross_lists,
doc_optparse,
)
counts = {}
nspecies = None
for block in bx.align.maf.Reader(sys.stdin):
# Ensure all blocks have the same number of rows
if nspecies:
assert len(block.components) == nspecies
else:
nspecies = len(block.components)
# Increment count for each column
for col in zip(*[iter(comp.text.upper()) for comp in block.components]):
col = "".join(col)
try:
counts[col] += 1
except Exception:
counts[col] = 1
options, args = doc_optparse.parse(__doc__)
wildcard = False
if options.wildcard:
wildcard = True
max_wildcard = nspecies - 1
if options.maxwildcards:
wildcard = True
max_wildcard = int(options.maxwildcards)
nucs = "ACGT-"
if wildcard:
nucs += "*"
for col in cross_lists(*([nucs] * nspecies)):
col = "".join(col)
if wildcard and col.count("*") > max_wildcard:
continue
if col.count("-") == nspecies:
continue
print(col, counts.get(col, 0))
|