File: maf_word_frequency.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (44 lines) | stat: -rwxr-xr-x 965 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python3

"""
Read a MAF and print counts and frequencies of all n-mers
(words composed of n consecutive alignment columns)

TODO: reconcile this and maf_mapping_word_frequency.py

usage: %prog n < maf_file
"""

import string
import sys

from bx.align import maf


def __main__():
    """
    Count the n-column words found in a MAF read from standard input.

    The word length is taken from ``sys.argv[1]``. For every alignment
    block, a window of that many columns is slid across the block; the
    upper-cased slice of each component's text inside the window is joined
    (space-separated, one piece per component) into a single word, and the
    occurrences of each word are tallied.

    One line per distinct word is printed to standard output as
    ``count<TAB>frequency<TAB>word``, where frequency is the count divided
    by the total number of windows seen. Lines are ordered by descending
    count, ties broken by descending word.

    Raises:
        IndexError: if no word length was given on the command line.
        ValueError: if the word length is not an integer.
    """
    motif_len = int(sys.argv[1])

    big_map = {}
    total = 0

    maf_reader = maf.Reader(sys.stdin)

    for m in maf_reader:
        texts = [c.text.upper() for c in m.components]
        # "+ 1" so the window that ends on the block's final column is
        # included; without it the last full-length word of every block
        # was skipped. Blocks shorter than motif_len still yield nothing.
        for i in range(m.text_size - motif_len + 1):
            # string.join() no longer exists in Python 3. Its default
            # separator was a single space, so " ".join keeps the word
            # format the Python 2 version of this script produced.
            motif = " ".join(text[i : i + motif_len] for text in texts)
            big_map[motif] = big_map.get(motif, 0) + 1
            total += 1

    # (count, motif) pairs, most frequent first. The pairs are unique because
    # motifs are dict keys, so reverse=True matches sort-then-reverse.
    items = sorted(zip(big_map.values(), big_map.keys()), reverse=True)

    # total is only zero when items is empty, so the division is safe.
    for count, motif in items:
        print("%d\t%0.10f\t%s" % (count, count / total, motif))


# Run the word counter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    __main__()