File: maf_region_coverage_by_src.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (67 lines) | stat: -rwxr-xr-x 1,833 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/python3

"""
Reads a list of intervals and a set of indexed mafs. For each interval print
the amount covered by each species other than the reference.

usage: %prog maf_files  [options] < interval_file
   -s, --src=s:      Use this src for all intervals
   -p, --prefix=p:   Prepend this to each src before lookup
"""

import sys
from collections import defaultdict

import bx.align.maf
from bx.cookbook import doc_optparse


def __main__():
    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        maf_files = args
        if options.prefix:
            prefix = options.prefix
        else:
            prefix = None
    except Exception:
        doc_optparse.exit()

    # Open indexed access to mafs
    indexes = [bx.align.maf.Indexed(maf_file, maf_file + ".index") for maf_file in maf_files]

    # Iterate over input ranges

    for line in sys.stdin:
        fields = line.split()
        src, start, end = fields[0], int(fields[1]), int(fields[2])
        if prefix:
            src = prefix + src

        total_length = end - start

        # Find overlap with reference component
        blocks = []
        for index in indexes:
            blocks += index.get(src, start, end)

        coverage = defaultdict(int)
        for block in blocks:
            overlap_start = max(start, block.components[0].start)
            overlap_end = min(end, block.components[0].end)
            length = overlap_end - overlap_start
            assert length > 0
            for c in block.components[1:]:
                species = c.src.split(".")[0]
                coverage[species] += length

        print(line, end=" ")
        for key, value in coverage.items():
            print("   ", key.ljust(10), "%0.2f" % (value / total_length))


if __name__ == "__main__":
    __main__()