File: maf_covered_regions.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (68 lines) | stat: -rwxr-xr-x 1,808 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/python3

"""
Read a maf file and print the regions covered to a set of bed files (one for
each sequence source referenced in the maf). Only blocks with a positive
percent identity are written out.

TODO: Can this be generalized to be made more useful?

usage: %prog bed_outfile_prefix < maf
"""

import sys

import bx.align.maf


def block_pid(comp1, comp2):
    match = 0
    total = 0
    t1 = comp1.text.lower()
    t2 = comp2.text.lower()
    for i in range(0, len(t1)):
        a, b = t1[i], t2[i]
        if a == "-" or b == "-":
            continue
        elif a == b:
            match += 1
        total += 1
    if total == 0:
        return None
    return match / total


def main():
    out_prefix = sys.argv[1]
    print(out_prefix)
    out_files = {}
    for block in bx.align.maf.Reader(sys.stdin):
        ref_comp = block.components[0]
        ref_chrom = ref_comp.src.split(".")[1]
        for comp in block.components[1:]:
            comp_species, comp_chrom = comp.src.split(".")[:2]
            if comp_species not in out_files:
                f = open(f"{out_prefix}{comp_species}.bed", "w")
                out_files[comp_species] = f
            pid = block_pid(ref_comp, comp)
            if pid:
                out_files[comp_species].write(
                    "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n"
                    % (
                        ref_chrom,
                        ref_comp.forward_strand_start,
                        ref_comp.forward_strand_end,
                        comp_chrom,
                        comp.start,
                        comp.end,
                        comp.strand,
                        pid,
                    )
                )

    for f in out_files.values():
        f.close()


if __name__ == "__main__":
    main()