File: maf_div_sites.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (54 lines) | stat: -rwxr-xr-x 1,435 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python3
"""
Create a bed file listing all the divergent sites between two specific species
in a maf.

usage: %prog maf_file reference_species_name other_species_name
"""

import sys

import bx.align.maf
import bx.bitset


def main():
    bitsets = {}
    maf = sys.argv[1]
    reference_sp, other_sp = sys.argv[2], sys.argv[3]

    for block in bx.align.maf.Reader(open(maf)):
        ref = block.get_component_by_src_start(reference_sp)
        other = block.get_component_by_src_start(other_sp)

        if not ref or not other:
            continue
        ref_chrom = ref.src.split(".")[1]
        ref_start = ref.start
        chrom_size = ref.get_src_size()

        if ref_chrom not in bitsets:
            bitsets[ref_chrom] = bx.bitset.BinnedBitSet(chrom_size)

        pos = ref_start
        for i, j in zip(ref.text.upper(), other.text.upper()):
            if i != "-":
                if i != j:  # mismatch
                    if i != "N" and j != "N" and j != "-":
                        # set if all valid chars
                        bitsets[ref_chrom].set(pos)
                pos += 1

    # bits --> bed file
    for chrom in bitsets:
        bits = bitsets[chrom]
        end = 0
        while True:
            start = bits.next_set(end)
            if start == bits.size:
                break
            end = bits.next_clear(start)
            print("%s\t%d\t%d" % (chrom, start, end))


main()