#!/usr/bin/env python

"""
Read a maf file and print the regions covered to a set of bed files (one for
each sequence source referenced in the maf). Only blocks with a positive
percent identity are written out.

TODO: Can this be generalized to be made more useful?

usage: %prog bed_outfile_prefix < maf
"""
from __future__ import division, print_function

import sys

import bx.align.maf


def block_pid(comp1, comp2):
    match = 0
    total = 0
    t1 = comp1.text.lower()
    t2 = comp2.text.lower()
    for i in range(0, len(t1)):
        a, b = t1[i], t2[i]
        if a == '-' or b == '-':
            continue
        elif a == b:
            match += 1
        total += 1
    if total == 0:
        return None
    return (match / total)


def main():
    out_prefix = sys.argv[1]
    print(out_prefix)
    out_files = dict()
    for block in bx.align.maf.Reader(sys.stdin):
        ref_comp = block.components[0]
        ref_chrom = ref_comp.src.split('.')[1]
        for comp in block.components[1:]:
            comp_species, comp_chrom = comp.src.split('.')[:2]
            if comp_species not in out_files:
                f = open("%s%s.bed" % (out_prefix, comp_species), "w")
                out_files[comp_species] = f
            pid = block_pid(ref_comp, comp)
            if pid:
                out_files[comp_species].write(
                    "%s\t%d\t%d\t%s:%d-%d,%s\t%f\n" %
                    (ref_chrom, ref_comp.forward_strand_start, ref_comp.forward_strand_end,
                     comp_chrom, comp.start, comp.end, comp.strand, pid))

    for f in out_files.values():
        f.close()


if __name__ == "__main__":
    main()
