1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
|
#!/usr/bin/python3
"""
Read a maf file and print the regions covered to a set of bed files (one for
each sequence source referenced in the maf). Only blocks with a positive
percent identity are written out.
TODO: Can this be generalized to be made more useful?
usage: %prog bed_outfile_prefix < maf
"""
import sys
import bx.align.maf
def block_pid(comp1, comp2):
match = 0
total = 0
t1 = comp1.text.lower()
t2 = comp2.text.lower()
for i in range(0, len(t1)):
a, b = t1[i], t2[i]
if a == "-" or b == "-":
continue
elif a == b:
match += 1
total += 1
if total == 0:
return None
return match / total
def main():
out_prefix = sys.argv[1]
print(out_prefix)
out_files = {}
for block in bx.align.maf.Reader(sys.stdin):
ref_comp = block.components[0]
ref_chrom = ref_comp.src.split(".")[1]
for comp in block.components[1:]:
comp_species, comp_chrom = comp.src.split(".")[:2]
if comp_species not in out_files:
f = open(f"{out_prefix}{comp_species}.bed", "w")
out_files[comp_species] = f
pid = block_pid(ref_comp, comp)
if pid:
out_files[comp_species].write(
"%s\t%d\t%d\t%s:%d-%d,%s\t%f\n"
% (
ref_chrom,
ref_comp.forward_strand_start,
ref_comp.forward_strand_end,
comp_chrom,
comp.start,
comp.end,
comp.strand,
pid,
)
)
for f in out_files.values():
f.close()
if __name__ == "__main__":
main()
|