1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
|
#!/usr/bin/python3
"""
Read a MAF from stdin and break into several new mafs containing no more than
`chunk_size` columns. The new mafs will be written to `out_dir` along with a
file "intervals.txt" specifying the range covered by each new maf file. A
probability for writing each chunk can optionally be specified, resulting in
a random fraction of chunks from the input MAF being produced.
usage: %prog [options] chunk_size out_dir < maf
--prob: probability of writing versus skipping each chunk.
"""
import random
import sys
from optparse import OptionParser
import numpy as np
import bx.align.maf
INF = np.inf
def __main__():
parser = OptionParser("usage: %prog chunk_size out_dir")
parser.add_option("--prob", action="store", default=None, type="float", help="Probability of writing a given chunk")
(options, args) = parser.parse_args()
chunk_size = int(args[0])
out_dir = args[1]
prob = options.prob
maf_reader = bx.align.maf.Reader(sys.stdin, parse_e_rows=True)
maf_writer = None
count = 0
current_chunk = -1
chunk_min = INF
chunk_max = 0
write_current_chunk = True
interval_file = open(f"{out_dir}/intervals.txt", "w")
for m in maf_reader:
if not maf_writer or count + m.text_size > chunk_size:
current_chunk += 1
# Finish the last chunk
if maf_writer:
maf_writer.close()
interval_file.write(f"{chunk_min} {chunk_max}\n")
chunk_min = INF
chunk_max = 0
# Decide if the new chunk will be written
if prob:
write_current_chunk = bool(random.random() <= prob)
else:
write_current_chunk = True
if write_current_chunk:
maf_writer = bx.align.maf.Writer(open("%s/%09d.maf" % (out_dir, current_chunk), "w"))
else:
maf_writer = None
count = 0
if maf_writer:
maf_writer.write(m)
# count += m.text_size
count += m.components[0].size
chunk_min = min(chunk_min, m.components[0].start)
chunk_max = max(chunk_max, m.components[0].end)
if maf_writer:
maf_writer.close()
interval_file.write(f"{chunk_min} {chunk_max}\n")
interval_file.close()
if __name__ == "__main__":
__main__()
|