1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
|
#!/usr/bin/env python3
import sys
if len(sys.argv) < 3:
print("usage: <bamtools_coverage_output {} fasta_index num_regions >regions.bed".format(sys.argv[0]))
print("Generates regions with even sequencing coverage, provided an input of coverage per-position as")
print("generated by bamtools coverage. In other words, generates regions such that the integral of")
print("coverage is approximately equal for each. These can be used when variant calling to reduce")
print("variance in job runtime.")
exit(1)
lengths = {}
fai = open(sys.argv[1])
for line in fai.readlines():
c, l = line.split("\t")[:2]
lengths[c] = int(l)
positions = []
total_coverage = 0
for line in sys.stdin:
chrom, pos, depth = line.strip().split("\t")
pos = int(pos)
depth = int(depth)
positions.append([chrom, pos, depth])
total_coverage += depth
bp_per_region = total_coverage / int(sys.argv[2])
lchrom = None
lpos = 0
bp_in_region = 0
for line in positions:
chrom, pos, depth = line #line.strip().split("\t")
if lchrom != chrom:
if lchrom:
print(lchrom+":"+str(lpos)+"-"+str(lengths[lchrom]))
lpos = 0
lchrom = chrom
bp_in_region = 0
else:
lchrom = chrom
bp_in_region += depth
if bp_in_region > bp_per_region:
print(chrom+":"+str(lpos)+"-"+str(pos)) #, pos - lpos
lpos = pos
bp_in_region = 0
print(lchrom+":"+str(lpos)+"-"+str(lengths[lchrom]))
|