1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
|
#!/usr/bin/python3
"""
Convert a qual (qv) file to several BinnedArray files for fast seek.
This script takes approximately 4 seconds per 1 million base pairs.
The input format is fasta style quality -- fasta headers followed by
whitespace separated integers.
usage: %prog qual_file output_file
"""
import fileinput
import sys
from bx.binned_array import BinnedArrayWriter
def main():
args = sys.argv[1:]
try:
qual_file = args[0]
output_file = args[1]
except IndexError:
print("usage: qual_file output_file")
sys.exit()
qual = fileinput.FileInput(qual_file)
outfile = None
outbin = None
base_count = 0
mega_count = 0
region = ""
for line in qual:
line = line.rstrip("\r\n")
if line.startswith(">"):
# close old
if outbin and outfile:
print("\nFinished region " + region + " at " + str(base_count) + " base pairs.")
outbin.finish()
outfile.close()
# start new file
region = line.lstrip(">")
outfname = output_file + "." + region + ".bqv"
print("Writing region " + region + " to file " + outfname)
outfile = open(outfname, "wb")
outbin = BinnedArrayWriter(outfile, typecode="b", default=0)
base_count = 0
mega_count = 0
else:
if outfile and outbin:
nums = line.split()
for val in nums:
outval = int(val)
assert outval <= 255 and outval >= 0
outbin.write(outval)
base_count += 1
if (mega_count * 1000000) <= base_count:
sys.stdout.write(str(mega_count) + " ")
sys.stdout.flush()
mega_count = base_count // 1000000 + 1
if outbin and outfile:
print("\nFinished region " + region + " at " + str(base_count) + " base pairs.")
outbin.finish()
outfile.close()
if __name__ == "__main__":
main()
|