File: qv_to_bqv.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (70 lines) | stat: -rw-r--r-- 2,092 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/python3

"""
Convert a qual (qv) file to several BinnedArray files for fast seek.
This script takes approximately 4 seconds per 1 million base pairs.

The input format is fasta style quality -- fasta headers followed by
whitespace separated integers.

usage: %prog qual_file output_file
"""

import fileinput
import sys

from bx.binned_array import BinnedArrayWriter


def main():
    args = sys.argv[1:]
    try:
        qual_file = args[0]
        output_file = args[1]
    except IndexError:
        print("usage: qual_file output_file")
        sys.exit()

    qual = fileinput.FileInput(qual_file)
    outfile = None
    outbin = None
    base_count = 0
    mega_count = 0
    region = ""

    for line in qual:
        line = line.rstrip("\r\n")
        if line.startswith(">"):
            # close old
            if outbin and outfile:
                print("\nFinished region " + region + " at " + str(base_count) + " base pairs.")
                outbin.finish()
                outfile.close()
            # start new file
            region = line.lstrip(">")
            outfname = output_file + "." + region + ".bqv"
            print("Writing region " + region + " to file " + outfname)
            outfile = open(outfname, "wb")
            outbin = BinnedArrayWriter(outfile, typecode="b", default=0)
            base_count = 0
            mega_count = 0
        else:
            if outfile and outbin:
                nums = line.split()
                for val in nums:
                    outval = int(val)
                    assert outval <= 255 and outval >= 0
                    outbin.write(outval)
                    base_count += 1
                if (mega_count * 1000000) <= base_count:
                    sys.stdout.write(str(mega_count) + " ")
                    sys.stdout.flush()
                    mega_count = base_count // 1000000 + 1
    if outbin and outfile:
        print("\nFinished region " + region + " at " + str(base_count) + " base pairs.")
        outbin.finish()
        outfile.close()


if __name__ == "__main__":
    main()