File: mask_quality.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (98 lines) | stat: -rw-r--r-- 2,930 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/python3

"""
Masks an AXT or MAF file based on quality (from a binned_array) and
outputs AXT or MAF.

Binned array form of quality scores can be generated with `qv_to_bqv.py`.

usage: %prog input output
    -i, --input=N: Format of input (axt or maf)
    -o, --output=N: Format of output (axt or maf)
    -m, --mask=N: Character to use as mask character
    -q, --quality=N: Min quality allowed
    -t, --type=N: base_pair or nqs
    -l, --list=N: colon separated list of species,len_file[,qualityfile].
"""

import fileinput
import sys

import bx.align.axt
import bx.align.maf
import bx.binned_array
from bx.align.sitemask.quality import Simple
from bx.cookbook import doc_optparse


def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int(options.quality)
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except Exception:
        doc_optparse.exception()

    outstream = open(outputfile, "w")
    instream = open(inputfile)

    qualfiles = {}

    # read lens
    specieslist = speciesAndLens.split(":")
    species_to_lengths = {}

    for entry in specieslist:
        fields = entry.split(",")
        lenstream = fileinput.FileInput(fields[1])
        lendict = {}
        for line in lenstream:
            region = line.split()
            lendict[region[0]] = int(region[1])
        species_to_lengths[fields[0]] = lendict
        if len(fields) >= 3:
            qualfiles[fields[0]] = fields[2]

    specieslist = [a.split(":")[0] for a in specieslist]

    # open quality binned_arrays
    reader = None
    writer = None

    if inputformat == "axt":
        # load axt
        if len(specieslist) != 2:
            print("AXT is pairwise only.")
            sys.exit()
        reader = bx.align.axt.Reader(
            instream, species1=specieslist[0], species2=specieslist[1], species_to_lengths=species_to_lengths
        )
    elif outputformat == "maf":
        # load maf
        reader = bx.align.maf.Reader(instream, species_to_lengths=species_to_lengths)

    if outputformat == "axt":
        # setup axt
        if len(specieslist) != 2:
            print("AXT is pairwise only.")
            sys.exit()
        writer = bx.align.axt.Writer(outstream, attributes=reader.attributes)
    elif outputformat == "maf":
        # setup maf
        writer = bx.align.maf.Writer(outstream, attributes=reader.attributes)

    qualfilter = Simple(mask=mask, qualspecies=species_to_lengths, qualfiles=qualfiles, minqual=minqual, cache=50)

    qualfilter.run(reader, writer.write)

    print("For " + str(qualfilter.total) + " base pairs, " + str(qualfilter.masked) + " base pairs were masked.")
    print(str(float(qualfilter.masked) / float(qualfilter.total) * 100) + "%")


if __name__ == "__main__":
    main()