File: quality.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (134 lines) | stat: -rw-r--r-- 5,648 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Support for masking out sites in alignments based on sequence quality. Both
simple masking of regions below some threshold and masking using the
neighborhood quality standard (NQS) are supported. Uses sequence quality
values stored in a `bx.binned_array.FileBinnedArray`.
"""

from bx.align.sitemask import Masker
from bx.binned_array import FileBinnedArray


# This class implements simple rules for masking quality, if base <
# minqual, mask
class Simple(Masker):
    # keys should be:
    # qualspecies: dictionary of species as key, lengths
    #              dict by chromosome or chromosome list as value
    # qualfiles: prefix for quality file for each species in qualspecies
    # mask: mask character (default is '?')
    # minqual: minimum quality
    # cache: optional, but sets the number of megabytes allowed in cache per quality masked species
    def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
        if not qualfiles:
            raise Exception("No quality files.")
        if not qualspecies:
            raise Exception("No species dictionary.")
        if not minqual:
            raise Exception("No minimum quality specified.")
        self.mask = "?"
        self.minqual = minqual
        self.mask = mask
        self.total = 0
        self.masked = 0

        self.qualfiles = qualfiles
        self.qualspecies = qualspecies
        self.cache = cache * 2  # typical bin size is 512K
        # load quality files into FileBinnedArray
        self.qualities = {}
        for species, qualfile in self.qualfiles.items():
            specdict = {}
            for chrom in self.qualspecies[species]:
                specdict[chrom] = FileBinnedArray(
                    open(qualfile + "." + chrom + ".bqv", "rb"), cache=self.cache / len(qualfiles)
                )
            self.qualities[species] = specdict

    def __call__(self, block):
        if not block:
            return
        for qualspec in self.qualities:
            comp = block.get_component_by_src_start(qualspec)
            if not comp:
                continue
            chrom = comp.src.split(".")[1]
            start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
            # get quality slice, for + strand
            qual = self.qualities[qualspec][chrom][start:end]
            x = 0
            while start + x < end:
                self.total += 1
                # got the column in the alignment for this particular base
                if qual[x] < self.minqual:
                    col = comp.coord_to_col(start + x)
                    self.masked += 1
                    for component in block.components:
                        if component.text[col] != "-":
                            component.text = (
                                component.text[0:col] + self.mask + component.text[col + 1 : len(component.text)]
                            )
                # iterate through quality
                x += 1
        return block


class NQS(Masker):
    # keys should be:
    # qualspecies: dictionary of species as key, lengths
    #              dict by chromosome or chromosome list as value
    # qualfiles: prefix for quality file for each species in qualspecies
    # mask: mask character (default is '?')
    # minqual: minimum quality
    # neighborqual: neighborhood minimum quality (bases within 5 bps are masked)
    # cache: optional, but sets the number of megabytes allowed in cache per quality masked species
    def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
        if not qualfiles:
            raise Exception("No quality files.")
        if not qualspecies:
            raise Exception("No species dictionary.")
        if not minqual:
            raise Exception("No minimum quality specified.")
        self.mask = "?"
        self.minqual = minqual
        self.mask = mask
        self.total = 0
        self.masked = 0

        self.qualfiles = qualfiles
        self.qualspecies = qualspecies
        self.cache = cache * 2  # typical bin size is 512K
        # load quality files into FileBinnedArray
        self.qualities = {}
        for species, qualfile in self.qualfiles.items():
            specdict = {}
            for chrom in self.qualspecies[species]:
                specdict[chrom] = FileBinnedArray(
                    open(qualfile + "." + chrom + ".bqv", "rb"), cache=self.cache / len(qualfiles)
                )
            self.qualities[species] = specdict

    def __call__(self, block):
        if not block:
            return
        for qualspec in self.qualities:
            comp = block.get_component_by_src_start(qualspec)
            chrom = comp.src.split(".")[1]
            start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
            # get quality slice, for + strand
            qual = self.qualities[qualspec][chrom][start:end]
            x = 0
            while start + x < end:
                self.total += 1
                # got the column in the alignment for this particular base
                if qual[x] < self.minqual:
                    col = comp.coord_to_col(start + x)
                    self.masked += 1
                    for component in block.components:
                        if component.text[col] != "-":
                            component.text = (
                                component.text[0:col] + self.mask + component.text[col + 1 : len(component.text)]
                            )
                # iterate through quality
                x += 1
        return block