1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
|
"""
Support for masking out sites in alignments based on sequence quality. Both
simple masking of regions below some threshold and masking using the
neighborhood quality standard (NQS) are supported. Uses sequence quality
values stored in a `bx.binned_array.FileBinnedArray`.
"""
from bx.align.sitemask import Masker
from bx.binned_array import FileBinnedArray
# This class implements simple rules for masking quality, if base <
# minqual, mask
class Simple(Masker):
# keys should be:
# qualspecies: dictionary of species as key, lengths
# dict by chromosome or chromosome list as value
# qualfiles: prefix for quality file for each species in qualspecies
# mask: mask character (default is '?')
# minqual: minimum quality
# cache: optional, but sets the number of megabytes allowed in cache per quality masked species
def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
if not qualfiles:
raise Exception("No quality files.")
if not qualspecies:
raise Exception("No species dictionary.")
if not minqual:
raise Exception("No minimum quality specified.")
self.mask = "?"
self.minqual = minqual
self.mask = mask
self.total = 0
self.masked = 0
self.qualfiles = qualfiles
self.qualspecies = qualspecies
self.cache = cache * 2 # typical bin size is 512K
# load quality files into FileBinnedArray
self.qualities = {}
for species, qualfile in self.qualfiles.items():
specdict = {}
for chrom in self.qualspecies[species]:
specdict[chrom] = FileBinnedArray(
open(qualfile + "." + chrom + ".bqv", "rb"), cache=self.cache / len(qualfiles)
)
self.qualities[species] = specdict
def __call__(self, block):
if not block:
return
for qualspec in self.qualities:
comp = block.get_component_by_src_start(qualspec)
if not comp:
continue
chrom = comp.src.split(".")[1]
start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
# get quality slice, for + strand
qual = self.qualities[qualspec][chrom][start:end]
x = 0
while start + x < end:
self.total += 1
# got the column in the alignment for this particular base
if qual[x] < self.minqual:
col = comp.coord_to_col(start + x)
self.masked += 1
for component in block.components:
if component.text[col] != "-":
component.text = (
component.text[0:col] + self.mask + component.text[col + 1 : len(component.text)]
)
# iterate through quality
x += 1
return block
class NQS(Masker):
# keys should be:
# qualspecies: dictionary of species as key, lengths
# dict by chromosome or chromosome list as value
# qualfiles: prefix for quality file for each species in qualspecies
# mask: mask character (default is '?')
# minqual: minimum quality
# neighborqual: neighborhood minimum quality (bases within 5 bps are masked)
# cache: optional, but sets the number of megabytes allowed in cache per quality masked species
def __init__(self, qualfiles=None, qualspecies=None, minqual=None, mask="?", cache=100):
if not qualfiles:
raise Exception("No quality files.")
if not qualspecies:
raise Exception("No species dictionary.")
if not minqual:
raise Exception("No minimum quality specified.")
self.mask = "?"
self.minqual = minqual
self.mask = mask
self.total = 0
self.masked = 0
self.qualfiles = qualfiles
self.qualspecies = qualspecies
self.cache = cache * 2 # typical bin size is 512K
# load quality files into FileBinnedArray
self.qualities = {}
for species, qualfile in self.qualfiles.items():
specdict = {}
for chrom in self.qualspecies[species]:
specdict[chrom] = FileBinnedArray(
open(qualfile + "." + chrom + ".bqv", "rb"), cache=self.cache / len(qualfiles)
)
self.qualities[species] = specdict
def __call__(self, block):
if not block:
return
for qualspec in self.qualities:
comp = block.get_component_by_src_start(qualspec)
chrom = comp.src.split(".")[1]
start, end = comp.get_forward_strand_start(), comp.get_forward_strand_end()
# get quality slice, for + strand
qual = self.qualities[qualspec][chrom][start:end]
x = 0
while start + x < end:
self.total += 1
# got the column in the alignment for this particular base
if qual[x] < self.minqual:
col = comp.coord_to_col(start + x)
self.masked += 1
for component in block.components:
if component.text[col] != "-":
component.text = (
component.text[0:col] + self.mask + component.text[col + 1 : len(component.text)]
)
# iterate through quality
x += 1
return block
|