import array as a
import re
import sys

import numpy as np
from numpy import log10, e
import scipy.stats as s

log10e = log10(e)

canonicalBaseMap = {'A': 'A', 'C': 'C', 'G': 'G',
                    'T': 'T', 'H': 'A', 'I': 'C', 'J': 'C', 'K': 'C'}
modNames = {'H': 'm6A', 'I': 'm5C', 'J': 'm4C', 'K': 'm5C'}
m5CCode = 'I'

iupacMap = {
    'A': 'A',
    'C': 'C',
    'G': 'G',
    'T': 'T',
    'K': 'GT',
    'M': 'AC',
    'R': 'AG',
    'Y': 'CT',
    'S': 'CG',
    'W': 'AT',
    'B': 'CGT',
    'D': 'AGT',
    'H': 'ACT',
    'V': 'ACG',
    'N': 'ACGT'
}


def findMotifPositions(seq, motifs):
    # Generate a regex for each motif, honouring degenerate (IUPAC) bases
    regexs = []
    for m in motifs:
        regex = ''.join("[" + iupacMap[c] + "]" for c in m)
        regexs.append(regex)

    # Collect the start position of every match of every motif
    allMatches = []
    for r in regexs:
        rr = re.compile(r)
        matches = [x.start() for x in rr.finditer(seq)]
        allMatches.extend(matches)

    allMatches.sort()
    return allMatches
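
# For instance, with the IUPAC map above the degenerate motif 'GANTC' expands
# to the regex '[G][A][ACGT][T][C]', so
#
#   findMotifPositions("TTGATCAAGACTC", ["GATC", "GANTC"])
#
# returns the sorted, 0-based start positions of all motif hits, here [2, 8].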


class MultiSiteDetection(object):

    def __init__(self, gbmModel, sequence, rawKinetics,
                 callBounds, methylMinCov, motifs=['CG']):
        """
        Detect modifications at motif sites in `sequence`, using `gbmModel`
        to predict context mean IPDs and `rawKinetics` for the per-position
        observations.  Calls are only attempted inside `callBounds`.
        """
        self.methylMinCov = methylMinCov
        self.motifs = motifs
        self.gbmModel = gbmModel
        self.sequence = sequence

        self.callStart = callBounds[0]
        self.callEnd = callBounds[1]

        # Extent over which we will attempt to call a modification
        self.callRange = range(self.callStart, self.callEnd)

        # pre and post switch because we are changing viewpoints
        self.pre = gbmModel.post
        self.post = gbmModel.pre

        self.lStart = self.pre
        self.lEnd = len(self.sequence) - self.post

        # Extent that we will use for likelihoods
        self.likelihoodRange = range(self.lStart, self.lEnd)

        # For each position, the list of candidate bases: the canonical base,
        # plus any modified base added later by findMotifs()
        self.alternateBases = {x: [sequence[x]] for x in range(len(sequence))}

        self.rawKinetics = rawKinetics

    def getConfigs(self, centerIdx):
        ''' Enumerate all the contexts centered at centerIdx with one
        modification added '''
        start = centerIdx - self.pre
        end = centerIdx + self.post
        return self._possibleConfigs(start, end)

    def _possibleConfigs(self, start, end):
        ''' Enumerate all the contexts coming from the substring
        self.sequence[start:end] with at most one modification added '''
        if start == end:
            return self.alternateBases[start]
        else:
            r = []
            allSuffixes = self._possibleConfigs(start + 1, end)

            # The first suffix is always the one with no modifications.
            # Only add the alternate bases to that one -- that way we only
            # get configurations with a single modification, not all combos.
            noModsSuffix = allSuffixes[0]
            if len(allSuffixes) > 1:
                restSuffixes = allSuffixes[1:]
            else:
                restSuffixes = []

            # The no-modification suffix gets all the alternate bases
            for c in self.alternateBases[start]:
                r.append(c + noModsSuffix)

            # The other suffixes already carry a modification -- they just
            # get the unmodified base
            for suffix in restSuffixes:
                r.append(self.alternateBases[start][0] + suffix)

            return r
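
    # Sketch of the enumeration above, with illustrative values: if
    # self.alternateBases were {0: ['A'], 1: ['C', 'I'], 2: ['G'], 3: ['T']},
    # then _possibleConfigs(0, 3) would return ['ACGT', 'AIGT'] -- the
    # unmodified context first, followed by one context per candidate
    # modification, never combinations of several modifications.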

    # Collect the context string around every position in [start, end]
    def getContexts(self, start, end, sequence):
        contexts = []
        for pos in range(start, end + 1):
            ctx = sequence[(pos - self.pre):(pos + self.post + 1)].tounicode()
            contexts.append(ctx)
        return contexts

    def computeContextMeans(self):
        """Generate a hash of the mean ipd for all candidate contexts"""
        allContexts = []
        for pos in self.motifPositions:
            for offsetPos in range(pos - self.post, pos + self.pre + 1):
                cfgs = self.getConfigs(offsetPos)
                allContexts.extend(cfgs)

        predictions = self.gbmModel.getPredictions(allContexts)
        self.contextMeanTable = dict(zip(allContexts, predictions))
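
    # The resulting contextMeanTable maps each candidate context string
    # (unmodified, or carrying a single modified base) to the mean IPD the
    # GBM model predicts for it; getRegionPredictions() below looks contexts
    # up in this table.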

    def decode(self):
        """Use this method to do the full modification finding protocol"""
        # Find sites matching the desired motif
        self.findMotifs()

        # Compute all the required mean ipds under all possible composite
        # hypotheses
        self.computeContextMeans()

        # Compute a confidence for each mod and return results
        return self.scorePositions()

    def findMotifs(self):
        """ Mark all the positions matching the requested motifs """
        # Generate the list of matching positions
        allMotifPositions = findMotifPositions(self.sequence, self.motifs)

        self.motifPositions = []

        for pos in allMotifPositions:
            # Only use bases that are inside the callBounds
            if self.callStart <= pos < self.callEnd:
                # Add the modified base as an alternate at this position
                self.alternateBases[pos].append(m5CCode)
                self.motifPositions.append(pos)

    def multiSiteDetection(self, positions, nullPred, modPred, centerPosition):
        ''' positions, nullPred, and modPred are parallel arrays containing
        the positions, unmodified predictions, and modified predictions
        surrounding a single candidate motif site.  Estimate the p-value of
        modification and the modified fraction, and store the results in
        centerPosition. '''

        # Apply the error model to the predictions
        nullErr = 0.01 + 0.03 * nullPred + 0.06 * nullPred ** 1.7
        modErr = 0.01 + 0.03 * modPred + 0.06 * modPred ** 1.7

        obsMean = np.zeros(nullPred.shape)
        obsErr = np.zeros(nullPred.shape)

        # Get the observations into the same array format
        for i in range(len(positions)):
            position = positions[i]

            if position in self.rawKinetics:
                siteObs = self.rawKinetics[position]
                obsMean[i] = siteObs['tMean']
                obsErr[i] = siteObs['tErr']
            else:
                # Crank up the variance -- we don't have an observation at
                # this position, so we should ignore it.
                obsMean[i] = 0.0
                obsErr[i] = 999999999

        # Subtract off the background model from the observations and the
        # modified prediction
        dObs = obsMean - nullPred

        # Errors of observation and prediction are uncorrelated
        obsSigma = obsErr ** 2 + nullErr ** 2
        invObsSigma = 1.0 / obsSigma

        # Errors of the null prediction and the mod prediction are probably
        # correlated -- need a better estimate of the error of the difference!!
        dPred = modPred - nullPred
        # Just stubbing in a factor of 2 here...
        dPredSigma = (obsErr ** 2 + nullErr ** 2) / 2

        weightsNumerator = invObsSigma * dPred
        weights = weightsNumerator / (dPred * weightsNumerator).sum()

        signalEstimate = (weights * dObs).sum()
        varianceEstimate = (np.abs(weights) * obsSigma).sum()

        maxSignal = (weights * dPred).sum()
        maxSignalVariance = (np.abs(weights) * dPredSigma).sum()

        # Now just run the standard erf on this Gaussian to quantify the
        # probability that there is some signal.
        # What we want:
        #  1. p-value that dObs * dPred (dot product) is greater than 0.
        #  2. Distribution of \alpha, where dObs = \alpha * dPred, with
        #     \alpha \in [0,1], with appropriate error propagation.
        #  2a. Is it possible to summarize 2 with a Beta distribution?

        pvalue = s.norm.cdf(-signalEstimate / varianceEstimate)
        pvalue = max(sys.float_info.min, pvalue)
        score = -10.0 * log10(pvalue)

        centerPosition['MSscore'] = score
        centerPosition['MSpvalue'] = pvalue

        centerPosition['signal'] = signalEstimate
        centerPosition['variance'] = varianceEstimate

        centerPosition['modelSignal'] = maxSignal
        centerPosition['modelVariance'] = maxSignalVariance

        centerPosition['Mask'] = []

        return centerPosition
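
    # Reading of the estimator above: the weights are proportional to
    # dPred / obsSigma and are normalized so that (weights * dPred).sum() == 1.
    # signalEstimate is therefore an inverse-variance-weighted estimate of the
    # fraction alpha in dObs ~= alpha * dPred -- roughly 0 for an unmodified
    # site and roughly 1 for a fully modified one -- while maxSignal records
    # the value it would take at full modification.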

    def scorePositions(self):
        """
        Score each motif site in the sequence.
        """
        qvModCalls = dict()

        # Mutable copy of the sequence so we can swap in the modified base
        dnaSeq = a.array('u')
        dnaSeq.fromunicode(self.sequence)

        for pos in self.motifPositions:
            if pos in self.rawKinetics:

                # Fetch the unmodified predictions
                nullPred = self.getRegionPredictions(
                    pos - self.post, pos + self.pre, dnaSeq)

                # Fetch the modified predictions, then restore the sequence
                originalBase = dnaSeq[pos]
                dnaSeq[pos] = m5CCode
                modifiedPred = self.getRegionPredictions(
                    pos - self.post, pos + self.pre, dnaSeq)
                dnaSeq[pos] = originalBase

                # Positions that contribute to this call
                positions = range(pos - self.post, pos + self.pre + 1)

                # Run the multi-site detection and save the results
                centerStats = self.rawKinetics[pos]
                centerStats = self.multiSiteDetection(
                    positions, nullPred, modifiedPred, centerStats)

                qvModCalls[pos] = centerStats

        return qvModCalls

    def getRegionPredictions(self, start, end, sequence):
        predictions = np.zeros(end - start + 1)

        for pos in range(start, end + 1):
            ctx = sequence[(pos - self.pre):(pos + self.post + 1)].tounicode()
            predictions[pos - start] = self.contextMeanTable[ctx]

        return predictions
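

# A minimal smoke test of the motif-matching helper.  Exercising the full
# MultiSiteDetection class needs a trained gbmModel and real kinetics, so this
# sketch only checks findMotifPositions on made-up example data.
if __name__ == "__main__":
    exampleSeq = "TTGATCAAGACTCCGG"
    exampleMotifs = ["GATC", "CG"]
    # Expected output: [2, 13]
    print(findMotifPositions(exampleSeq, exampleMotifs))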