1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
|
// Copyright 2010 Martin C. Frith
// These are routines for masking simple regions (low-complexity and
// short-period tandem repeats) in biological sequences. To
// understand them in detail, see the published article (in
// preparation).
// Typically, you would just use the maskSequences routine. The other
// routines are more specialized. The inputs to maskSequences are as
// follows.
// seqBeg: pointer to the start of the sequence.
// seqEnd: pointer to one-past-the-end of the sequence.
// maxRepeatOffset: the maximum tandem-repeat period-size to consider.
// likelihoodRatioMatrix: a matrix of the form Qxy / (Px * Py), where
// Qxy is the probability of seeing letters x and y in equivalent
// positions of a tandem repeat, and Px and Py are the background
// probabilities of the letters. This matrix is related to a scoring
// matrix (e.g. blosum62) by:
// likelihoodRatioMatrix[x][y] = exp(lambda * scoringMatrix[x][y]).
// The uchars in the sequence will be used as indexes into the matrix
// (e.g. likelihoodRatioMatrix[x][y]). So typically the uchars are
// small integers (e.g. 0, 1, 2, 3). The matrix needs to have entries
// for any uchars that can occur.
// repeatProb: the probability of a repetitive segment starting per position.
// repeatEndProb: the probability of a repetitive segment ending per position.
// repeatOffsetProbDecay: the probability of a period-(i) repeat
// divided by the probability of a period-(i-1) repeat.
// firstGapProb: the probability of initiating an insertion or
// deletion in a repetitive region.
// otherGapProb: the probability of extending an insertion or deletion
// by one more letter.
// minMaskProb: mask letters whose posterior probability of being
// repetitive is >= this.
// maskTable: how to do the masking. Letter x will be changed to
// maskTable[x]. So maskTable needs to have entries for any uchar
// that can occur.
// Typical usage:
// tantan::maskSequences(seqBeg, seqEnd, 100, likelihoodRatioMatrix,
// 0.005, 0.05, 0.9, 0, 0, 0.5, maskTable)
#ifndef TANTAN_HH
#define TANTAN_HH
namespace tantan {
typedef unsigned char uchar;
typedef const double *const_double_ptr;
void maskSequences(uchar *seqBeg,
uchar *seqEnd,
int maxRepeatOffset,
const const_double_ptr *likelihoodRatioMatrix,
double repeatProb,
double repeatEndProb,
double repeatOffsetProbDecay,
double firstGapProb,
double otherGapProb,
double minMaskProb,
const uchar *maskTable);
// The following routine gets the posterior probability that each
// letter is repetitive. It stores the results in "probabilities",
// which must point to enough pre-allocated space to fit the results.
void getProbabilities(const uchar *seqBeg,
const uchar *seqEnd,
int maxRepeatOffset,
const const_double_ptr *likelihoodRatioMatrix,
double repeatProb,
double repeatEndProb,
double repeatOffsetProbDecay,
double firstGapProb,
double otherGapProb,
float *probabilities);
// The following routine masks each letter whose corresponding entry
// in "probabilities" is >= minMaskProb.
void maskProbableLetters(uchar *seqBeg,
uchar *seqEnd,
const float *probabilities,
double minMaskProb,
const uchar *maskTable);
// The following routine counts the expected number of transitions
// from the background (non-repeat) state to other states. It adds
// the results to "transitionCounts", which must point to
// pre-initialized space for (maxRepeatOffset+1) items. The
// background->background transition count is stored in
// transitionCounts[0]. The background->(period-i repeat) transition
// count is stored in transitionCounts[i].
// (In this routine, the HMM begin and end states are counted as
// background states. Thus, begin->X is added to background->X, and
// X->end is added to X->background.)
void countTransitions(const uchar *seqBeg,
const uchar *seqEnd,
int maxRepeatOffset,
const const_double_ptr *likelihoodRatioMatrix,
double repeatProb,
double repeatEndProb,
double repeatOffsetProbDecay,
double firstGapProb,
double otherGapProb,
double *transitionCounts);
}
#endif
|