File: MLE.cpp

package info (click to toggle)
abyss 2.3.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,284 kB
  • sloc: cpp: 78,182; ansic: 6,512; makefile: 2,252; perl: 672; sh: 509; haskell: 412; python: 4
file content (211 lines) | stat: -rw-r--r-- 5,685 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#include "MLE.h"
#include "PMF.h"
#include <boost/tuple/tuple.hpp>
#include <algorithm> // for swap
#include <cassert>
#include <limits> // for numeric_limits
#include <utility>

using namespace std;
using boost::tie;

/** This window function is a triangle with a flat top, or a rectangle
 * with sloped sides.
 */
class WindowFunction {
	public:
		WindowFunction(int len0, int len1)
			: x1(len0), x2(len1), x3(len0 + len1)
		{
			assert(len0 > 0);
			assert(len1 > 0);
			assert(len0 <= len1);
		}

		/** Return this window function evaluated at x. */
		double operator()(int x) const
		{
			return (x <= 0 ? 1
					: x < x1 ? x
					: x < x2 ? x1
					: x < x3 ? x3 - x
					: 1) / (double)x1;
		}

	private:
		/** Parameters of this window function. */
		int x1, x2, x3;
};

/** This is a normalized zero-phase Hann window function with
 * specified size. */
class HannWindow {
	public:
		HannWindow(int size) : size(size)
		{
			sum = getSum();
		}

		/** Return normalized Hann window value at i centered at 0. */
		double operator()(int i) const
		{
			return value(i + size/2) / sum;
		}

		/** Return the Hann window value at i. */
		double value(int i) const
		{
			if (i < 0 || i >= size)
				return 0;
			return 0.5 * (1 - cos(2 * M_PI * i/(size - 1)));
		}

		/** Get the sum of all values in this window. */
		double getSum()
		{
			double sum = 0;
			for (int i = 0; i < size; i++)
				sum += value(i);
			return sum;
		}

	private:
		double sum;
		int size;
};

/** Compute the log likelihood that these samples came from the
 * specified distribution shifted by the parameter theta.
 * @param theta the parameter of the PMF, f_theta(x)
 * @param samples the samples
 * @param pmf the probability mass function
 * @return the log likelihood
 */
static pair<double, unsigned>
computeLikelihood(int theta, const Histogram& samples, const PMF& pmf)
{
	double likelihood = 0;
	unsigned nsamples = 0;
	for (Histogram::const_iterator it = samples.begin();
			it != samples.end(); ++it) {
		double p = pmf[it->first + theta];
		unsigned n = it->second;
		likelihood += n * log(p);
		if (p > pmf.minProbability())
			nsamples += n;
	}
	return make_pair(likelihood, nsamples);
}

/** Return the most likely distance between two contigs and the number
 * of pairs that support that estimate. */
static pair<int, unsigned>
maximumLikelihoodEstimate(int first, int last,
		const Histogram& samples,
		const PMF& pmf,
		unsigned len0, unsigned len1)
{
	int filterSize = 2 * (int)(0.05 * pmf.mean()) + 3; // want an odd filter size
	first = max(first, (int)pmf.minValue() - samples.maximum()) - filterSize/2;
	last = min(last, (int)pmf.maxValue() - samples.minimum()) + filterSize/2 + 1;

	/* When randomly selecting fragments that span a given point,
	 * longer fragments are more likely to be selected than
	 * shorter fragments.
	 */
	WindowFunction window(len0, len1);

	unsigned nsamples = samples.size();
	double bestLikelihood = -numeric_limits<double>::max();
	int bestTheta = first;
	unsigned bestn = 0;
	vector<double> le;
	vector<unsigned> le_n;
	vector<int> le_theta;
	for (int theta = first; theta <= last; theta++) {
		// Calculate the normalizing constant of the PMF, f_theta(x).
		double c = 0;
		for (int i = pmf.minValue(); i <= (int)pmf.maxValue(); ++i)
			c += pmf[i] * window(i - theta);

		double likelihood;
		unsigned n;
	   	tie(likelihood, n) = computeLikelihood(theta, samples, pmf);
		likelihood -= nsamples * log(c);
		le.push_back(likelihood);
		le_n.push_back(n);
		le_theta.push_back(theta);
	}

	HannWindow filter(filterSize);
	for (int i = filterSize / 2; i < (int)le.size()-(filterSize / 2); i++) {
		double likelihood = 0;
		for (int j = -filterSize / 2; j <= filterSize / 2; j++) {
			assert((unsigned)(i + j) < le.size() && i + j >= 0);
			likelihood += filter(j) * le[i + j];
		}

		if (le_n[i] > 0 && likelihood > bestLikelihood) {
			bestLikelihood = likelihood;
			bestTheta = le_theta[i];
			bestn = le_n[i];
		}
	}
	return make_pair(bestTheta, bestn);
}

/** Return the most likely distance between two contigs and the number
 * of pairs that support that distance estimate.
 * @param len0 the length of the first contig in bp
 * @param len1 the length of the second contig in bp
 * @param rf whether the fragment library is oriented reverse-forward
 * @param[out] n the number of samples with a non-zero probability
 */
int maximumLikelihoodEstimate(unsigned l,
		int first, int last,
		const vector<int>& samples, const PMF& pmf,
		unsigned len0, unsigned len1, bool rf,
		unsigned& n)
{
	assert(first < last);
	assert(!samples.empty());

	// The aligner is unable to map reads to the ends of the sequence.
	// Correct for this lack of sensitivity by subtracting l-1 bp from
	// the length of each sequence, where the aligner requires a match
	// of at least l bp. When the fragment library is oriented
	// forward-reverse, subtract 2*(l-1) from each sample.
	assert(l > 0);
	assert(len0 >= l);
	assert(len1 >= l);
	len0 -= l - 1;
	len1 -= l - 1;

	if (len0 > len1)
		swap(len0, len1);

	if (rf) {
		// This library is oriented reverse-forward.
		Histogram h(samples.begin(), samples.end());
		int d;
		tie(d, n) = maximumLikelihoodEstimate(
				first, last, h,
				pmf, len0, len1);
		return d;
	} else {
		// This library is oriented forward-reverse.
		// Subtract 2*(l-1) from each sample.
		Histogram h;
		typedef vector<int> Samples;
		for (Samples::const_iterator it = samples.begin();
				it != samples.end(); ++it) {
			assert(*it > 2 * (int)(l - 1));
			h.insert(*it - 2 * (l - 1));
		}
		int d;
		tie(d, n) = maximumLikelihoodEstimate(
				first, last, h,
				pmf, len0, len1);
		return max(first, d - 2 * (int)(l - 1));
	}
}