1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
|
#include <iostream>
#include <vector>
#include <tr1/tuple>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <set>
#include <utility>
#include <algorithm>
#include <tr1/memory>
#include <cmath>
#include <cstring>
#include <limits>
#include "util.hpp"
#include "DNASeq.hpp"
#include "MMAPReads.hpp"
#include "NeighborSet.hpp"
using namespace std;
struct VoteInfo {
int pos;
int base;
double log_quality;
bool reverse_complement;
bool prior;
VoteInfo(int pos, int base, double log_quality, bool reverse_complement, bool prior=false):pos(pos),base(base),log_quality(log_quality),reverse_complement(reverse_complement),prior(prior) {}
};
void zeroMat(int seq_len, double mat[][4][4]) {
for(int l=0; l<seq_len; l++)
for(int i=0; i<4; i++)
for(int j=0; j<4; j++)
mat[l][i][j]=0;
}
void initLoglikelihoodMat(const Options& opt, int max_seq_len, double confMat[][4][4], double loglikelihood[][4][4]) {
zeroMat(max_seq_len, confMat);
zeroMat(max_seq_len, loglikelihood);
// Initialize error loglikelihood matrix.
if(opt.confMatFName==NULL) {
// If no confusion matrix supplied, use majority weight instead.
for(int pos=0; pos<max_seq_len; pos++)
for(int b1=0; b1<4; b1++)
for(int b2=0; b2<4; b2++)
if(b1==b2)
loglikelihood[pos][b1][b2]=log(0.99);
else
loglikelihood[pos][b1][b2]=log(0.01/3);
} else {
// Otherwise, load confusion matrix.
ifstream fin(opt.confMatFName);
vector<vector<int> > tot(max_seq_len);
int file_len;
fin >> file_len;
for(int pos=0; pos<file_len; pos++) {
tot[pos].resize(4);
for(int trueb=0; trueb<4; trueb++)
for(int calledb=0; calledb<4; calledb++) {
fin >> confMat[pos][trueb][calledb];
confMat[pos][trueb][calledb]+=1.0;
tot[pos][calledb] += confMat[pos][trueb][calledb];
}
}
fin.close();
// Normalize and take the log.
for(int pos=0; pos<file_len; pos++)
for(int trueb=0; trueb<4; trueb++)
for(int calledb=0; calledb<4; calledb++)
loglikelihood[pos][calledb][trueb] = log(confMat[pos][trueb][calledb]) - log(tot[pos][calledb]);
// Fill the gap.
for(int pos=file_len; pos<max_seq_len; pos++)
for(int b1=0; b1<4; b1++)
for(int b2=0; b2<4; b2++)
loglikelihood[pos][b1][b2] = loglikelihood[file_len-1][b1][b2];
}
}
void generateHypothesis(bool heterozygous, vector<tr1::tuple<int, int, int> >& hypothesis) {
hypothesis.clear();
if(heterozygous) {
for(int b1=0; b1<4; b1++)
for(int b2=b1; b2<4; b2++) {
hypothesis.push_back(tr1::make_tuple(b1, b2, b1*4+b2));
}
} else {
for(int b=0; b<4; b++)
hypothesis.push_back(tr1::make_tuple(b, b, b*4+b));
}
}
int main(int argc, char** argv) {
// Initialize constants.
Options opt(argc, argv);
// Initialize reads, votes, and confusion matrix.
MMAPReads readfile(opt.readFName);
int max_seq_len = 0;
for(unsigned int readid=opt.read_st; readid<opt.read_ed; readid++) {
int seq_len = strlen(readfile[readid]);
if(seq_len>max_seq_len) max_seq_len = seq_len;
}
// candidate hypothesis
vector<tr1::tuple<int, int, int> > hypothesis;
generateHypothesis(opt.h_rate>0, hypothesis);
// Confusion matrix.
// Error loglikelihood matrix.
double confMat[max_seq_len][4][4];
double loglikelihood[max_seq_len][4][4];
initLoglikelihoodMat(opt, max_seq_len, confMat, loglikelihood);
// Voting Mechanism.
// Statitstics.
vector<int> histogram(500, 0);
set<unsigned int> hist_readset;
// Open output file.
ostringstream fname, fqualname;
fname << opt.fpre << "output_" << opt.fsuf << ".txt";
fqualname << opt.fpre << "quality_" << opt.fsuf << ".txt";
ofstream fout, fqual;
fout.open(fname.str().c_str());
fqual.open(fqualname.str().c_str());
// initialize NeighborSetLoaders
vector<tr1::shared_ptr<NeighborSetLoader> > neighborLoader;
for(size_t fiter=0; fiter<opt.inputFNames.size(); fiter++)
neighborLoader.push_back(tr1::shared_ptr<NeighborSetLoader>(new NeighborSetLoader(opt.inputFNames[fiter])));
// Zero out confusion matrix.
zeroMat(max_seq_len, confMat);
for(unsigned int readid = opt.read_st; readid<opt.read_ed; readid++) {
const string orig_seq = string(readfile[readid]);
const int seq_len = orig_seq.size();
string corr_seq(seq_len, 0);
string qual_seq(seq_len, 0);
vector<vector<VoteInfo> > Votes(seq_len);
vector<int> nVotes(seq_len, 0);
// If read is reverse complement read, output directly.
if(!readfile.isOrig(readid)) {
for(int i=0; i<seq_len; i++) {
corr_seq[i] = 'N';
qual_seq[i] = 33;
}
fout << corr_seq << endl;
fqual << qual_seq << endl;
continue;
}
// construct neighbors
tr1::shared_ptr<map<unsigned int, NeighborInfo> > readNeighbors(new map<unsigned int, NeighborInfo>);
for(size_t fiter=0; fiter<opt.inputFNames.size(); fiter++) {
tr1::shared_ptr<map<unsigned int, NeighborInfo> > newNeighbors = neighborLoader[fiter]->get(readid);
for(map<unsigned int, NeighborInfo>::iterator nn = newNeighbors->begin(); nn!=newNeighbors->end(); nn++) {
map<unsigned int, NeighborInfo>::iterator conflict = readNeighbors->find(nn->first);
int seq_len2 = strlen(readfile[nn->first]);
if(conflict==readNeighbors->end() || // No conflict.
( nn->second!=conflict->second && nn->second.isBetter(conflict->second, seq_len, seq_len2) ))// Different alignment and conflict resolution.
(*readNeighbors)[nn->first] = nn->second;
}
}
// Voting.
// Collect votes.
set<unsigned int> my_neighbors;
for(map<unsigned int, NeighborInfo>::iterator neighbors = readNeighbors->begin();
neighbors!=readNeighbors->end(); neighbors++) {
const unsigned int& neighborId = neighbors->first;
const char* neighbor_seq = readfile[neighborId];
const int neighbor_seq_len = strlen(neighbor_seq);
const int overlap = neighbors->second.get_overlap(seq_len, neighbor_seq_len);
const double log_quality = 0;
// This check is not redundant, it is used when selecting h/e
// so that the neighbor computation can be reused.
if(!neighbors->second.isNeighbor(seq_len, neighbor_seq_len, opt.K, opt.h, opt.e))
continue;
if(opt.save_stats)
my_neighbors.insert(neighborId);
const int& st1 = neighbors->second.get_st1();
const int& st2 = neighbors->second.get_st2();
const char* overlap_seq = &neighbor_seq[st2];
const bool orig = readfile.isOrig(neighborId);
int pos = orig?(st2):(neighbor_seq_len-(st2)-1);
for(int offset=0; offset<overlap; offset++) {
int calledb = baseToInt(overlap_seq[offset]);
if(!isAnyBase(calledb)) {
if(!orig) calledb = 3-calledb;
Votes[st1+offset].push_back(VoteInfo(pos, calledb, log_quality, !orig));
nVotes[st1+offset] += 1;
// Original read gets to vote twice.
if(readid==neighborId) {
int nprior = max(1, (int)floor(sqrt(opt.cov)));
if(opt.h_rate!=0) nprior = 0;
for(int z=0; z<nprior; z++)
Votes[st1+offset].push_back(VoteInfo(pos, calledb, log_quality, !orig, true));
}
}
if(orig) pos++; else pos--;
}
}
// Extract most likely sequence and output.
for(int i=0; i<seq_len; i++) {
const int orig_base = baseToInt(orig_seq.at(i));
vector<float> base_loglikelihood(16, 0.0); // Likelihood with prior votes.
vector<float> base_logquality(16, 0.0); // Likelihood without prior votes.
// perform actual ML estimation
for(vector<VoteInfo>::iterator pVote=Votes[i].begin(); pVote!=Votes[i].end(); pVote++)
for(vector<tr1::tuple<int,int,int> >::iterator H=hypothesis.begin(); H!=hypothesis.end(); H++) {
int& b1 = tr1::get<0>(*H);
int& b2 = tr1::get<1>(*H);
int& b1b2 = tr1::get<2>(*H);
int read_b1 = pVote->reverse_complement?3-b1:b1;
int read_b2 = pVote->reverse_complement?3-b2:b2;
if(b1>b2) {
base_loglikelihood[b1b2] = -numeric_limits<float>::infinity();
if(!pVote->prior)
base_logquality[b1b2] = -numeric_limits<float>::infinity();
} else if(b1==b2) {
// Homozygous case.
base_loglikelihood[b1b2] += loglikelihood[pVote->pos][pVote->base][read_b1] + pVote->log_quality;
if(!pVote->prior)
base_logquality[b1b2] += loglikelihood[pVote->pos][pVote->base][read_b1] + pVote->log_quality;
} else { // b1 < b2
// Heterozygous case.
base_loglikelihood[b1b2] += log(0.5*exp(loglikelihood[pVote->pos][pVote->base][read_b1]) + 0.5*exp(loglikelihood[pVote->pos][pVote->base][read_b2])) + pVote->log_quality;
if(!pVote->prior)
base_logquality[b1b2] += log(0.5*exp(loglikelihood[pVote->pos][pVote->base][read_b1]) + 0.5*exp(loglikelihood[pVote->pos][pVote->base][read_b2])) + pVote->log_quality;
}
}
tr1::tuple<int, int, int> mse_base;
double max_loglikelihood = -numeric_limits<double>::infinity();
for(vector<tr1::tuple<int,int,int> >::iterator H=hypothesis.begin(); H!=hypothesis.end(); H++) {
int& b1 = tr1::get<0>(*H);
int& b2 = tr1::get<1>(*H);
int& b1b2 = tr1::get<2>(*H);
if(b1==b2) {
base_loglikelihood[b1b2] += log(0.25) + log(1.0 - opt.h_rate);
base_logquality[b1b2] += log(0.25) + log(1.0 - opt.h_rate);
} else if(b1<b2) {
base_loglikelihood[b1b2] += -log(6.0) + log(opt.h_rate);
base_logquality[b1b2] += -log(6.0) + log(opt.h_rate);
}
if(base_loglikelihood[b1b2]>max_loglikelihood) {
max_loglikelihood = base_loglikelihood[b1b2];
mse_base = *H;
}
}
// Extract ML.
if(nVotes[i]!=0 && (nVotes[i]<=opt.max_cov || opt.max_cov==0) && nVotes[i]>=opt.min_cov) {
// accept changes
corr_seq[i] = intToBase(tr1::get<0>(mse_base), tr1::get<1>(mse_base));
} else {
// Reject correction if received too many/few votes.
corr_seq[i] = orig_seq.at(i);
tr1::get<0>(mse_base) = tr1::get<1>(mse_base) = orig_base;
}
// Quality score computation.
double total_prob=0, error_prob=0;
if(!isAnyBase(tr1::get<0>(mse_base)) && !isAnyBase(tr1::get<0>(mse_base)))
for(vector<tr1::tuple<int,int,int> >::iterator H=hypothesis.begin(); H!=hypothesis.end(); H++) {
int& b1 = tr1::get<0>(*H);
int& b2 = tr1::get<1>(*H);
int& b1b2 = tr1::get<2>(*H);
float prob = exp(base_logquality[b1b2]-base_logquality[tr1::get<2>(mse_base)]);
total_prob += prob;
if(b1!=tr1::get<0>(mse_base) || b2!=tr1::get<1>(mse_base))
error_prob += prob;
}
error_prob /= total_prob;
if(error_prob >= 1e-100)
qual_seq[i] = min(93, max(0, (int)floor(-10.0*log(error_prob) / log(10.0) / max(1.0, (double)nVotes[i])))) + 33;
else
qual_seq[i] = 33+93;
// update confusion matrix
if(total_prob>0 && !isAnyBase(orig_base))
for(vector<tr1::tuple<int,int,int> >::iterator H=hypothesis.begin(); H!=hypothesis.end(); H++) {
int& b1 = tr1::get<0>(*H);
int& b2 = tr1::get<1>(*H);
int& b1b2 = tr1::get<2>(*H);
float prob = exp(base_logquality[b1b2]-base_logquality[tr1::get<2>(mse_base)]);
if(readfile.isOrig(readid)) {
confMat[i][b1][orig_base]+=0.5*prob/total_prob;
confMat[i][b2][orig_base]+=0.5*prob/total_prob;
} else {
confMat[seq_len-i-1][3-b1][3-orig_base]+=0.5*prob/total_prob;
confMat[seq_len-i-1][3-b2][3-orig_base]+=0.5*prob/total_prob;
}
}
}
// Update histogram.
if(opt.save_stats){
size_t max_vote = *max_element(nVotes.begin(), nVotes.end());
if(max_vote>histogram.size())
histogram.resize(max_vote, 0);
if(hist_readset.find(readid)==hist_readset.end()){
// All neighbors will not vote in the future and only beginning of the read gets to vote to improve independence.
hist_readset.insert(my_neighbors.begin(), my_neighbors.end());
if(readfile.isOrig(readid))
histogram[nVotes[0]]+=1;
else
histogram[nVotes[seq_len-1]]+=1;
}
}
// Output corrected read.
fout << corr_seq << endl;
fqual << qual_seq << endl;
}
fout.close();
fqual.close();
// Output other stats.
// Output histogram.
if(opt.save_stats) {
fname.str(string());
fname << opt.fpre << "histogram_" << opt.fsuf << ".txt";
fout.open(fname.str().c_str());
for(size_t i=0; i<histogram.size(); i++)
fout << histogram[i] << ' ';
fout.close();
// Output confusion matrix.
fname.str(string());
fname << opt.fpre << "confmat_" << opt.fsuf << ".txt";
fout.open(fname.str().c_str());
fout << max_seq_len << endl;
for(int pos=0; pos<max_seq_len; pos++) {
for(int b1=0; b1<4; b1++) {
for(int b2=0; b2<4; b2++)
fout << confMat[pos][b1][b2] << ' ';
fout << endl;
}
fout << endl;
}
fout.close();
}
return 0;
}
|