File: Singleerrorrate.cpp

package info (click to toggle)
igor 1.4.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,116 kB
  • sloc: cpp: 12,453; python: 1,047; sh: 124; makefile: 33
file content (285 lines) | stat: -rw-r--r-- 9,025 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/*
 * Singleerrorrate.cpp
 *
 *  Created on: Jan 23, 2015
 *      Author: Quentin Marcou
 *
 *  This source code is distributed as part of the IGoR software.
 *  IGoR (Inference and Generation of Repertoires) is a versatile software to analyze and model immune receptors
 *  generation, selection, mutation and all other processes.
 *   Copyright (C) 2017  Quentin Marcou
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.

 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */

#include "Singleerrorrate.h"

using namespace std;

Single_error_rate::Single_error_rate(): Single_error_rate(0.0) {}

Single_error_rate::Single_error_rate(double error_rate): Error_rate() , model_rate(error_rate) , normalized_counter(0) , seq_weighted_er(0)  {
	build_upper_bound_matrix(1,1);
}

Single_error_rate::~Single_error_rate() {
	// TODO Auto-generated destructor stub
}

Single_error_rate Single_error_rate::operator +(Single_error_rate err_r){
	Single_error_rate temp =  *this;
	return temp+=err_r;

}

Single_error_rate& Single_error_rate::operator +=(Single_error_rate err_r){
	this->normalized_counter+= err_r.normalized_counter;
	this->number_seq+=err_r.number_seq;
	this->model_log_likelihood+=err_r.model_log_likelihood;
	return *this;
}

shared_ptr<Error_rate> Single_error_rate::copy()const{
	shared_ptr<Single_error_rate> copy_err_r = shared_ptr<Single_error_rate>(new Single_error_rate(this->model_rate));
	copy_err_r->updated = this->updated;
	return copy_err_r;
}

Error_rate* Single_error_rate::add_checked(Error_rate* err_r){
	return &( this->operator +=( *( dynamic_cast< Single_error_rate*> (err_r) ) ) );
}

const double& Single_error_rate::get_err_rate_upper_bound(size_t n_errors , size_t n_error_free) {
	if( n_errors>this->max_err || n_error_free>this->max_noerr){
		//Need to increase the matrix size (anyway the matrix is at very most read_len^2
		this->build_upper_bound_matrix(max(this->max_err,n_errors + 10) , max(this->max_noerr , n_error_free+10));
	}

	return this->upper_bound_proba_mat(n_errors,n_error_free);

	//return this->model_rate/3;
	//TODO remove factor 3
}

void Single_error_rate::build_upper_bound_matrix(size_t m , size_t n){
	Matrix<double> new_bound_mat (m,n);
	for(size_t i=0 ; i!=new_bound_mat.get_n_rows() ; ++i){
		for(size_t j=0 ; j!=new_bound_mat.get_n_cols() ; ++j){
			if(i<this->max_err and j<this->max_noerr){
				new_bound_mat(i,j) = this->upper_bound_proba_mat(i,j);
			}
			else{
				new_bound_mat(i,j) = pow(this->model_rate/3,i)*pow(1-this->model_rate,j);
			}
			//new_bound_mat(i,j) = pow(this->model_rate/3,i)*pow(1-this->model_rate,j);
		}
	}
	this->upper_bound_proba_mat = new_bound_mat;
	this->max_err = new_bound_mat.get_n_rows()-1;
	this->max_noerr = new_bound_mat.get_n_cols()-1;
}


double Single_error_rate::compare_sequences_error_prob (double scenario_probability , const string& original_sequence ,  Seq_type_str_p_map& constructed_sequences , const Seq_offsets_map& seq_offsets , const unordered_map<tuple<Event_type,Gene_class,Seq_side>, shared_ptr<Rec_Event>>& events_map , Mismatch_vectors_map& mismatches_lists , double& seq_max_prob_scenario , double& proba_threshold_factor){
	//TODO extract sequence comparision from here, implement it in Errorrate class??
	number_errors=0;
	//cout<<constructed_sequences.at(V_gene_seq);
	genomic_nucl=0;

	Int_Str& v_gene_seq = (*constructed_sequences[V_gene_seq]);
	Int_Str& d_gene_seq = (*constructed_sequences[D_gene_seq]);
	Int_Str& j_gene_seq = (*constructed_sequences[J_gene_seq]);

	vector<int>& v_mismatch_list = *mismatches_lists[V_gene_seq];
	if(mismatches_lists.exist(D_gene_seq)){
		vector<int>& d_mismatch_list = *mismatches_lists[D_gene_seq];
		number_errors+=d_mismatch_list.size();
		genomic_nucl+=d_gene_seq.size();
	}

	vector<int>& j_mismatch_list = *mismatches_lists[J_gene_seq];

	genomic_nucl+=v_gene_seq.size();
	//genomic_nucl+=d_gene_seq.size();
	genomic_nucl+=j_gene_seq.size();

	number_errors+=v_mismatch_list.size();
	//number_errors+=d_mismatch_list.size();
	number_errors+=j_mismatch_list.size();


	 // Here a long double is required in case a lot of errors occur and/or the model rate is low, the probability will be truncated to 0 if it gets below ± 2.225,073,858,507,201,4 · 10-308 with double precision



	scenario_new_proba = scenario_probability*pow(model_rate/3,number_errors)*pow(1-model_rate,genomic_nucl-number_errors);
	if(scenario_new_proba >= seq_max_prob_scenario*proba_threshold_factor) {
		//if genomic nucl != 0 ?
		this->seq_mean_error_number +=  number_errors*scenario_new_proba;
		temp2 = (double(number_errors)/double(genomic_nucl));
		temp = scenario_new_proba*temp2;
		if(viterbi_run){
			this->seq_weighted_er = temp;
			this->seq_likelihood = scenario_new_proba;
			this->seq_probability = scenario_probability;
		}
		else{
			this->seq_weighted_er += temp;
			this->seq_likelihood += scenario_new_proba;
			this->seq_probability += scenario_probability;
		}

		++debug_number_scenarios;
		return scenario_new_proba;
	}
	else{
		return 0;
	}

}

queue<int> Single_error_rate::generate_errors(string& generated_seq , mt19937_64& generator) const{
	uniform_real_distribution<double> distribution(0.0,1.0);
	double rand_err ;// distribution(generator);
	double rand_trans;
	size_t index = 0;
	queue<int> errors_indices;
	for(string::iterator iter = generated_seq.begin() ; iter != generated_seq.end() ; ++iter){
		rand_err = distribution(generator);
		if(rand_err<=this->model_rate){
			//Introduce an error
			rand_trans = distribution(generator);
			errors_indices.push(index);

			if((*iter) == 'A'){
				if(rand_trans<= 1.0/3.0){
					(*iter) = 'C';
				}
				else if (rand_trans >= 2.0/3.0){
					(*iter) = 'G';
				}
				else{
					(*iter) = 'T';
				}
			}
			else if((*iter) == 'C'){
				if(rand_trans<= 1.0/3.0){
					(*iter) = 'A';
				}
				else if (rand_trans >= 2.0/3.0){
					(*iter) = 'G';
				}
				else{
					(*iter) = 'T';
				}
			}
			else if((*iter) == 'G'){
				if(rand_trans<= 1.0/3.0){
					(*iter) = 'A';
				}
				else if (rand_trans >= 2.0/3.0){
					(*iter) = 'C';
				}
				else{
					(*iter) = 'T';
				}

			}
			else if ((*iter == 'T')){
				if(rand_trans<= 1.0/3.0){
					(*iter) = 'A';
				}
				else if (rand_trans >= 2.0/3.0){
					(*iter) = 'C';
				}
				else{
					(*iter) = 'G';
				}

			}
			else{
				throw runtime_error("unknown nucleotide in Single_error_rate::generate_errors()");
			}

		}
		else{
			//Do nothing
		}
		++index;
	}
	return errors_indices;
}

int Single_error_rate::subseq_compare_err_num(const string& original_sequence , const string& constructed_sequence){
	int number_errors=0;

	for (size_t nucl_ind = 0 ; nucl_ind != original_sequence.size() ; ++nucl_ind){
			//Only take into account genomic nucl
			//might have to refine this if  the type of the inserted nucleotide is inferred (use constructed sequences map)
			if(original_sequence.at(nucl_ind) != constructed_sequence.at(nucl_ind)){
				number_errors+=1;
			}
		}
	return number_errors;
}

void Single_error_rate::update(){
	if(this->is_updated()){
		model_rate = normalized_counter / number_seq;
		normalized_counter = 0;
		number_seq = 0;
	}
}
/*
 * This method ensure correct normalization of the error rate
 * The error rate inferred for each scenario is weighted by the probability of the scenario.
 * The sum of these is then normalized by the likelihood of the sequence
 */
void Single_error_rate::add_to_norm_counter(){

	if(seq_likelihood != 0){ //TODO check that the first version was not more correct
			normalized_counter += seq_weighted_er/seq_likelihood;
			model_log_likelihood+=log10(seq_likelihood);
			number_seq += 1;
	}

	/*if(seq_weighted_er != 0){ //Why seq weighted error instead of seq likelihood?
		normalized_counter += seq_weighted_er/seq_likelihood;
		model_log_likelihood+=log10(seq_likelihood);
	}*/
	seq_weighted_er = 0;
	seq_likelihood = 0;
	seq_probability = 0;
	debug_number_scenarios=0;
	seq_mean_error_number=0;

}
/*
 * This method cleans the sequence specific counters
 * This method is used in case the sequence has a mean number of errors greater than the threshold
 * In this case the sequence will not contribute to the error rate
 */
void Single_error_rate::clean_seq_counters(){
	seq_weighted_er = 0;
	seq_likelihood = 0;
	seq_probability = 0;
	debug_number_scenarios=0;
	seq_mean_error_number=0;
}

void Single_error_rate::write2txt(ofstream& outfile){
	outfile<<"#SingleErrorRate"<<endl;
	outfile<<model_rate<<endl;
}