File: QualityScoreAndErrorGenerator.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 4,916 kB
  • sloc: java: 1,481; sh: 39; makefile: 11
file content (380 lines) | stat: -rw-r--r-- 13,815 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
package artificialFastqGenerator;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An instance of this class can be used to generate (1) ASCII encodings of Phred quality scores for bases in reads; (2) 
 * base-call errors. (1) is done by taking quality scores from a pre-existing FASTQ file, and (2) by generating an error with
 * a probability that depends on the base's phred score.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see
 * <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class QualityScoreAndErrorGenerator {

	public static final char[] encodedSangerQualityScores = 
			"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
	private BufferedReader fastq1ForQualityScores;
	private BufferedReader fastq2ForQualityScores;
	private long numBaseCallErrors;
	private long numACGTBaseCalls;
	private HashMap<Character, Double> probOfErrorByEncodedQualityScore;
	private char[] highestQualityScores;
	private boolean useRealQualityScores;
	private boolean simulateErrorInRead;
	private int readLength;
	private String fastq1ForQualityScoresPath;
	private String fastq2ForQualityScoresPath;
	private Logger logger;
	
	/**
	 * Initialise a new QualityScoreAndErrorGenerator object. 
	 * 
	 * @param readLength - the length of each read.
	 * @param useRealQualityScores - true if using quality scores from an existing fastq file, else false.
	 * @param simulateErrorInRead - true if simulating error in reads, else false.
	 * @param fastq1ForQualityScoresPath - the 1st fastq file whose quality scores are used.
	 * @param fastq2ForQualityScoresPath - the 2nd fastq file whose quality scores are used.
	 */
	
	public QualityScoreAndErrorGenerator(int readLength, boolean useRealQualityScores, boolean simulateErrorInRead,
			String fastq1ForQualityScoresPath, String fastq2ForQualityScoresPath) {
		
		fastq1ForQualityScores = null;
		fastq2ForQualityScores = null;
		numBaseCallErrors = 0;
		numACGTBaseCalls = 0;
		probOfErrorByEncodedQualityScore = new HashMap<Character, Double>();
		//Fill probOfErrorByEncodedQualityScore...
		for (int i=0; i<encodedSangerQualityScores.length; i++) {
			Double probabilityOfError = getEstimatedBaseCallErrorProbability(encodedSangerQualityScores[i]);
			probOfErrorByEncodedQualityScore.put(encodedSangerQualityScores[i],probabilityOfError);
		}
		highestQualityScores = new char[readLength];
		//Fill highestQualityScores...
		for (int i=0; i<highestQualityScores.length; i++) {
			//highestQualityScores[i] = encodedSangerQualityScores[encodedSangerQualityScores.length-1];
			highestQualityScores[i] = encodedSangerQualityScores[40];
		}
		this.useRealQualityScores = useRealQualityScores;
		this.simulateErrorInRead = simulateErrorInRead;
		this.readLength = readLength;
		this.fastq1ForQualityScoresPath = fastq1ForQualityScoresPath;
		this.fastq2ForQualityScoresPath = fastq2ForQualityScoresPath;
		logger = Main.logger;
	}
	
	/**
	 * PHRED = -10 x log_10(P_e) (see Cock et al. "The Sanger FASTQ file...") so 
	 * P_e = 10^{PHRED/-10}, where P_e means estimated probability of an error. 
	 * 
	 * @param encodedQualityScore - an encoded Sanger Phred quality score.
	 * @return estimatedBaseCallErrorProbability - the estimated probability of a base call error.
	 */
	
	private double getEstimatedBaseCallErrorProbability(char encodedQualityScore) {
		
		int phredQualityScore = 0;
		for (int i=0; i <  encodedSangerQualityScores.length; i++) {
			if (encodedSangerQualityScores[i] == encodedQualityScore) {
				phredQualityScore = i;
				break;}
		}
	
		double phredQualityScoreAsDouble = (double) phredQualityScore;
		return Math.pow(10.0,phredQualityScoreAsDouble/-10.0); 
	}
	
	/**
	 * Get the encoded phred quality scores and the genotypes read. If the read is right-end, then it should align to the 
	 * -ive strand, so reverse nucleobases, and read their complements.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @param nucleobases - human reference sequence nucleobases which the read is a read of.
	 * @return finalQualityScoresAndGenotypesRead - an ArrayList of char arrays containing the final quality scores and the 
	 * genotypes that get read.
	 */
	
	public ArrayList<char[]> getQualityScoresAndGenotypesRead(boolean isLeft, ArrayList<Nucleobase> nucleobases) {
		
		if (isLeft == false) {
			Collections.reverse(nucleobases);
		}
		
		char[] initialQualityScores = getInitialQualityScores(isLeft);
		char[] genotypesRead = new char[readLength];
		char[] finalQualityScores = new char[readLength];
		
		for (int i=0; i<nucleobases.size(); i++) {
			Nucleobase nucleobase = nucleobases.get(i);
			char correctGenotype = nucleobase.getGenotype();
			if (isLeft == false) {
				correctGenotype = nucleobase.getComplementGenotype();}
			if (ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {numACGTBaseCalls++;}
			if (correctGenotype == 'N') {
				genotypesRead[i] = correctGenotype;
				finalQualityScores[i] = '#';
			} else {
				if (!simulateErrorInRead) {
					genotypesRead[i] = correctGenotype;
				} else {
					genotypesRead[i] = simulate1BaseCallError(correctGenotype, initialQualityScores[i]);
				}
				finalQualityScores[i] = initialQualityScores[i];
			}	
		}
		
		ArrayList<char[]> finalQualityScoresAndGenotypesRead = new ArrayList<char[]>();
		finalQualityScoresAndGenotypesRead.add(finalQualityScores);
		finalQualityScoresAndGenotypesRead.add(genotypesRead);
		return finalQualityScoresAndGenotypesRead;	
		
	}

	/**
	 * Get the initial quality scores. These are initial because there maybe be one or more quality scores
	 * which are not poor, and correspond to an 'N'. This is changed in
	 * getQualityScoresAndGenotypesRead(boolean, ArrayList<Nucleobase>, boolean, boolean).
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @return qualityScores - the initial Sanger format encoded phred quality scores for the read.
	 */
	
	private char[] getInitialQualityScores(boolean isLeft) {
		
		char[] qualityScores = new char[readLength];
		
		if (!useRealQualityScores) {
			//return max confidence.
			qualityScores = highestQualityScores;
		} else {
			qualityScores = getQualityScoresForNextReadFromFastq(isLeft);
		}
		
		return qualityScores;
	}

	/**
	 * Get the next read's quality scores from a pre-existing fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @return qualityScoresCharArray - the Sanger format encoded phred quality scores for the read.
	 */

	private char[] getQualityScoresForNextReadFromFastq(boolean isLeft) {

		BufferedReader fastqForQualityScores = null;
		if (isLeft) {
			if (fastq1ForQualityScores == null) {
				openPreexistingFastqFile(isLeft);}
			fastqForQualityScores = fastq1ForQualityScores;
		} else {
			if (fastq2ForQualityScores == null) {
				openPreexistingFastqFile(isLeft);}
			fastqForQualityScores = fastq2ForQualityScores;
		}

		String qualityScores = null;
		while (qualityScores == null) {
			String fastqLine = null;
			try {
				boolean lineContainsConfidenceScores = false;
				while ((fastqLine = fastqForQualityScores.readLine()) != null) {				
					if (lineContainsConfidenceScores) {
						qualityScores = fastqLine;
						break;
					}
					if (fastqLine.startsWith("+")) {
						lineContainsConfidenceScores = true;
					} else {
						lineContainsConfidenceScores = false;
					}
				}
			} catch (IOException ioe) {
				ioe.printStackTrace();
				logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
			} finally {
				if (fastqLine == null) {
					closePreexistingFastqFile(isLeft);
					openPreexistingFastqFile(isLeft);
					if (isLeft) {
						fastqForQualityScores = fastq1ForQualityScores;}
					else {
						fastqForQualityScores = fastq2ForQualityScores;}
					} 
				}
			}
		char[] qualityScoresCharArray = qualityScores.replace("\\","").toCharArray();
		if (qualityScoresCharArray.length != readLength) {
			qualityScoresCharArray = addOrRemoveQualityScores(qualityScoresCharArray);
		}
		return qualityScoresCharArray;
	}

	/**
	 * Open a preexisting fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 */
	
	private void openPreexistingFastqFile(boolean isLeft) {
	
		try {
			if (isLeft) {
				fastq1ForQualityScores = new BufferedReader(new FileReader(fastq1ForQualityScoresPath));
			} else {
				fastq2ForQualityScores = new BufferedReader(new FileReader(fastq2ForQualityScoresPath));
			}
		} catch (FileNotFoundException fnf) {
			fnf.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnf));
		}
	}
	
	/**
	 * Close a preexisting fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 */
	
	private void closePreexistingFastqFile(boolean isLeft) {
		
		try {
			if (isLeft) {
				fastq1ForQualityScores.close();
			} else {
				fastq2ForQualityScores.close();
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
		}
		
	}
	
	
	/**
	 * Increase/decrease the size of a sequence of real quality scores so that it is
	 * the same size as the readLength.
	 * 
	 * @param qualityScores - original quality scores.
	 * @return newQualityScores -length-adjusted quality scores.
	 */
	
	private char[] addOrRemoveQualityScores(char[] qualityScores) {
		
		logger.log(Level.WARNING,"Real quality scores are not length " + readLength);
		char[] newQualityScores = new char[readLength];	
		int diffInLength = qualityScores.length - readLength;
		int i=0; //index for qualityScores.
		int j=0; //index for newQualityScores.
		
		while (j < readLength) {
			if (diffInLength > 0) {//Quality scores is too big, so ignore the 1st diffInLength characters.
				i++;
				diffInLength--;
			} else if (diffInLength < 0) {//Quality scores is too small, so duplicate the 1st char \times diffInLength.
				newQualityScores[j] = qualityScores[0];
				j++;
				diffInLength++;
			} else {
				newQualityScores[j] = qualityScores[i];
				i++;
				j++;
			}
		}
		
		return newQualityScores;
	
	}
	
	/**
	 * Decide whether to generate an error based on the quality score. If yes
	 * choose 1 of the 3 alternate genotypes with equal probability.
	 * 
	 * @param correctGenotype - the genotype of a nucleobase in the human reference sequence.
	 * @param qualityScore - the quality score in the read for this genotype.
	 * @return incorrectGenotype - the genotype that is read (possible incorrect).
	 */
	
	private char simulate1BaseCallError(char correctGenotype, char qualityScore) {
	
		//if (!probOfErrorByEncodedQualityScore.containsKey(qualityScore)) {
			//logger.log(Level.WARNING, qualityScore + " not in map.");
		//}
		//1st decide if we're going to simulate an error.
		//Only simulate errors for A/C/G/T.
		if (!ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {
			return correctGenotype;
		}
		double randomNumber = ArtificialFastqGenerationUtils.generateRandomDoubleBetween0And1();
		double estimatedBaseCallErrorProbability = probOfErrorByEncodedQualityScore.get(qualityScore);
		if (randomNumber >= estimatedBaseCallErrorProbability) {
			return correctGenotype;}
		
		//If we are simulating an error, then decide which alternate base to return.
		//System.out.println("Simulating an error");
		numBaseCallErrors = numBaseCallErrors + 1;
		//Get the alternate genotypes.
		char[] alternateGenotypes = ArtificialFastqGenerationUtils.getAlternateGenotypes(correctGenotype);
		char incorrectGenotype = correctGenotype;
		if (randomNumber <= estimatedBaseCallErrorProbability/3.0) {
			incorrectGenotype = alternateGenotypes[0];} 
		else if (randomNumber <= estimatedBaseCallErrorProbability/1.5) {
			incorrectGenotype = alternateGenotypes[1];}
		else {incorrectGenotype = alternateGenotypes[2];}
		
		return incorrectGenotype;
	}

	/**
	 * Get the number of base call errors that have been simulated.
	 * 
	 * @return numBaseCallErrors - the number of bases for which base calls have been simulated.
	 */
	
	public long getNumBaseCallErrors() {
		return numBaseCallErrors;
	}
	
	/**
	 * Reset the number of simulated base call errors to 0.
	 */
	
	public void resetNumBaseCallErrors() {
		numBaseCallErrors = 0;
	}
	
	/**
	 * Get the number of ACGT bases.
	 * 
	 * @return numACGTBaseCalls - the number of ACGT bases that have been called.
	 */
	
	public long getNumACGTBasesCalled() {
		return numACGTBaseCalls;
	}
	
}