1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
|
package artificialFastqGenerator;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* An instance of this class can be used to generate (1) ASCII encodings of Phred quality scores for bases in reads; (2)
* base-call errors. (1) is done by taking quality scores from a pre-existing FASTQ file, and (2) by generating an error with
* a probability that depends on the base's phred score.
*
* Copyright (C) 2012 The Institute of Cancer Research (ICR).
*
* This file is part of ArtificialFastqGenerator v1.0.0.
*
* ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
* Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
* later version.
*
* This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU Public License along with this program. If not, see
* <http://www.gnu.org/licenses/>
*
* Authour's contact email: Matthew.Frampton@icr.ac.uk
*/
public class QualityScoreAndErrorGenerator {
public static final char[] encodedSangerQualityScores =
"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
private BufferedReader fastq1ForQualityScores;
private BufferedReader fastq2ForQualityScores;
private long numBaseCallErrors;
private long numACGTBaseCalls;
private HashMap<Character, Double> probOfErrorByEncodedQualityScore;
private char[] highestQualityScores;
private boolean useRealQualityScores;
private boolean simulateErrorInRead;
private int readLength;
private String fastq1ForQualityScoresPath;
private String fastq2ForQualityScoresPath;
private Logger logger;
/**
* Initialise a new QualityScoreAndErrorGenerator object.
*
* @param readLength - the length of each read.
* @param useRealQualityScores - true if using quality scores from an existing fastq file, else false.
* @param simulateErrorInRead - true if simulating error in reads, else false.
* @param fastq1ForQualityScoresPath - the 1st fastq file whose quality scores are used.
* @param fastq2ForQualityScoresPath - the 2nd fastq file whose quality scores are used.
*/
public QualityScoreAndErrorGenerator(int readLength, boolean useRealQualityScores, boolean simulateErrorInRead,
String fastq1ForQualityScoresPath, String fastq2ForQualityScoresPath) {
fastq1ForQualityScores = null;
fastq2ForQualityScores = null;
numBaseCallErrors = 0;
numACGTBaseCalls = 0;
probOfErrorByEncodedQualityScore = new HashMap<Character, Double>();
//Fill probOfErrorByEncodedQualityScore...
for (int i=0; i<encodedSangerQualityScores.length; i++) {
Double probabilityOfError = getEstimatedBaseCallErrorProbability(encodedSangerQualityScores[i]);
probOfErrorByEncodedQualityScore.put(encodedSangerQualityScores[i],probabilityOfError);
}
highestQualityScores = new char[readLength];
//Fill highestQualityScores...
for (int i=0; i<highestQualityScores.length; i++) {
//highestQualityScores[i] = encodedSangerQualityScores[encodedSangerQualityScores.length-1];
highestQualityScores[i] = encodedSangerQualityScores[40];
}
this.useRealQualityScores = useRealQualityScores;
this.simulateErrorInRead = simulateErrorInRead;
this.readLength = readLength;
this.fastq1ForQualityScoresPath = fastq1ForQualityScoresPath;
this.fastq2ForQualityScoresPath = fastq2ForQualityScoresPath;
logger = Main.logger;
}
/**
* PHRED = -10 x log_10(P_e) (see Cock et al. "The Sanger FASTQ file...") so
* P_e = 10^{PHRED/-10}, where P_e means estimated probability of an error.
*
* @param encodedQualityScore - an encoded Sanger Phred quality score.
* @return estimatedBaseCallErrorProbability - the estimated probability of a base call error.
*/
private double getEstimatedBaseCallErrorProbability(char encodedQualityScore) {
int phredQualityScore = 0;
for (int i=0; i < encodedSangerQualityScores.length; i++) {
if (encodedSangerQualityScores[i] == encodedQualityScore) {
phredQualityScore = i;
break;}
}
double phredQualityScoreAsDouble = (double) phredQualityScore;
return Math.pow(10.0,phredQualityScoreAsDouble/-10.0);
}
/**
* Get the encoded phred quality scores and the genotypes read. If the read is right-end, then it should align to the
* -ive strand, so reverse nucleobases, and read their complements.
*
* @param isLeft - true if the read is a left read, else false.
* @param nucleobases - human reference sequence nucleobases which the read is a read of.
* @return finalQualityScoresAndGenotypesRead - an ArrayList of char arrays containing the final quality scores and the
* genotypes that get read.
*/
public ArrayList<char[]> getQualityScoresAndGenotypesRead(boolean isLeft, ArrayList<Nucleobase> nucleobases) {
if (isLeft == false) {
Collections.reverse(nucleobases);
}
char[] initialQualityScores = getInitialQualityScores(isLeft);
char[] genotypesRead = new char[readLength];
char[] finalQualityScores = new char[readLength];
for (int i=0; i<nucleobases.size(); i++) {
Nucleobase nucleobase = nucleobases.get(i);
char correctGenotype = nucleobase.getGenotype();
if (isLeft == false) {
correctGenotype = nucleobase.getComplementGenotype();}
if (ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {numACGTBaseCalls++;}
if (correctGenotype == 'N') {
genotypesRead[i] = correctGenotype;
finalQualityScores[i] = '#';
} else {
if (!simulateErrorInRead) {
genotypesRead[i] = correctGenotype;
} else {
genotypesRead[i] = simulate1BaseCallError(correctGenotype, initialQualityScores[i]);
}
finalQualityScores[i] = initialQualityScores[i];
}
}
ArrayList<char[]> finalQualityScoresAndGenotypesRead = new ArrayList<char[]>();
finalQualityScoresAndGenotypesRead.add(finalQualityScores);
finalQualityScoresAndGenotypesRead.add(genotypesRead);
return finalQualityScoresAndGenotypesRead;
}
/**
* Get the initial quality scores. These are initial because there maybe be one or more quality scores
* which are not poor, and correspond to an 'N'. This is changed in
* getQualityScoresAndGenotypesRead(boolean, ArrayList<Nucleobase>, boolean, boolean).
*
* @param isLeft - true if the read is a left read, else false.
* @return qualityScores - the initial Sanger format encoded phred quality scores for the read.
*/
private char[] getInitialQualityScores(boolean isLeft) {
char[] qualityScores = new char[readLength];
if (!useRealQualityScores) {
//return max confidence.
qualityScores = highestQualityScores;
} else {
qualityScores = getQualityScoresForNextReadFromFastq(isLeft);
}
return qualityScores;
}
/**
* Get the next read's quality scores from a pre-existing fastq file.
*
* @param isLeft - true if the read is a left read, else false.
* @return qualityScoresCharArray - the Sanger format encoded phred quality scores for the read.
*/
private char[] getQualityScoresForNextReadFromFastq(boolean isLeft) {
BufferedReader fastqForQualityScores = null;
if (isLeft) {
if (fastq1ForQualityScores == null) {
openPreexistingFastqFile(isLeft);}
fastqForQualityScores = fastq1ForQualityScores;
} else {
if (fastq2ForQualityScores == null) {
openPreexistingFastqFile(isLeft);}
fastqForQualityScores = fastq2ForQualityScores;
}
String qualityScores = null;
while (qualityScores == null) {
String fastqLine = null;
try {
boolean lineContainsConfidenceScores = false;
while ((fastqLine = fastqForQualityScores.readLine()) != null) {
if (lineContainsConfidenceScores) {
qualityScores = fastqLine;
break;
}
if (fastqLine.startsWith("+")) {
lineContainsConfidenceScores = true;
} else {
lineContainsConfidenceScores = false;
}
}
} catch (IOException ioe) {
ioe.printStackTrace();
logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
} finally {
if (fastqLine == null) {
closePreexistingFastqFile(isLeft);
openPreexistingFastqFile(isLeft);
if (isLeft) {
fastqForQualityScores = fastq1ForQualityScores;}
else {
fastqForQualityScores = fastq2ForQualityScores;}
}
}
}
char[] qualityScoresCharArray = qualityScores.replace("\\","").toCharArray();
if (qualityScoresCharArray.length != readLength) {
qualityScoresCharArray = addOrRemoveQualityScores(qualityScoresCharArray);
}
return qualityScoresCharArray;
}
/**
* Open a preexisting fastq file.
*
* @param isLeft - true if the read is a left read, else false.
*/
private void openPreexistingFastqFile(boolean isLeft) {
try {
if (isLeft) {
fastq1ForQualityScores = new BufferedReader(new FileReader(fastq1ForQualityScoresPath));
} else {
fastq2ForQualityScores = new BufferedReader(new FileReader(fastq2ForQualityScoresPath));
}
} catch (FileNotFoundException fnf) {
fnf.printStackTrace();
logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnf));
}
}
/**
* Close a preexisting fastq file.
*
* @param isLeft - true if the read is a left read, else false.
*/
private void closePreexistingFastqFile(boolean isLeft) {
try {
if (isLeft) {
fastq1ForQualityScores.close();
} else {
fastq2ForQualityScores.close();
}
} catch (IOException ioe) {
ioe.printStackTrace();
logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
}
}
/**
* Increase/decrease the size of a sequence of real quality scores so that it is
* the same size as the readLength.
*
* @param qualityScores - original quality scores.
* @return newQualityScores -length-adjusted quality scores.
*/
private char[] addOrRemoveQualityScores(char[] qualityScores) {
logger.log(Level.WARNING,"Real quality scores are not length " + readLength);
char[] newQualityScores = new char[readLength];
int diffInLength = qualityScores.length - readLength;
int i=0; //index for qualityScores.
int j=0; //index for newQualityScores.
while (j < readLength) {
if (diffInLength > 0) {//Quality scores is too big, so ignore the 1st diffInLength characters.
i++;
diffInLength--;
} else if (diffInLength < 0) {//Quality scores is too small, so duplicate the 1st char \times diffInLength.
newQualityScores[j] = qualityScores[0];
j++;
diffInLength++;
} else {
newQualityScores[j] = qualityScores[i];
i++;
j++;
}
}
return newQualityScores;
}
/**
* Decide whether to generate an error based on the quality score. If yes
* choose 1 of the 3 alternate genotypes with equal probability.
*
* @param correctGenotype - the genotype of a nucleobase in the human reference sequence.
* @param qualityScore - the quality score in the read for this genotype.
* @return incorrectGenotype - the genotype that is read (possible incorrect).
*/
private char simulate1BaseCallError(char correctGenotype, char qualityScore) {
//if (!probOfErrorByEncodedQualityScore.containsKey(qualityScore)) {
//logger.log(Level.WARNING, qualityScore + " not in map.");
//}
//1st decide if we're going to simulate an error.
//Only simulate errors for A/C/G/T.
if (!ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {
return correctGenotype;
}
double randomNumber = ArtificialFastqGenerationUtils.generateRandomDoubleBetween0And1();
double estimatedBaseCallErrorProbability = probOfErrorByEncodedQualityScore.get(qualityScore);
if (randomNumber >= estimatedBaseCallErrorProbability) {
return correctGenotype;}
//If we are simulating an error, then decide which alternate base to return.
//System.out.println("Simulating an error");
numBaseCallErrors = numBaseCallErrors + 1;
//Get the alternate genotypes.
char[] alternateGenotypes = ArtificialFastqGenerationUtils.getAlternateGenotypes(correctGenotype);
char incorrectGenotype = correctGenotype;
if (randomNumber <= estimatedBaseCallErrorProbability/3.0) {
incorrectGenotype = alternateGenotypes[0];}
else if (randomNumber <= estimatedBaseCallErrorProbability/1.5) {
incorrectGenotype = alternateGenotypes[1];}
else {incorrectGenotype = alternateGenotypes[2];}
return incorrectGenotype;
}
/**
* Get the number of base call errors that have been simulated.
*
* @return numBaseCallErrors - the number of bases for which base calls have been simulated.
*/
public long getNumBaseCallErrors() {
return numBaseCallErrors;
}
/**
* Reset the number of simulated base call errors to 0.
*/
public void resetNumBaseCallErrors() {
numBaseCallErrors = 0;
}
/**
* Get the number of ACGT bases.
*
* @return numACGTBaseCalls - the number of ACGT bases that have been called.
*/
public long getNumACGTBasesCalled() {
return numACGTBaseCalls;
}
}
|