File: ArtificialFastqGenerator.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 4,916 kB
  • sloc: java: 1,481; sh: 39; makefile: 11
file content (483 lines) | stat: -rw-r--r-- 20,374 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
package artificialFastqGenerator;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * An instance of this class can be used to generate artificial FASTQ files from the human reference genome.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see
 * <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class ArtificialFastqGenerator {
	
	//Field for input stream
	private BufferedReader inputStream;
	//Fields for the start and end sequence identifiers 
	private String startSeqID, endSeqID;
	//PairedEnd gap parameters
	private int pairedEndGapMean, pairedEndGapMax, pairedEndGapMin;
	private double pairedEndGapSD;
	//Fields involved in read generation and logging. generatedAllReadsIndex holds the index of the last nucleobase for which all reads have been generated.
	private ArrayList<Nucleobase> nucleobases;
	private int nucleobasesSize, generatedAllReadsIndex, allReadsGeneratedNotGuaranteed, numProcessedRegions;
	private NucleobaseAndReadFactory nucleobaseAndReadFactory;
	private CoverageStatisticsCalculator coverageStatisticsCalculator;
	//Fields output streams.
	private BufferedWriter leftReadOutputStream, rightReadOutputStream, readStartIndexesOutputStream; //debugOutputStream;
	//Logger
	private Logger logger;

	
	public ArtificialFastqGenerator() {
		
		inputStream = null;
		startSeqID = Main.startSequenceIdentifier;
		endSeqID = Main.endSequenceIdentifier;
		pairedEndGapMean = Main.templateLengthMean-(2*Main.readLength)+1;
		pairedEndGapSD = Main.templateLengthSD;
		pairedEndGapMax = (int) Math.round(pairedEndGapMean + 2*pairedEndGapSD);
		pairedEndGapMin = (int) Math.round(pairedEndGapMean - pairedEndGapSD);
		if (pairedEndGapMin < 2) {
			pairedEndGapMin = 2;}
		nucleobases = new ArrayList<Nucleobase>();
		nucleobasesSize = 0;
		generatedAllReadsIndex = -1;
		allReadsGeneratedNotGuaranteed = 2*Main.readLength + pairedEndGapMax - 2;
		numProcessedRegions = 0;
		nucleobaseAndReadFactory = new NucleobaseAndReadFactory(Main.startingX, Main.startingY, Main.readLength,
				Main.useRealQualityScores, Main.simulateErrorInRead, Main.fastq1ForQualityScores,
				Main.fastq2ForQualityScores);
		coverageStatisticsCalculator = new CoverageStatisticsCalculator();
		//debugOutputStream = null;
		leftReadOutputStream = null;
		rightReadOutputStream = null;
		readStartIndexesOutputStream = null;
		logger = Main.logger;

	}
	
	/**
	 * Main method for generating FASTQ files.
	 * 
	 * @param referenceGenomePath
	 */
	
	public void generateArtificialFastqs(String referenceGenomePath) {
		
		boolean inSeqID = false;
		String seqID = "";
		boolean generatingReads = false;
		
		try {
			//Input stream
			inputStream = new BufferedReader(new FileReader(referenceGenomePath));
			//Output streams
			leftReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".1.fastq"));
			rightReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".2.fastq"));
			readStartIndexesOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".readStartIndexes.txt"));
			int currentGenotypeAsInt;
			char currentGenotype;
			double GCcount = 0.0;
			boolean foundStartSeqID = false;
			boolean foundEndSeqID = false;
			
			while ((currentGenotypeAsInt = inputStream.read()) != -1) {
				currentGenotype = (char) currentGenotypeAsInt;
				if (currentGenotype == '>') {
					inSeqID = true;}
				if (inSeqID == true) {
					if (currentGenotype == '\n') {
						if (generatingReads == false) {
							if (seqID.startsWith(startSeqID)) {
								foundStartSeqID = true;
								logger.log(Level.INFO,"Found the start sequence ID str " + startSeqID + " in " + seqID);}
							seqID = "";
							inSeqID = false;
							if (foundStartSeqID) {
								logger.log(Level.INFO,"Read generation begun.");
								coverageStatisticsCalculator.logCoverageStatsHeader();
								generatingReads = true;
								continue;}
						} else if (generatingReads == true) {
							if (endSeqID != null) {
								if (seqID.startsWith(endSeqID)) {
									foundEndSeqID = true;
									logger.log(Level.INFO,"Found the end sequence identifier " + endSeqID + " in " + seqID);}
							}
							seqID = "";
							inSeqID = false;
							if (foundEndSeqID) {
								logger.log(Level.INFO,"Further reads generated only for nucleobases already in memory.");
								processRemainingNucleobases(GCcount);
								break;}
						}
					} else {
						seqID = seqID + currentGenotype;}
				}
				else if (generatingReads == true) {
					if (Character.isWhitespace(currentGenotypeAsInt)) {continue;}
					if (ArtificialFastqGenerationUtils.isGC(currentGenotype)) {
						GCcount = GCcount + 1.0;}
					nucleobases.add(nucleobaseAndReadFactory.createNucleobase(currentGenotype));
					nucleobasesSize = nucleobasesSize + 1;
					if (nucleobaseAndReadFactory.getNumNucleobasesCreated() % Main.captureProbeLength == 0) {
						processNucleobases(false,GCcount);
						GCcount = 0.0;}
				}
			}
			if (foundEndSeqID == false) {
				if (!foundStartSeqID) {logger.log(Level.SEVERE,"Didn't find the start sequence identifier " + startSeqID +
						" so no reads generated.");}
				else {
					processRemainingNucleobases(GCcount);
					if (endSeqID != null) {logger.log(Level.WARNING,"Didn't find the end sequence identifier " + endSeqID);}
					}
				}
		} catch (FileNotFoundException fnfe) {
			fnfe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnfe));}
		catch (IOException ioe) {
			ioe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
		} finally {
			try {
				if (inputStream != null) {
					inputStream.close();}
				//if (debugOutputStream != null) {
					//debugOutputStream.close();}
				if (leftReadOutputStream != null) {
					leftReadOutputStream.close();}
				if (rightReadOutputStream != null) {
					rightReadOutputStream.close();}
				if (readStartIndexesOutputStream != null) {
					readStartIndexesOutputStream.close();}
			} catch (IOException ioe) {
				ioe.printStackTrace();
				logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
			}
		}
	}
	
	/**
	 * Process remaining nucleobases at the end of the sequence.
	 * 
	 * @param GCcount
	 */
	
	private void processRemainingNucleobases(double GCcount) {
		processNucleobases(true,GCcount);
		coverageStatisticsCalculator.logOverallStats();
		nucleobaseAndReadFactory.logOverallErrorStats();
	}
	
	/**
	 * Top method for processing a region of nucleobases in order to set the target coverage and create reads.
	 * 	
	 * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false.
	 * @param GCcount - the GC count for the latest capture probe region.
	 */
	
	private void processNucleobases(boolean finalReadGenerationRegion, double GCcount) {
		
		long numNucleobasesCreated = nucleobaseAndReadFactory.getNumNucleobasesCreated();
		
		//Non-final region
		if (!finalReadGenerationRegion) {
			setTargetCoverage(nucleobasesSize-Main.captureProbeLength,nucleobasesSize,GCcount/Main.captureProbeLength);
			GCcount = 0.0;
			if (nucleobasesSize >= Main.nucleobaseBufferSize + allReadsGeneratedNotGuaranteed) {
				createAndWriteReadsForRegion(false);}
		}
		//Final region
		else if (finalReadGenerationRegion) {
			setTargetCoverage(nucleobasesSize-(int)(numNucleobasesCreated%Main.captureProbeLength), nucleobasesSize,
					GCcount/(numNucleobasesCreated%Main.captureProbeLength));//Set the coverage for all which haven't yet been set.
			if (Main.nucleobaseBufferSize > numNucleobasesCreated) {
				logger.log(Level.WARNING, "nucleobaseBufferSize parameter > total number of nucleobases, so resetting it to " + numNucleobasesCreated + ".");
				Main.nucleobaseBufferSize=(int)numNucleobasesCreated;}
			createAndWriteReadsForRegion(true);
		}
		
	}
	
	
	/**
	 * Top method for creating pairs of reads for a region of nucleobases: 
	 * 1. If this is the first region of nucleobases to be processed, reduce their target coverage.
	 * 2. Create the reads by calling createPairedEndReadsFrom1Nucleobase(indexInNucleobases) for each nucleobase.
	 * 3. Write out the reads for the region by calling writeReadsAndCoverageStatistics(regionStartIndex,regionEndIndex).
	 * 4. Set now redundant referents to null by calling setRedundantReferentsToNull(lastRedundantNucleobaseIndex).
	 * 
	 * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false.
	 */
	
	private void createAndWriteReadsForRegion(boolean finalReadGenerationRegion) {
		
		int regionStartIndex = 0;
		//If this is the 1st region, target coverage will need reducing.
		if (nucleobases.get(0).getUniqueID()==1) {
			int coverageReduction = nucleobases.get(0).getTargetCoverage()-1;
			for (int i=0; i<Main.readLength; i++) {
				Nucleobase nucleobase = nucleobases.get(i);
				int originalCoverage = nucleobase.getTargetCoverage();
				nucleobase.setTargetCoverage(originalCoverage-coverageReduction);
				coverageReduction = coverageReduction - 1;
				if (coverageReduction == 0) {break;}
				}
		} else {
			regionStartIndex = Main.readLength-1;}
		
		//Create reads starting from the 1st nucleobase after the last for which we definitely know all covering reads have been created.
		for (int i=generatedAllReadsIndex+1; i<nucleobases.size(); i++) {
			createPairedEndReadsFrom1Nucleobase(i);}
		generatedAllReadsIndex = nucleobasesSize - allReadsGeneratedNotGuaranteed;
		if (finalReadGenerationRegion) {generatedAllReadsIndex = nucleobasesSize;}
		
		//Write out the reads for the region.
		int regionEndIndex = regionStartIndex + Main.nucleobaseBufferSize;
		int lastRedundantNucleobaseIndex = regionEndIndex - (Main.readLength-1);
		while (regionEndIndex <= generatedAllReadsIndex) {
			writeReadsAndCoverageStatistics(regionStartIndex,regionEndIndex);
			setRedundantReferentsToNull(lastRedundantNucleobaseIndex);}
		
		//If this is the final region, make sure all the reads are written.
		if (finalReadGenerationRegion) {
			regionStartIndex = Main.readLength-1;
			if (regionStartIndex < generatedAllReadsIndex) {
				numProcessedRegions = Main.logRegionStats - 1;
				writeReadsAndCoverageStatistics(regionStartIndex,generatedAllReadsIndex);}
			setRedundantReferentsToNull(generatedAllReadsIndex);
		}
		
	}
		
	/**
	 * Loop through the different possible read sequences which include a particular nucleobase. For each sequence, treat it
	 * as a possible left read, and try to create a pair of reads by calling tryCreate1PairOfReads(leftReadStartIndex,
	 * leftReadEndIndex). The 2 possible stopping conditions are 1. the target coverage for the nucleobase has been reached;
	 * 2. no new pair of reads is created during 1 loop through all of the possible left reads.
	 * 
	 * @param indexInNucleobases - index of the nucleobase in the nucleobases ArrayList.
	 */
	
	private void createPairedEndReadsFrom1Nucleobase(int indexInNucleobases) {
	
		//Get the start and end indexes of the 1st sequence and the start and end indexes of the last.
		int firstLeftReadStartIndex = indexInNucleobases - Main.readLength + 1;
		if (firstLeftReadStartIndex < 0) {firstLeftReadStartIndex = 0;}
		int lastLeftReadStartIndex = indexInNucleobases;
		if (nucleobasesSize - lastLeftReadStartIndex < Main.readLength) {
			lastLeftReadStartIndex = nucleobasesSize - Main.readLength;}
		Nucleobase nucleobase = nucleobases.get(indexInNucleobases);
		
		while (!nucleobase.hasTargetCoverage()) {
			boolean createdNewPairOfReads = false;
			//Now try creating left and right paired ends. 
			for (int i=firstLeftReadStartIndex; i<=lastLeftReadStartIndex; i++) {
				int fullyCoveredIndex = tryToCreate1PairOfReads(i,i+Main.readLength);
				if (fullyCoveredIndex > -1) {i=fullyCoveredIndex;}
				else if (fullyCoveredIndex==-2) {
					createdNewPairOfReads = true;
					if (nucleobase.hasTargetCoverage()) {break;}
					}
			}
			if (!createdNewPairOfReads) {break;}
		}
		
	}
	
	/**
	 * Starting with the start and end index for the possible left read, try to create a pair of reads. Returns -2 if a
	 * left and right reads are created, -1 if a left read can be created but not a right, else return the index of the
	 * nucleobase in the left read sequence at which failure occurs (as a result of violating a target coverage or N-
	 * filter constraint).
	 * 
	 * @param leftReadStartIndex - the lowest possible starting index in nucleobases for a left read.
	 * @param leftReadEndIndex - the highest possible starting index in nucleobases for a left read.
	 * @return i - endcoded outcome of the attempt to create a pair of reads.
	 */
	
	private int tryToCreate1PairOfReads(int leftReadStartIndex, int leftReadEndIndex) {
		
		BabyRead babyLeftRead = new BabyRead(leftReadStartIndex,leftReadEndIndex);
		int failureIndex = babyLeftRead.failureIndex;
		if (failureIndex != -1) {
			ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //Hold Java's hand for garbage collection.
			return failureIndex;}
		
		int rightReadStartIndex = leftReadEndIndex - 1 + ArtificialFastqGenerationUtils.getRandomIntWithinRange(pairedEndGapMean,pairedEndGapSD,pairedEndGapMin,pairedEndGapMax);
		int rightReadEndIndex = rightReadStartIndex + Main.readLength;
		if (!(rightReadStartIndex > nucleobasesSize-1) && !(rightReadEndIndex > nucleobasesSize)) {
			BabyRead babyRightRead = new BabyRead(rightReadStartIndex,rightReadEndIndex);
			failureIndex = babyRightRead.failureIndex;
			if (failureIndex != -1) {
				ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection.
				ArtificialFastqGenerationUtils.setRedundantReadToNull(babyRightRead); //For garbage collection.
				return -1;}
			//The L and R read sequences don't violate target coverage or N-filter constraints, so create the paired-end reads.
			nucleobaseAndReadFactory.createPairedEndReads(babyLeftRead.getSequence(),babyRightRead.getSequence());
		} else {
			ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection.
			return -1;}
		
		return -2;
	}
	
	/**
	 * Write the left and right reads to the left and right fastq files respectively, and coverage statistics to the log
	 * file. 
	 * 
	 * @param startIndex - the index in nucleobases of the 1st nucleobase to write reads and coverage statistics for.
	 * @param endIndex - the index in nucleobases of the last nucleobase to write reads and coverage statistics for.
	 */
	
	private void writeReadsAndCoverageStatistics(int startIndex, int endIndex) {	
		
		try {
			for (int i=startIndex; i<endIndex; i++) {
				Nucleobase nucleobase = nucleobases.get(i);
				coverageStatisticsCalculator.updateRegionStatsFrom1Nucleobase(nucleobase);
				ArrayList<Read> reads = nucleobase.getCoveringReads();
				//debugOutputStream.write(nucleobase.toString() + " " + reads + "\n");
				for (Read leftEndRead : reads) {
					if (leftEndRead.isLeftReadAndNeedsWriting()) {
						leftReadOutputStream.write(leftEndRead.toString());
						Read rightEndRead = leftEndRead.getPairedEndRead();
						rightReadOutputStream.write(rightEndRead.toString());
						readStartIndexesOutputStream.write(leftEndRead.getSequence().get(0).getUniqueID() 
								+ "," + rightEndRead.getSequence().get(0).getUniqueID() + "\n");
						leftEndRead.setLeftReadAndNeedsWriting(false);}
				}
			}
			//debugOutputStream.flush();
			leftReadOutputStream.flush();
			rightReadOutputStream.flush();
			readStartIndexesOutputStream.flush();
		} catch (IOException ioe) {
			ioe.printStackTrace();
			ArtificialFastqGenerationUtils.getStackTraceString(ioe);
		}
		
		numProcessedRegions = numProcessedRegions + 1;
		if (numProcessedRegions == Main.logRegionStats) {
			coverageStatisticsCalculator.logRegionStats();
			numProcessedRegions = 0;
		}
		
	}

	/***
	 * Set target coverage for nucleobases. If coverageBasedOnGCcontent is true, calculate the target coverage based on GC
	 * content. 
	 * 
	 * @param startIndex - index of the 1st nucleobase in nucleobases for which to set target coverage.
	 * @param endIndex - index of the last nucleobase in nucleobases for which to set target coverage.
	 * @param GCcontent - GCcontent of this capture probe region.
	 */
	
	private void setTargetCoverage(int startIndex, int endIndex, double GCcontent) {
		
		int coverage = (int) Main.coverageMeanPeak + 1; 
		if (Main.coverageBasedOnGCcontent == true) {
			//coverage = ArtificialFastqGenerationUtils.calculateTargetCoverageFromGCcontent(GCcontent);			
			coverage = ArtificialFastqGenerationUtils.getTargetCoverageViaGaussFuncOfGCcont(GCcontent, Main.coverageMeanPeak,
					Main.coverageMeanPeakGCcontent, Main.coverageMeanGCcontentSpread, Main.coverageSD);
		}
		for (int i=startIndex; i<endIndex; i++) {
			nucleobases.get(i).setTargetCoverage(coverage);
		}
		
	}

	/**
	 * Set references to redundant nucleobases and reads to null so that they will be garbage collected.
	 * 
	 * @param lastRedundantNucleobaseIndex
	 */
	
	
	private void setRedundantReferentsToNull(int lastRedundantNucleobaseIndex) {
		
		for (int i=0; i<lastRedundantNucleobaseIndex; i++) {
			Nucleobase redundantNucleobase = nucleobases.get(0);
			ArrayList<Read> reads = redundantNucleobase.getCoveringReads();
			for (Read redundantRead : reads) {
				ArtificialFastqGenerationUtils.setRedundantReadToNull(redundantRead);
			}
			nucleobases.remove(0);
			nucleobasesSize = nucleobasesSize - 1;
			generatedAllReadsIndex = generatedAllReadsIndex - 1;
			redundantNucleobase = null;
		}
		//Runtime.getRuntime().gc();
		
	}	
	
	/**
	 * A sequence of nucleobases which may be used to create a read. Has fields for (1) the sequence, (2) the index in the
	 * ArrayList nucleobases of the first nucleobase to prohibit read generation, either due to its target coverage
	 * constraint or the readsContainingN filter. A failure index of -1 means there was no failure.
	 * 
	 * @author mframpton
	 */
	
	public class BabyRead extends AbstractRead {
		
		private boolean allNs;
		private int failureIndex;
		
		public BabyRead(int startIndexInNucleobases, int endIndexInNucleobases) {

			sequence = new ArrayList<Nucleobase>();
			allNs = true;
			failureIndex = -1;
			createSequence(startIndexInNucleobases, endIndexInNucleobases);
	
		}
		
		public void createSequence(int startIndexInNucleobases, int endIndexInNucleobases) {
			
			for (int i=startIndexInNucleobases; i<endIndexInNucleobases; i++) {
				Nucleobase nucleobaseForRead = nucleobases.get(i);
				char genotype = nucleobaseForRead.getGenotype();
				if (genotype != 'N') 
					{allNs = false;}
				if (nucleobaseForRead.hasTargetCoverage()) {
					this.failureIndex = i;
					break;}
				else if (Main.readsContainingNfilter == 2 && genotype == 'N') {
					this.failureIndex = i;
					break;}
				this.sequence.add(nucleobaseForRead);
			}
			if (allNs == true && Main.readsContainingNfilter == 1) {
				failureIndex = startIndexInNucleobases;}
			
		}
		
	}
	
}