File: ArtificialFastqGenerator.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-1~bpo8%2B1
links: PTS, VCS
area: main
in suites: jessie-backports
size: 4,916 kB
sloc: java: 1,481; sh: 39; makefile: 11
file content (483 lines) | stat: -rw-r--r-- 20,374 bytes
parent folder | download | duplicates (5)
package artificialFastqGenerator;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;


/**
 * An instance of this class can be used to generate artificial FASTQ files from the human reference genome.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see
 * <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class ArtificialFastqGenerator {
	
	//Field for input stream
	private BufferedReader inputStream;
	//Fields for the start and end sequence identifiers 
	private String startSeqID, endSeqID;
	//PairedEnd gap parameters
	private int pairedEndGapMean, pairedEndGapMax, pairedEndGapMin;
	private double pairedEndGapSD;
	//Fields involved in read generation and logging. generatedAllReadsIndex holds the index of the last nucleobase for which all reads have been generated.
	private ArrayList<Nucleobase> nucleobases;
	private int nucleobasesSize, generatedAllReadsIndex, allReadsGeneratedNotGuaranteed, numProcessedRegions;
	private NucleobaseAndReadFactory nucleobaseAndReadFactory;
	private CoverageStatisticsCalculator coverageStatisticsCalculator;
	//Fields output streams.
	private BufferedWriter leftReadOutputStream, rightReadOutputStream, readStartIndexesOutputStream; //debugOutputStream;
	//Logger
	private Logger logger;

	
	public ArtificialFastqGenerator() {
		
		inputStream = null;
		startSeqID = Main.startSequenceIdentifier;
		endSeqID = Main.endSequenceIdentifier;
		pairedEndGapMean = Main.templateLengthMean-(2*Main.readLength)+1;
		pairedEndGapSD = Main.templateLengthSD;
		pairedEndGapMax = (int) Math.round(pairedEndGapMean + 2*pairedEndGapSD);
		pairedEndGapMin = (int) Math.round(pairedEndGapMean - pairedEndGapSD);
		if (pairedEndGapMin < 2) {
			pairedEndGapMin = 2;}
		nucleobases = new ArrayList<Nucleobase>();
		nucleobasesSize = 0;
		generatedAllReadsIndex = -1;
		allReadsGeneratedNotGuaranteed = 2*Main.readLength + pairedEndGapMax - 2;
		numProcessedRegions = 0;
		nucleobaseAndReadFactory = new NucleobaseAndReadFactory(Main.startingX, Main.startingY, Main.readLength,
				Main.useRealQualityScores, Main.simulateErrorInRead, Main.fastq1ForQualityScores,
				Main.fastq2ForQualityScores);
		coverageStatisticsCalculator = new CoverageStatisticsCalculator();
		//debugOutputStream = null;
		leftReadOutputStream = null;
		rightReadOutputStream = null;
		readStartIndexesOutputStream = null;
		logger = Main.logger;

	}
	
	/**
	 * Main method for generating FASTQ files.
	 * 
	 * @param referenceGenomePath
	 */
	
	public void generateArtificialFastqs(String referenceGenomePath) {
		
		boolean inSeqID = false;
		String seqID = "";
		boolean generatingReads = false;
		
		try {
			//Input stream
			inputStream = new BufferedReader(new FileReader(referenceGenomePath));
			//Output streams
			leftReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".1.fastq"));
			rightReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".2.fastq"));
			readStartIndexesOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".readStartIndexes.txt"));
			int currentGenotypeAsInt;
			char currentGenotype;
			double GCcount = 0.0;
			boolean foundStartSeqID = false;
			boolean foundEndSeqID = false;
			
			while ((currentGenotypeAsInt = inputStream.read()) != -1) {
				currentGenotype = (char) currentGenotypeAsInt;
				if (currentGenotype == '>') {
					inSeqID = true;}
				if (inSeqID == true) {
					if (currentGenotype == '\n') {
						if (generatingReads == false) {
							if (seqID.startsWith(startSeqID)) {
								foundStartSeqID = true;
								logger.log(Level.INFO,"Found the start sequence ID str " + startSeqID + " in " + seqID);}
							seqID = "";
							inSeqID = false;
							if (foundStartSeqID) {
								logger.log(Level.INFO,"Read generation begun.");
								coverageStatisticsCalculator.logCoverageStatsHeader();
								generatingReads = true;
								continue;}
						} else if (generatingReads == true) {
							if (endSeqID != null) {
								if (seqID.startsWith(endSeqID)) {
									foundEndSeqID = true;
									logger.log(Level.INFO,"Found the end sequence identifier " + endSeqID + " in " + seqID);}
							}
							seqID = "";
							inSeqID = false;
							if (foundEndSeqID) {
								logger.log(Level.INFO,"Further reads generated only for nucleobases already in memory.");
								processRemainingNucleobases(GCcount);
								break;}
						}
					} else {
						seqID = seqID + currentGenotype;}
				}
				else if (generatingReads == true) {
					if (Character.isWhitespace(currentGenotypeAsInt)) {continue;}
					if (ArtificialFastqGenerationUtils.isGC(currentGenotype)) {
						GCcount = GCcount + 1.0;}
					nucleobases.add(nucleobaseAndReadFactory.createNucleobase(currentGenotype));
					nucleobasesSize = nucleobasesSize + 1;
					if (nucleobaseAndReadFactory.getNumNucleobasesCreated() % Main.captureProbeLength == 0) {
						processNucleobases(false,GCcount);
						GCcount = 0.0;}
				}
			}
			if (foundEndSeqID == false) {
				if (!foundStartSeqID) {logger.log(Level.SEVERE,"Didn't find the start sequence identifier " + startSeqID +
						" so no reads generated.");}
				else {
					processRemainingNucleobases(GCcount);
					if (endSeqID != null) {logger.log(Level.WARNING,"Didn't find the end sequence identifier " + endSeqID);}
					}
				}
		} catch (FileNotFoundException fnfe) {
			fnfe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnfe));}
		catch (IOException ioe) {
			ioe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
		} finally {
			try {
				if (inputStream != null) {
					inputStream.close();}
				//if (debugOutputStream != null) {
					//debugOutputStream.close();}
				if (leftReadOutputStream != null) {
					leftReadOutputStream.close();}
				if (rightReadOutputStream != null) {
					rightReadOutputStream.close();}
				if (readStartIndexesOutputStream != null) {
					readStartIndexesOutputStream.close();}
			} catch (IOException ioe) {
				ioe.printStackTrace();
				logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
			}
		}
	}
	
	/**
	 * Process remaining nucleobases at the end of the sequence.
	 * 
	 * @param GCcount
	 */
	
	private void processRemainingNucleobases(double GCcount) {
		processNucleobases(true,GCcount);
		coverageStatisticsCalculator.logOverallStats();
		nucleobaseAndReadFactory.logOverallErrorStats();
	}
	
	/**
	 * Top method for processing a region of nucleobases in order to set the target coverage and create reads.
	 * 	
	 * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false.
	 * @param GCcount - the GC count for the latest capture probe region.
	 */
	
	private void processNucleobases(boolean finalReadGenerationRegion, double GCcount) {
		
		long numNucleobasesCreated = nucleobaseAndReadFactory.getNumNucleobasesCreated();
		
		//Non-final region
		if (!finalReadGenerationRegion) {
			setTargetCoverage(nucleobasesSize-Main.captureProbeLength,nucleobasesSize,GCcount/Main.captureProbeLength);
			GCcount = 0.0;
			if (nucleobasesSize >= Main.nucleobaseBufferSize + allReadsGeneratedNotGuaranteed) {
				createAndWriteReadsForRegion(false);}
		}
		//Final region
		else if (finalReadGenerationRegion) {
			setTargetCoverage(nucleobasesSize-(int)(numNucleobasesCreated%Main.captureProbeLength), nucleobasesSize,
					GCcount/(numNucleobasesCreated%Main.captureProbeLength));//Set the coverage for all which haven't yet been set.
			if (Main.nucleobaseBufferSize > numNucleobasesCreated) {
				logger.log(Level.WARNING, "nucleobaseBufferSize parameter > total number of nucleobases, so resetting it to " + numNucleobasesCreated + ".");
				Main.nucleobaseBufferSize=(int)numNucleobasesCreated;}
			createAndWriteReadsForRegion(true);
		}
		
	}
	
	
	/**
	 * Top method for creating pairs of reads for a region of nucleobases: 
	 * 1. If this is the first region of nucleobases to be processed, reduce their target coverage.
	 * 2. Create the reads by calling createPairedEndReadsFrom1Nucleobase(indexInNucleobases) for each nucleobase.
	 * 3. Write out the reads for the region by calling writeReadsAndCoverageStatistics(regionStartIndex,regionEndIndex).
	 * 4. Set now redundant referents to null by calling setRedundantReferentsToNull(lastRedundantNucleobaseIndex).
	 * 
	 * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false.
	 */
	
	private void createAndWriteReadsForRegion(boolean finalReadGenerationRegion) {
		
		int regionStartIndex = 0;
		//If this is the 1st region, target coverage will need reducing.
		if (nucleobases.get(0).getUniqueID()==1) {
			int coverageReduction = nucleobases.get(0).getTargetCoverage()-1;
			for (int i=0; i<Main.readLength; i++) {
				Nucleobase nucleobase = nucleobases.get(i);
				int originalCoverage = nucleobase.getTargetCoverage();
				nucleobase.setTargetCoverage(originalCoverage-coverageReduction);
				coverageReduction = coverageReduction - 1;
				if (coverageReduction == 0) {break;}
				}
		} else {
			regionStartIndex = Main.readLength-1;}
		
		//Create reads starting from the 1st nucleobase after the last for which we definitely know all covering reads have been created.
		for (int i=generatedAllReadsIndex+1; i<nucleobases.size(); i++) {
			createPairedEndReadsFrom1Nucleobase(i);}
		generatedAllReadsIndex = nucleobasesSize - allReadsGeneratedNotGuaranteed;
		if (finalReadGenerationRegion) {generatedAllReadsIndex = nucleobasesSize;}
		
		//Write out the reads for the region.
		int regionEndIndex = regionStartIndex + Main.nucleobaseBufferSize;
		int lastRedundantNucleobaseIndex = regionEndIndex - (Main.readLength-1);
		while (regionEndIndex <= generatedAllReadsIndex) {
			writeReadsAndCoverageStatistics(regionStartIndex,regionEndIndex);
			setRedundantReferentsToNull(lastRedundantNucleobaseIndex);}
		
		//If this is the final region, make sure all the reads are written.
		if (finalReadGenerationRegion) {
			regionStartIndex = Main.readLength-1;
			if (regionStartIndex < generatedAllReadsIndex) {
				numProcessedRegions = Main.logRegionStats - 1;
				writeReadsAndCoverageStatistics(regionStartIndex,generatedAllReadsIndex);}
			setRedundantReferentsToNull(generatedAllReadsIndex);
		}
		
	}
		
	/**
	 * Loop through the different possible read sequences which include a particular nucleobase. For each sequence, treat it
	 * as a possible left read, and try to create a pair of reads by calling tryCreate1PairOfReads(leftReadStartIndex,
	 * leftReadEndIndex). The 2 possible stopping conditions are 1. the target coverage for the nucleobase has been reached;
	 * 2. no new pair of reads is created during 1 loop through all of the possible left reads.
	 * 
	 * @param indexInNucleobases - index of the nucleobase in the nucleobases ArrayList.
	 */
	
	private void createPairedEndReadsFrom1Nucleobase(int indexInNucleobases) {
	
		//Get the start and end indexes of the 1st sequence and the start and end indexes of the last.
		int firstLeftReadStartIndex = indexInNucleobases - Main.readLength + 1;
		if (firstLeftReadStartIndex < 0) {firstLeftReadStartIndex = 0;}
		int lastLeftReadStartIndex = indexInNucleobases;
		if (nucleobasesSize - lastLeftReadStartIndex < Main.readLength) {
			lastLeftReadStartIndex = nucleobasesSize - Main.readLength;}
		Nucleobase nucleobase = nucleobases.get(indexInNucleobases);
		
		while (!nucleobase.hasTargetCoverage()) {
			boolean createdNewPairOfReads = false;
			//Now try creating left and right paired ends. 
			for (int i=firstLeftReadStartIndex; i<=lastLeftReadStartIndex; i++) {
				int fullyCoveredIndex = tryToCreate1PairOfReads(i,i+Main.readLength);
				if (fullyCoveredIndex > -1) {i=fullyCoveredIndex;}
				else if (fullyCoveredIndex==-2) {
					createdNewPairOfReads = true;
					if (nucleobase.hasTargetCoverage()) {break;}
					}
			}
			if (!createdNewPairOfReads) {break;}
		}
		
	}
	
	/**
	 * Starting with the start and end index for the possible left read, try to create a pair of reads. Returns -2 if a
	 * left and right reads are created, -1 if a left read can be created but not a right, else return the index of the
	 * nucleobase in the left read sequence at which failure occurs (as a result of violating a target coverage or N-
	 * filter constraint).
	 * 
	 * @param leftReadStartIndex - the lowest possible starting index in nucleobases for a left read.
	 * @param leftReadEndIndex - the highest possible starting index in nucleobases for a left read.
	 * @return i - endcoded outcome of the attempt to create a pair of reads.
	 */
	
	private int tryToCreate1PairOfReads(int leftReadStartIndex, int leftReadEndIndex) {
		
		BabyRead babyLeftRead = new BabyRead(leftReadStartIndex,leftReadEndIndex);
		int failureIndex = babyLeftRead.failureIndex;
		if (failureIndex != -1) {
			ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //Hold Java's hand for garbage collection.
			return failureIndex;}
		
		int rightReadStartIndex = leftReadEndIndex - 1 + ArtificialFastqGenerationUtils.getRandomIntWithinRange(pairedEndGapMean,pairedEndGapSD,pairedEndGapMin,pairedEndGapMax);
		int rightReadEndIndex = rightReadStartIndex + Main.readLength;
		if (!(rightReadStartIndex > nucleobasesSize-1) && !(rightReadEndIndex > nucleobasesSize)) {
			BabyRead babyRightRead = new BabyRead(rightReadStartIndex,rightReadEndIndex);
			failureIndex = babyRightRead.failureIndex;
			if (failureIndex != -1) {
				ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection.
				ArtificialFastqGenerationUtils.setRedundantReadToNull(babyRightRead); //For garbage collection.
				return -1;}
			//The L and R read sequences don't violate target coverage or N-filter constraints, so create the paired-end reads.
			nucleobaseAndReadFactory.createPairedEndReads(babyLeftRead.getSequence(),babyRightRead.getSequence());
		} else {
			ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection.
			return -1;}
		
		return -2;
	}
	
	/**
	 * Write the left and right reads to the left and right fastq files respectively, and coverage statistics to the log
	 * file. 
	 * 
	 * @param startIndex - the index in nucleobases of the 1st nucleobase to write reads and coverage statistics for.
	 * @param endIndex - the index in nucleobases of the last nucleobase to write reads and coverage statistics for.
	 */
	
	private void writeReadsAndCoverageStatistics(int startIndex, int endIndex) {	
		
		try {
			for (int i=startIndex; i<endIndex; i++) {
				Nucleobase nucleobase = nucleobases.get(i);
				coverageStatisticsCalculator.updateRegionStatsFrom1Nucleobase(nucleobase);
				ArrayList<Read> reads = nucleobase.getCoveringReads();
				//debugOutputStream.write(nucleobase.toString() + " " + reads + "\n");
				for (Read leftEndRead : reads) {
					if (leftEndRead.isLeftReadAndNeedsWriting()) {
						leftReadOutputStream.write(leftEndRead.toString());
						Read rightEndRead = leftEndRead.getPairedEndRead();
						rightReadOutputStream.write(rightEndRead.toString());
						readStartIndexesOutputStream.write(leftEndRead.getSequence().get(0).getUniqueID() 
								+ "," + rightEndRead.getSequence().get(0).getUniqueID() + "\n");
						leftEndRead.setLeftReadAndNeedsWriting(false);}
				}
			}
			//debugOutputStream.flush();
			leftReadOutputStream.flush();
			rightReadOutputStream.flush();
			readStartIndexesOutputStream.flush();
		} catch (IOException ioe) {
			ioe.printStackTrace();
			ArtificialFastqGenerationUtils.getStackTraceString(ioe);
		}
		
		numProcessedRegions = numProcessedRegions + 1;
		if (numProcessedRegions == Main.logRegionStats) {
			coverageStatisticsCalculator.logRegionStats();
			numProcessedRegions = 0;
		}
		
	}

	/***
	 * Set target coverage for nucleobases. If coverageBasedOnGCcontent is true, calculate the target coverage based on GC
	 * content. 
	 * 
	 * @param startIndex - index of the 1st nucleobase in nucleobases for which to set target coverage.
	 * @param endIndex - index of the last nucleobase in nucleobases for which to set target coverage.
	 * @param GCcontent - GCcontent of this capture probe region.
	 */
	
	private void setTargetCoverage(int startIndex, int endIndex, double GCcontent) {
		
		int coverage = (int) Main.coverageMeanPeak + 1; 
		if (Main.coverageBasedOnGCcontent == true) {
			//coverage = ArtificialFastqGenerationUtils.calculateTargetCoverageFromGCcontent(GCcontent);			
			coverage = ArtificialFastqGenerationUtils.getTargetCoverageViaGaussFuncOfGCcont(GCcontent, Main.coverageMeanPeak,
					Main.coverageMeanPeakGCcontent, Main.coverageMeanGCcontentSpread, Main.coverageSD);
		}
		for (int i=startIndex; i<endIndex; i++) {
			nucleobases.get(i).setTargetCoverage(coverage);
		}
		
	}

	/**
	 * Set references to redundant nucleobases and reads to null so that they will be garbage collected.
	 * 
	 * @param lastRedundantNucleobaseIndex
	 */
	
	
	private void setRedundantReferentsToNull(int lastRedundantNucleobaseIndex) {
		
		for (int i=0; i<lastRedundantNucleobaseIndex; i++) {
			Nucleobase redundantNucleobase = nucleobases.get(0);
			ArrayList<Read> reads = redundantNucleobase.getCoveringReads();
			for (Read redundantRead : reads) {
				ArtificialFastqGenerationUtils.setRedundantReadToNull(redundantRead);
			}
			nucleobases.remove(0);
			nucleobasesSize = nucleobasesSize - 1;
			generatedAllReadsIndex = generatedAllReadsIndex - 1;
			redundantNucleobase = null;
		}
		//Runtime.getRuntime().gc();
		
	}	
	
	/**
	 * A sequence of nucleobases which may be used to create a read. Has fields for (1) the sequence, (2) the index in the
	 * ArrayList nucleobases of the first nucleobase to prohibit read generation, either due to its target coverage
	 * constraint or the readsContainingN filter. A failure index of -1 means there was no failure.
	 * 
	 * @author mframpton
	 */
	
	public class BabyRead extends AbstractRead {
		
		private boolean allNs;
		private int failureIndex;
		
		public BabyRead(int startIndexInNucleobases, int endIndexInNucleobases) {

			sequence = new ArrayList<Nucleobase>();
			allNs = true;
			failureIndex = -1;
			createSequence(startIndexInNucleobases, endIndexInNucleobases);
	
		}
		
		public void createSequence(int startIndexInNucleobases, int endIndexInNucleobases) {
			
			for (int i=startIndexInNucleobases; i<endIndexInNucleobases; i++) {
				Nucleobase nucleobaseForRead = nucleobases.get(i);
				char genotype = nucleobaseForRead.getGenotype();
				if (genotype != 'N') 
					{allNs = false;}
				if (nucleobaseForRead.hasTargetCoverage()) {
					this.failureIndex = i;
					break;}
				else if (Main.readsContainingNfilter == 2 && genotype == 'N') {
					this.failureIndex = i;
					break;}
				this.sequence.add(nucleobaseForRead);
			}
			if (allNs == true && Main.readsContainingNfilter == 1) {
				failureIndex = startIndexInNucleobases;}
			
		}
		
	}
	
}