package artificialFastqGenerator; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; /** * An instance of this class can be used to generate artificial FASTQ files from the human reference genome. * * Copyright (C) 2012 The Institute of Cancer Research (ICR). * * This file is part of ArtificialFastqGenerator v1.0.0. * * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any * later version. * * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. * * You should have received a copy of the GNU Public License along with this program. If not, see * * * Authour's contact email: Matthew.Frampton@icr.ac.uk */ public class ArtificialFastqGenerator { //Field for input stream private BufferedReader inputStream; //Fields for the start and end sequence identifiers private String startSeqID, endSeqID; //PairedEnd gap parameters private int pairedEndGapMean, pairedEndGapMax, pairedEndGapMin; private double pairedEndGapSD; //Fields involved in read generation and logging. generatedAllReadsIndex holds the index of the last nucleobase for which all reads have been generated. private ArrayList nucleobases; private int nucleobasesSize, generatedAllReadsIndex, allReadsGeneratedNotGuaranteed, numProcessedRegions; private NucleobaseAndReadFactory nucleobaseAndReadFactory; private CoverageStatisticsCalculator coverageStatisticsCalculator; //Fields output streams. private BufferedWriter leftReadOutputStream, rightReadOutputStream, readStartIndexesOutputStream; //debugOutputStream; //Logger private Logger logger; public ArtificialFastqGenerator() { inputStream = null; startSeqID = Main.startSequenceIdentifier; endSeqID = Main.endSequenceIdentifier; pairedEndGapMean = Main.templateLengthMean-(2*Main.readLength)+1; pairedEndGapSD = Main.templateLengthSD; pairedEndGapMax = (int) Math.round(pairedEndGapMean + 2*pairedEndGapSD); pairedEndGapMin = (int) Math.round(pairedEndGapMean - pairedEndGapSD); if (pairedEndGapMin < 2) { pairedEndGapMin = 2;} nucleobases = new ArrayList(); nucleobasesSize = 0; generatedAllReadsIndex = -1; allReadsGeneratedNotGuaranteed = 2*Main.readLength + pairedEndGapMax - 2; numProcessedRegions = 0; nucleobaseAndReadFactory = new NucleobaseAndReadFactory(Main.startingX, Main.startingY, Main.readLength, Main.useRealQualityScores, Main.simulateErrorInRead, Main.fastq1ForQualityScores, Main.fastq2ForQualityScores); coverageStatisticsCalculator = new CoverageStatisticsCalculator(); //debugOutputStream = null; leftReadOutputStream = null; rightReadOutputStream = null; readStartIndexesOutputStream = null; logger = Main.logger; } /** * Main method for generating FASTQ files. * * @param referenceGenomePath */ public void generateArtificialFastqs(String referenceGenomePath) { boolean inSeqID = false; String seqID = ""; boolean generatingReads = false; try { //Input stream inputStream = new BufferedReader(new FileReader(referenceGenomePath)); //Output streams leftReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".1.fastq")); rightReadOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".2.fastq")); readStartIndexesOutputStream = new BufferedWriter(new FileWriter(Main.outputPath + ".readStartIndexes.txt")); int currentGenotypeAsInt; char currentGenotype; double GCcount = 0.0; boolean foundStartSeqID = false; boolean foundEndSeqID = false; while ((currentGenotypeAsInt = inputStream.read()) != -1) { currentGenotype = (char) currentGenotypeAsInt; if (currentGenotype == '>') { inSeqID = true;} if (inSeqID == true) { if (currentGenotype == '\n') { if (generatingReads == false) { if (seqID.startsWith(startSeqID)) { foundStartSeqID = true; logger.log(Level.INFO,"Found the start sequence ID str " + startSeqID + " in " + seqID);} seqID = ""; inSeqID = false; if (foundStartSeqID) { logger.log(Level.INFO,"Read generation begun."); coverageStatisticsCalculator.logCoverageStatsHeader(); generatingReads = true; continue;} } else if (generatingReads == true) { if (endSeqID != null) { if (seqID.startsWith(endSeqID)) { foundEndSeqID = true; logger.log(Level.INFO,"Found the end sequence identifier " + endSeqID + " in " + seqID);} } seqID = ""; inSeqID = false; if (foundEndSeqID) { logger.log(Level.INFO,"Further reads generated only for nucleobases already in memory."); processRemainingNucleobases(GCcount); break;} } } else { seqID = seqID + currentGenotype;} } else if (generatingReads == true) { if (Character.isWhitespace(currentGenotypeAsInt)) {continue;} if (ArtificialFastqGenerationUtils.isGC(currentGenotype)) { GCcount = GCcount + 1.0;} nucleobases.add(nucleobaseAndReadFactory.createNucleobase(currentGenotype)); nucleobasesSize = nucleobasesSize + 1; if (nucleobaseAndReadFactory.getNumNucleobasesCreated() % Main.captureProbeLength == 0) { processNucleobases(false,GCcount); GCcount = 0.0;} } } if (foundEndSeqID == false) { if (!foundStartSeqID) {logger.log(Level.SEVERE,"Didn't find the start sequence identifier " + startSeqID + " so no reads generated.");} else { processRemainingNucleobases(GCcount); if (endSeqID != null) {logger.log(Level.WARNING,"Didn't find the end sequence identifier " + endSeqID);} } } } catch (FileNotFoundException fnfe) { fnfe.printStackTrace(); logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnfe));} catch (IOException ioe) { ioe.printStackTrace(); logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe)); } finally { try { if (inputStream != null) { inputStream.close();} //if (debugOutputStream != null) { //debugOutputStream.close();} if (leftReadOutputStream != null) { leftReadOutputStream.close();} if (rightReadOutputStream != null) { rightReadOutputStream.close();} if (readStartIndexesOutputStream != null) { readStartIndexesOutputStream.close();} } catch (IOException ioe) { ioe.printStackTrace(); logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe)); } } } /** * Process remaining nucleobases at the end of the sequence. * * @param GCcount */ private void processRemainingNucleobases(double GCcount) { processNucleobases(true,GCcount); coverageStatisticsCalculator.logOverallStats(); nucleobaseAndReadFactory.logOverallErrorStats(); } /** * Top method for processing a region of nucleobases in order to set the target coverage and create reads. * * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false. * @param GCcount - the GC count for the latest capture probe region. */ private void processNucleobases(boolean finalReadGenerationRegion, double GCcount) { long numNucleobasesCreated = nucleobaseAndReadFactory.getNumNucleobasesCreated(); //Non-final region if (!finalReadGenerationRegion) { setTargetCoverage(nucleobasesSize-Main.captureProbeLength,nucleobasesSize,GCcount/Main.captureProbeLength); GCcount = 0.0; if (nucleobasesSize >= Main.nucleobaseBufferSize + allReadsGeneratedNotGuaranteed) { createAndWriteReadsForRegion(false);} } //Final region else if (finalReadGenerationRegion) { setTargetCoverage(nucleobasesSize-(int)(numNucleobasesCreated%Main.captureProbeLength), nucleobasesSize, GCcount/(numNucleobasesCreated%Main.captureProbeLength));//Set the coverage for all which haven't yet been set. if (Main.nucleobaseBufferSize > numNucleobasesCreated) { logger.log(Level.WARNING, "nucleobaseBufferSize parameter > total number of nucleobases, so resetting it to " + numNucleobasesCreated + "."); Main.nucleobaseBufferSize=(int)numNucleobasesCreated;} createAndWriteReadsForRegion(true); } } /** * Top method for creating pairs of reads for a region of nucleobases: * 1. If this is the first region of nucleobases to be processed, reduce their target coverage. * 2. Create the reads by calling createPairedEndReadsFrom1Nucleobase(indexInNucleobases) for each nucleobase. * 3. Write out the reads for the region by calling writeReadsAndCoverageStatistics(regionStartIndex,regionEndIndex). * 4. Set now redundant referents to null by calling setRedundantReferentsToNull(lastRedundantNucleobaseIndex). * * @param finalReadGenerationRegion - true if this is the final region of nucleobases to process, else false. */ private void createAndWriteReadsForRegion(boolean finalReadGenerationRegion) { int regionStartIndex = 0; //If this is the 1st region, target coverage will need reducing. if (nucleobases.get(0).getUniqueID()==1) { int coverageReduction = nucleobases.get(0).getTargetCoverage()-1; for (int i=0; i -1) {i=fullyCoveredIndex;} else if (fullyCoveredIndex==-2) { createdNewPairOfReads = true; if (nucleobase.hasTargetCoverage()) {break;} } } if (!createdNewPairOfReads) {break;} } } /** * Starting with the start and end index for the possible left read, try to create a pair of reads. Returns -2 if a * left and right reads are created, -1 if a left read can be created but not a right, else return the index of the * nucleobase in the left read sequence at which failure occurs (as a result of violating a target coverage or N- * filter constraint). * * @param leftReadStartIndex - the lowest possible starting index in nucleobases for a left read. * @param leftReadEndIndex - the highest possible starting index in nucleobases for a left read. * @return i - endcoded outcome of the attempt to create a pair of reads. */ private int tryToCreate1PairOfReads(int leftReadStartIndex, int leftReadEndIndex) { BabyRead babyLeftRead = new BabyRead(leftReadStartIndex,leftReadEndIndex); int failureIndex = babyLeftRead.failureIndex; if (failureIndex != -1) { ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //Hold Java's hand for garbage collection. return failureIndex;} int rightReadStartIndex = leftReadEndIndex - 1 + ArtificialFastqGenerationUtils.getRandomIntWithinRange(pairedEndGapMean,pairedEndGapSD,pairedEndGapMin,pairedEndGapMax); int rightReadEndIndex = rightReadStartIndex + Main.readLength; if (!(rightReadStartIndex > nucleobasesSize-1) && !(rightReadEndIndex > nucleobasesSize)) { BabyRead babyRightRead = new BabyRead(rightReadStartIndex,rightReadEndIndex); failureIndex = babyRightRead.failureIndex; if (failureIndex != -1) { ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection. ArtificialFastqGenerationUtils.setRedundantReadToNull(babyRightRead); //For garbage collection. return -1;} //The L and R read sequences don't violate target coverage or N-filter constraints, so create the paired-end reads. nucleobaseAndReadFactory.createPairedEndReads(babyLeftRead.getSequence(),babyRightRead.getSequence()); } else { ArtificialFastqGenerationUtils.setRedundantReadToNull(babyLeftRead); //For garbage collection. return -1;} return -2; } /** * Write the left and right reads to the left and right fastq files respectively, and coverage statistics to the log * file. * * @param startIndex - the index in nucleobases of the 1st nucleobase to write reads and coverage statistics for. * @param endIndex - the index in nucleobases of the last nucleobase to write reads and coverage statistics for. */ private void writeReadsAndCoverageStatistics(int startIndex, int endIndex) { try { for (int i=startIndex; i reads = nucleobase.getCoveringReads(); //debugOutputStream.write(nucleobase.toString() + " " + reads + "\n"); for (Read leftEndRead : reads) { if (leftEndRead.isLeftReadAndNeedsWriting()) { leftReadOutputStream.write(leftEndRead.toString()); Read rightEndRead = leftEndRead.getPairedEndRead(); rightReadOutputStream.write(rightEndRead.toString()); readStartIndexesOutputStream.write(leftEndRead.getSequence().get(0).getUniqueID() + "," + rightEndRead.getSequence().get(0).getUniqueID() + "\n"); leftEndRead.setLeftReadAndNeedsWriting(false);} } } //debugOutputStream.flush(); leftReadOutputStream.flush(); rightReadOutputStream.flush(); readStartIndexesOutputStream.flush(); } catch (IOException ioe) { ioe.printStackTrace(); ArtificialFastqGenerationUtils.getStackTraceString(ioe); } numProcessedRegions = numProcessedRegions + 1; if (numProcessedRegions == Main.logRegionStats) { coverageStatisticsCalculator.logRegionStats(); numProcessedRegions = 0; } } /*** * Set target coverage for nucleobases. If coverageBasedOnGCcontent is true, calculate the target coverage based on GC * content. * * @param startIndex - index of the 1st nucleobase in nucleobases for which to set target coverage. * @param endIndex - index of the last nucleobase in nucleobases for which to set target coverage. * @param GCcontent - GCcontent of this capture probe region. */ private void setTargetCoverage(int startIndex, int endIndex, double GCcontent) { int coverage = (int) Main.coverageMeanPeak + 1; if (Main.coverageBasedOnGCcontent == true) { //coverage = ArtificialFastqGenerationUtils.calculateTargetCoverageFromGCcontent(GCcontent); coverage = ArtificialFastqGenerationUtils.getTargetCoverageViaGaussFuncOfGCcont(GCcontent, Main.coverageMeanPeak, Main.coverageMeanPeakGCcontent, Main.coverageMeanGCcontentSpread, Main.coverageSD); } for (int i=startIndex; i reads = redundantNucleobase.getCoveringReads(); for (Read redundantRead : reads) { ArtificialFastqGenerationUtils.setRedundantReadToNull(redundantRead); } nucleobases.remove(0); nucleobasesSize = nucleobasesSize - 1; generatedAllReadsIndex = generatedAllReadsIndex - 1; redundantNucleobase = null; } //Runtime.getRuntime().gc(); } /** * A sequence of nucleobases which may be used to create a read. Has fields for (1) the sequence, (2) the index in the * ArrayList nucleobases of the first nucleobase to prohibit read generation, either due to its target coverage * constraint or the readsContainingN filter. A failure index of -1 means there was no failure. * * @author mframpton */ public class BabyRead extends AbstractRead { private boolean allNs; private int failureIndex; public BabyRead(int startIndexInNucleobases, int endIndexInNucleobases) { sequence = new ArrayList(); allNs = true; failureIndex = -1; createSequence(startIndexInNucleobases, endIndexInNucleobases); } public void createSequence(int startIndexInNucleobases, int endIndexInNucleobases) { for (int i=startIndexInNucleobases; i