package artificialFastqGenerator;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * An instance of this class can be used to generate (1) ASCII encodings of Phred quality scores for bases in reads; (2) 
 * base-call errors. (1) is done by taking quality scores from a pre-existing FASTQ file, and (2) by generating an error with
 * a probability that depends on the base's phred score.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see
 * <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class QualityScoreAndErrorGenerator {

	public static final char[] encodedSangerQualityScores = 
			"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();
	private BufferedReader fastq1ForQualityScores;
	private BufferedReader fastq2ForQualityScores;
	private long numBaseCallErrors;
	private long numACGTBaseCalls;
	private HashMap<Character, Double> probOfErrorByEncodedQualityScore;
	private char[] highestQualityScores;
	private boolean useRealQualityScores;
	private boolean simulateErrorInRead;
	private int readLength;
	private String fastq1ForQualityScoresPath;
	private String fastq2ForQualityScoresPath;
	private Logger logger;
	
	/**
	 * Initialise a new QualityScoreAndErrorGenerator object. 
	 * 
	 * @param readLength - the length of each read.
	 * @param useRealQualityScores - true if using quality scores from an existing fastq file, else false.
	 * @param simulateErrorInRead - true if simulating error in reads, else false.
	 * @param fastq1ForQualityScoresPath - the 1st fastq file whose quality scores are used.
	 * @param fastq2ForQualityScoresPath - the 2nd fastq file whose quality scores are used.
	 */
	
	public QualityScoreAndErrorGenerator(int readLength, boolean useRealQualityScores, boolean simulateErrorInRead,
			String fastq1ForQualityScoresPath, String fastq2ForQualityScoresPath) {
		
		fastq1ForQualityScores = null;
		fastq2ForQualityScores = null;
		numBaseCallErrors = 0;
		numACGTBaseCalls = 0;
		probOfErrorByEncodedQualityScore = new HashMap<Character, Double>();
		//Fill probOfErrorByEncodedQualityScore...
		for (int i=0; i<encodedSangerQualityScores.length; i++) {
			Double probabilityOfError = getEstimatedBaseCallErrorProbability(encodedSangerQualityScores[i]);
			probOfErrorByEncodedQualityScore.put(encodedSangerQualityScores[i],probabilityOfError);
		}
		highestQualityScores = new char[readLength];
		//Fill highestQualityScores...
		for (int i=0; i<highestQualityScores.length; i++) {
			//highestQualityScores[i] = encodedSangerQualityScores[encodedSangerQualityScores.length-1];
			highestQualityScores[i] = encodedSangerQualityScores[40];
		}
		this.useRealQualityScores = useRealQualityScores;
		this.simulateErrorInRead = simulateErrorInRead;
		this.readLength = readLength;
		this.fastq1ForQualityScoresPath = fastq1ForQualityScoresPath;
		this.fastq2ForQualityScoresPath = fastq2ForQualityScoresPath;
		logger = Main.logger;
	}
	
	/**
	 * PHRED = -10 x log_10(P_e) (see Cock et al. "The Sanger FASTQ file...") so 
	 * P_e = 10^{PHRED/-10}, where P_e means estimated probability of an error. 
	 * 
	 * @param encodedQualityScore - an encoded Sanger Phred quality score.
	 * @return estimatedBaseCallErrorProbability - the estimated probability of a base call error.
	 */
	
	private double getEstimatedBaseCallErrorProbability(char encodedQualityScore) {
		
		int phredQualityScore = 0;
		for (int i=0; i <  encodedSangerQualityScores.length; i++) {
			if (encodedSangerQualityScores[i] == encodedQualityScore) {
				phredQualityScore = i;
				break;}
		}
	
		double phredQualityScoreAsDouble = (double) phredQualityScore;
		return Math.pow(10.0,phredQualityScoreAsDouble/-10.0); 
	}
	
	/**
	 * Get the encoded phred quality scores and the genotypes read. If the read is right-end, then it should align to the 
	 * -ive strand, so reverse nucleobases, and read their complements.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @param nucleobases - human reference sequence nucleobases which the read is a read of.
	 * @return finalQualityScoresAndGenotypesRead - an ArrayList of char arrays containing the final quality scores and the 
	 * genotypes that get read.
	 */
	
	public ArrayList<char[]> getQualityScoresAndGenotypesRead(boolean isLeft, ArrayList<Nucleobase> nucleobases) {
		
		if (isLeft == false) {
			Collections.reverse(nucleobases);
		}
		
		char[] initialQualityScores = getInitialQualityScores(isLeft);
		char[] genotypesRead = new char[readLength];
		char[] finalQualityScores = new char[readLength];
		
		for (int i=0; i<nucleobases.size(); i++) {
			Nucleobase nucleobase = nucleobases.get(i);
			char correctGenotype = nucleobase.getGenotype();
			if (isLeft == false) {
				correctGenotype = nucleobase.getComplementGenotype();}
			if (ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {numACGTBaseCalls++;}
			if (correctGenotype == 'N') {
				genotypesRead[i] = correctGenotype;
				finalQualityScores[i] = '#';
			} else {
				if (!simulateErrorInRead) {
					genotypesRead[i] = correctGenotype;
				} else {
					genotypesRead[i] = simulate1BaseCallError(correctGenotype, initialQualityScores[i]);
				}
				finalQualityScores[i] = initialQualityScores[i];
			}	
		}
		
		ArrayList<char[]> finalQualityScoresAndGenotypesRead = new ArrayList<char[]>();
		finalQualityScoresAndGenotypesRead.add(finalQualityScores);
		finalQualityScoresAndGenotypesRead.add(genotypesRead);
		return finalQualityScoresAndGenotypesRead;	
		
	}

	/**
	 * Get the initial quality scores. These are initial because there maybe be one or more quality scores
	 * which are not poor, and correspond to an 'N'. This is changed in
	 * getQualityScoresAndGenotypesRead(boolean, ArrayList<Nucleobase>, boolean, boolean).
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @return qualityScores - the initial Sanger format encoded phred quality scores for the read.
	 */
	
	private char[] getInitialQualityScores(boolean isLeft) {
		
		char[] qualityScores = new char[readLength];
		
		if (!useRealQualityScores) {
			//return max confidence.
			qualityScores = highestQualityScores;
		} else {
			qualityScores = getQualityScoresForNextReadFromFastq(isLeft);
		}
		
		return qualityScores;
	}

	/**
	 * Get the next read's quality scores from a pre-existing fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 * @return qualityScoresCharArray - the Sanger format encoded phred quality scores for the read.
	 */

	private char[] getQualityScoresForNextReadFromFastq(boolean isLeft) {

		BufferedReader fastqForQualityScores = null;
		if (isLeft) {
			if (fastq1ForQualityScores == null) {
				openPreexistingFastqFile(isLeft);}
			fastqForQualityScores = fastq1ForQualityScores;
		} else {
			if (fastq2ForQualityScores == null) {
				openPreexistingFastqFile(isLeft);}
			fastqForQualityScores = fastq2ForQualityScores;
		}

		String qualityScores = null;
		while (qualityScores == null) {
			String fastqLine = null;
			try {
				boolean lineContainsConfidenceScores = false;
				while ((fastqLine = fastqForQualityScores.readLine()) != null) {				
					if (lineContainsConfidenceScores) {
						qualityScores = fastqLine;
						break;
					}
					if (fastqLine.startsWith("+")) {
						lineContainsConfidenceScores = true;
					} else {
						lineContainsConfidenceScores = false;
					}
				}
			} catch (IOException ioe) {
				ioe.printStackTrace();
				logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
			} finally {
				if (fastqLine == null) {
					closePreexistingFastqFile(isLeft);
					openPreexistingFastqFile(isLeft);
					if (isLeft) {
						fastqForQualityScores = fastq1ForQualityScores;}
					else {
						fastqForQualityScores = fastq2ForQualityScores;}
					} 
				}
			}
		char[] qualityScoresCharArray = qualityScores.replace("\\","").toCharArray();
		if (qualityScoresCharArray.length != readLength) {
			qualityScoresCharArray = addOrRemoveQualityScores(qualityScoresCharArray);
		}
		return qualityScoresCharArray;
	}

	/**
	 * Open a preexisting fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 */
	
	private void openPreexistingFastqFile(boolean isLeft) {
	
		try {
			if (isLeft) {
				fastq1ForQualityScores = new BufferedReader(new FileReader(fastq1ForQualityScoresPath));
			} else {
				fastq2ForQualityScores = new BufferedReader(new FileReader(fastq2ForQualityScoresPath));
			}
		} catch (FileNotFoundException fnf) {
			fnf.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(fnf));
		}
	}
	
	/**
	 * Close a preexisting fastq file.
	 * 
	 * @param isLeft - true if the read is a left read, else false.
	 */
	
	private void closePreexistingFastqFile(boolean isLeft) {
		
		try {
			if (isLeft) {
				fastq1ForQualityScores.close();
			} else {
				fastq2ForQualityScores.close();
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
		}
		
	}
	
	
	/**
	 * Increase/decrease the size of a sequence of real quality scores so that it is
	 * the same size as the readLength.
	 * 
	 * @param qualityScores - original quality scores.
	 * @return newQualityScores -length-adjusted quality scores.
	 */
	
	private char[] addOrRemoveQualityScores(char[] qualityScores) {
		
		logger.log(Level.WARNING,"Real quality scores are not length " + readLength);
		char[] newQualityScores = new char[readLength];	
		int diffInLength = qualityScores.length - readLength;
		int i=0; //index for qualityScores.
		int j=0; //index for newQualityScores.
		
		while (j < readLength) {
			if (diffInLength > 0) {//Quality scores is too big, so ignore the 1st diffInLength characters.
				i++;
				diffInLength--;
			} else if (diffInLength < 0) {//Quality scores is too small, so duplicate the 1st char \times diffInLength.
				newQualityScores[j] = qualityScores[0];
				j++;
				diffInLength++;
			} else {
				newQualityScores[j] = qualityScores[i];
				i++;
				j++;
			}
		}
		
		return newQualityScores;
	
	}
	
	/**
	 * Decide whether to generate an error based on the quality score. If yes
	 * choose 1 of the 3 alternate genotypes with equal probability.
	 * 
	 * @param correctGenotype - the genotype of a nucleobase in the human reference sequence.
	 * @param qualityScore - the quality score in the read for this genotype.
	 * @return incorrectGenotype - the genotype that is read (possible incorrect).
	 */
	
	private char simulate1BaseCallError(char correctGenotype, char qualityScore) {
	
		//if (!probOfErrorByEncodedQualityScore.containsKey(qualityScore)) {
			//logger.log(Level.WARNING, qualityScore + " not in map.");
		//}
		//1st decide if we're going to simulate an error.
		//Only simulate errors for A/C/G/T.
		if (!ArtificialFastqGenerationUtils.isACGT(correctGenotype)) {
			return correctGenotype;
		}
		double randomNumber = ArtificialFastqGenerationUtils.generateRandomDoubleBetween0And1();
		double estimatedBaseCallErrorProbability = probOfErrorByEncodedQualityScore.get(qualityScore);
		if (randomNumber >= estimatedBaseCallErrorProbability) {
			return correctGenotype;}
		
		//If we are simulating an error, then decide which alternate base to return.
		//System.out.println("Simulating an error");
		numBaseCallErrors = numBaseCallErrors + 1;
		//Get the alternate genotypes.
		char[] alternateGenotypes = ArtificialFastqGenerationUtils.getAlternateGenotypes(correctGenotype);
		char incorrectGenotype = correctGenotype;
		if (randomNumber <= estimatedBaseCallErrorProbability/3.0) {
			incorrectGenotype = alternateGenotypes[0];} 
		else if (randomNumber <= estimatedBaseCallErrorProbability/1.5) {
			incorrectGenotype = alternateGenotypes[1];}
		else {incorrectGenotype = alternateGenotypes[2];}
		
		return incorrectGenotype;
	}

	/**
	 * Get the number of base call errors that have been simulated.
	 * 
	 * @return numBaseCallErrors - the number of bases for which base calls have been simulated.
	 */
	
	public long getNumBaseCallErrors() {
		return numBaseCallErrors;
	}
	
	/**
	 * Reset the number of simulated base call errors to 0.
	 */
	
	public void resetNumBaseCallErrors() {
		numBaseCallErrors = 0;
	}
	
	/**
	 * Get the number of ACGT bases.
	 * 
	 * @return numACGTBaseCalls - the number of ACGT bases that have been called.
	 */
	
	public long getNumACGTBasesCalled() {
		return numACGTBaseCalls;
	}
	
}
