File: Genomes.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-5
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 4,952 kB
sloc: java: 1,481; sh: 43; makefile: 13
file content (311 lines) | stat: -rw-r--r-- 13,257 bytes
parent folder | download | duplicates (5)
package artificialFastqGenerator;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * The Genomes class provides facilities for (1) creating a toy genome, (2) getting the start and end strings for a
 * chromosome in the human reference genome, (3) analysing and producing summary statistics for the human reference genome.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see 
 * <http://www.gnu.org/licenses/>
 * 
 * Author's contact email: Matthew.Frampton@icr.ac.uk
 */

public class Genomes {
	
	// Header lines for the chromosomes of the human reference genome (build GRCh37, per the strings themselves):
	// chromosomes 1-22, then X, then Y. Each begins with '>' followed by the chromosome name. createChromosomeArray()
	// returns them in this same order, and getChromosomeStartEndStrs() uses the last few characters of a header as a
	// marker for that chromosome.
	public static final String HRGChromosome1Str = ">1 dna:chromosome chromosome:GRCh37:1:1:249250621:1";
	public static final String HRGChromosome2Str = ">2 dna:chromosome chromosome:GRCh37:2:1:243199373:1";
	public static final String HRGChromosome3Str = ">3 dna:chromosome chromosome:GRCh37:3:1:198022430:1";
	public static final String HRGChromosome4Str = ">4 dna:chromosome chromosome:GRCh37:4:1:191154276:1";
	public static final String HRGChromosome5Str = ">5 dna:chromosome chromosome:GRCh37:5:1:180915260:1";
	public static final String HRGChromosome6Str = ">6 dna:chromosome chromosome:GRCh37:6:1:171115067:1";
	public static final String HRGChromosome7Str = ">7 dna:chromosome chromosome:GRCh37:7:1:159138663:1";
	public static final String HRGChromosome8Str = ">8 dna:chromosome chromosome:GRCh37:8:1:146364022:1";
	public static final String HRGChromosome9Str = ">9 dna:chromosome chromosome:GRCh37:9:1:141213431:1";
	public static final String HRGChromosome10Str = ">10 dna:chromosome chromosome:GRCh37:10:1:135534747:1";
	public static final String HRGChromosome11Str = ">11 dna:chromosome chromosome:GRCh37:11:1:135006516:1";
	// Headers 12 and 21 end in the same five characters ("895:1"); getChromosomeStartEndStrs() takes six for these two.
	public static final String HRGChromosome12Str = ">12 dna:chromosome chromosome:GRCh37:12:1:133851895:1";
	public static final String HRGChromosome13Str = ">13 dna:chromosome chromosome:GRCh37:13:1:115169878:1";
	public static final String HRGChromosome14Str = ">14 dna:chromosome chromosome:GRCh37:14:1:107349540:1";
	public static final String HRGChromosome15Str = ">15 dna:chromosome chromosome:GRCh37:15:1:102531392:1";
	public static final String HRGChromosome16Str = ">16 dna:chromosome chromosome:GRCh37:16:1:90354753:1";
	public static final String HRGChromosome17Str = ">17 dna:chromosome chromosome:GRCh37:17:1:81195210:1";
	public static final String HRGChromosome18Str = ">18 dna:chromosome chromosome:GRCh37:18:1:78077248:1";
	public static final String HRGChromosome19Str = ">19 dna:chromosome chromosome:GRCh37:19:1:59128983:1";
	public static final String HRGChromosome20Str = ">20 dna:chromosome chromosome:GRCh37:20:1:63025520:1";
	public static final String HRGChromosome21Str = ">21 dna:chromosome chromosome:GRCh37:21:1:48129895:1";
	public static final String HRGChromosome22Str = ">22 dna:chromosome chromosome:GRCh37:22:1:51304566:1";
	public static final String HRGChromosomeXStr = ">X dna:chromosome chromosome:GRCh37:X:1:155270560:1";
	// Unlike the other headers, the numeric range in the Y header does not begin at 1.
	public static final String HRGChromosomeYStr = ">Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1";
	
	// Logger taken from Main when this class is initialised; generateToyGenome() logs I/O failures to it at SEVERE.
	public static Logger logger = Main.logger;
	
	/**
	 * Generate a toy genome of randomly chosen nucleobases (A, C, G, T) and write it to a file.
	 * 
	 * Bases are written 100 to a line. If chromosome header lines are requested, the next header from
	 * createChromosomeArray() is written on its own line before base 0 and before every further base whose index is a
	 * multiple of chromosomeLineIntervals, until all headers have been used; the 100-per-line count restarts after
	 * each header. The output file is always closed before this method returns. An IOException is printed and logged
	 * at SEVERE level rather than propagated.
	 * 
	 * @param outFilePath - the path for the outputted toy genome.
	 * @param length - the number of nucleobases to go into the toy genome.
	 * @param printChromosomeLines - Whether to write the header lines for each chromosome.
	 * @param chromosomeLineIntervals - the number of nucleobases to go into each chromosome in the toy genome. Must not be
	 *  0 when printChromosomeLines is true.
	 * @throws IllegalArgumentException if printChromosomeLines is true and chromosomeLineIntervals is 0.
	 */
	public static void generateToyGenome(String outFilePath, int length, boolean printChromosomeLines, 
			int chromosomeLineIntervals) {
		
		// Reject a zero interval before the output file is created/truncated; it would otherwise fail with an
		// ArithmeticException on the first modulo operation below.
		if (printChromosomeLines && chromosomeLineIntervals == 0) {
			throw new IllegalArgumentException("chromosomeLineIntervals must not be 0 when printChromosomeLines is true");
		}
		
		final char[] bases = {'A', 'C', 'G', 'T'};
		final int basesPerLine = 100;
		String[] chromosomeStrs = createChromosomeArray();
		int chromosomeIndex = 0;
		//Number of bases written since the last chromosome header (or since the start of the file).
		int numNucleobases = 0;
		//True when the last thing written was a base, i.e. the current output line is unfinished.
		boolean midLine = false;
		
		//try-with-resources flushes and closes the writer on every path, including when a write fails.
		try (BufferedWriter outputStream = new BufferedWriter(new FileWriter(outFilePath))) {
			Random randNumGenerator = new Random();
			for (int i = 0; i < length; i++) {
				if (printChromosomeLines && (i % chromosomeLineIntervals) == 0 && chromosomeIndex < chromosomeStrs.length) {
					//Finish any partially filled line of bases so that the header starts on a new line.
					if (midLine) {
						outputStream.write("\n");
					}
					outputStream.write(chromosomeStrs[chromosomeIndex] + "\n");
					chromosomeIndex = chromosomeIndex + 1;
					numNucleobases = 0;
				}
				outputStream.write(bases[randNumGenerator.nextInt(bases.length)]);
				midLine = true;
				numNucleobases = numNucleobases + 1;
				if (numNucleobases % basesPerLine == 0) {
					outputStream.write("\n");
					midLine = false;
				}
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
			logger.log(Level.SEVERE,ArtificialFastqGenerationUtils.getStackTraceString(ioe));
		}
	}
	
	/**
	 * Build a fresh array of the human reference genome chromosome header strings.
	 * 
	 * @return chromosomeStrs - a new 24-element array holding the header strings for chromosomes 1 to 22 (indices 0 to
	 *  21), X (index 22) and Y (index 23).
	 */
	public static String[] createChromosomeArray() {
		
		return new String[] {
				HRGChromosome1Str, HRGChromosome2Str, HRGChromosome3Str, HRGChromosome4Str,
				HRGChromosome5Str, HRGChromosome6Str, HRGChromosome7Str, HRGChromosome8Str,
				HRGChromosome9Str, HRGChromosome10Str, HRGChromosome11Str, HRGChromosome12Str,
				HRGChromosome13Str, HRGChromosome14Str, HRGChromosome15Str, HRGChromosome16Str,
				HRGChromosome17Str, HRGChromosome18Str, HRGChromosome19Str, HRGChromosome20Str,
				HRGChromosome21Str, HRGChromosome22Str, HRGChromosomeXStr, HRGChromosomeYStr
		};
	}
	
	/**
	 * Get the human reference genome sequence start and end strings for a set of chromosomes.
	 * 
	 * Each returned element has the form "name,start,>", where name is the chromosome as given (with surrounding
	 * whitespace removed) and start is the tail of that chromosome's header line from createChromosomeArray(): the last
	 * 5 characters, or the last 6 for chromosomes 12 and 21, whose headers share the same last 5 characters ("895:1").
	 * 
	 * @param chromosomes - the chromosomes for which we want the start and end strings, as a comma-separated list of
	 *  names: "1" to "22", "X" or "Y". Whitespace around a name is ignored.
	 * @return chromosomeStartEndStrs - the start and end strings, one element per requested chromosome, in input order.
	 * @throws NumberFormatException if a name is neither "X", "Y" nor an integer.
	 * @throws IllegalArgumentException if a numeric name does not correspond to one of the 24 chromosome headers.
	 */
	public static String[] getChromosomeStartEndStrs(String chromosomes) {
		
		//Array indices of the two headers that need a 6-character tail to be told apart.
		final int chromosome12Index = 11;
		final int chromosome21Index = 20;
		
		String[] chromosomeStrs = createChromosomeArray();
		String[] chromosomesArray = chromosomes.split(",");
		String[] chromosomeStartEndStrs = new String[chromosomesArray.length];
		for (int i = 0; i < chromosomesArray.length; i++) {
			//Tolerate lists written with spaces after the commas, e.g. "1, 2, X".
			String chromosome = chromosomesArray[i].trim();
			int index;
			if (chromosome.equals("X")) {
				index = 22;
			} else if (chromosome.equals("Y")) {
				index = 23;
			} else {
				index = Integer.parseInt(chromosome) - 1;
			}
			if (index < 0 || index >= chromosomeStrs.length) {
				throw new IllegalArgumentException("Unknown chromosome: " + chromosome);
			}
			//Decide the tail length from the resolved chromosome rather than from the exact spelling of the input.
			int offset = (index == chromosome12Index || index == chromosome21Index) ? 6 : 5;
			String header = chromosomeStrs[index];
			chromosomeStartEndStrs[i] = chromosome + "," + header.substring(header.length() - offset) + ",>";
		}
		return chromosomeStartEndStrs;
		
	}

	/**
	 * Conduct an analysis of the human reference genome.
	 * 
	 * The input is read one character at a time and newline characters are skipped. Characters for which isNonBase()
	 * returns true (e.g. those of a chromosome header line) are copied to the output unchanged. For the bases in between,
	 * one line of nucleobase proportions (see makeCountProportionsString()) is written per complete region of lineDivisor
	 * bases. When a run of bases ends - at the next non-base character or at the end of the input - the proportions for
	 * the remaining partial region are written, followed by a "Total base count" line for that run. If that partial
	 * region is empty, its proportions are 0/0 and are therefore written as NaN.
	 * 
	 * Both files are closed before this method returns. An IOException is printed with printStackTrace() and not
	 * propagated.
	 * 
	 * @param humanReferenceGenomePath - the path to the human reference genome file. 
	 * @param outputPath - the path for the output.
	 * @param lineDivisor - the region size in # of nucleobases for which to provide summary statistics. Must be positive.
	 * @throws IllegalArgumentException if lineDivisor is not positive.
	 */
	public static void analyzeHumanReferenceGenome(String humanReferenceGenomePath, String outputPath, int lineDivisor) {
		
		// Reject an unusable region size before the output file is created/truncated: 0 would fail with an
		// ArithmeticException on the first modulo operation, and a negative size would yield negative proportions.
		if (lineDivisor <= 0) {
			throw new IllegalArgumentException("lineDivisor must be positive but was " + lineDivisor);
		}
		
		//try-with-resources closes (and thereby flushes) both streams on every path; the reader is opened first so that
		//a missing input file does not truncate the output file.
		try (BufferedReader inputStream = new BufferedReader(new FileReader(humanReferenceGenomePath));
				BufferedWriter outputStream = new BufferedWriter(new FileWriter(outputPath))) {
			
			boolean previousCharNonBase = false;
			//Number of bases seen since the last run of non-base characters.
			int totalBaseCount = 0;
			//Per-base counts for the current region, in the order A, T, C, G, N.
			int[] counts = new int[5];
			String previousCharStr = "\n";
			int currentCharInt;
			
			while ((currentCharInt = inputStream.read()) != -1) {
				
				//Newlines are ignored entirely; they do not become the "previous character" either.
				if (currentCharInt == '\n') {
					continue;
				}
				String currentCharStr = String.valueOf((char) currentCharInt);
				
				//The different non-bases:
				//non A,T,G,C & N.
				//G with preceding :
				//C with preceding R
				if (isNonBase(currentCharStr, previousCharStr)) {
					
					//First non-base character after a run of bases: close off that run.
					if (!previousCharNonBase) {
						outputStream.write(makeCountProportionsString(counts[0], counts[1], counts[2], counts[3], counts[4],
								totalBaseCount % lineDivisor));
						outputStream.write("Total base count: " + totalBaseCount + "\n\n");
						java.util.Arrays.fill(counts, 0);
						totalBaseCount = 0;
						previousCharNonBase = true;
					}
					outputStream.write(currentCharStr);
				} else {//Base characters.
					//First base after a run of non-base characters: end the line they were written on.
					if (previousCharNonBase) {
						outputStream.write("\n");
						previousCharNonBase = false;
					}
					//Update the counts
					if (currentCharStr.equals("A")) {
						counts[0]++;
					} else if (currentCharStr.equals("T")) {
						counts[1]++;
					} else if (currentCharStr.equals("C")) {
						counts[2]++;
					} else if (currentCharStr.equals("G")) {
						counts[3]++;
					} else if (currentCharStr.equals("N")) {
						counts[4]++;
					}
					totalBaseCount = totalBaseCount + 1;
					//A complete region has been read: report it and start counting the next one.
					if (totalBaseCount % lineDivisor == 0) {
						outputStream.write(makeCountProportionsString(counts[0], counts[1], counts[2], counts[3], counts[4],
								lineDivisor));
						java.util.Arrays.fill(counts, 0);
					}
				}
				previousCharStr = currentCharStr;
			}
			//End of input: report whatever remains of the final run.
			outputStream.write(makeCountProportionsString(counts[0], counts[1], counts[2], counts[3], counts[4],
					totalBaseCount % lineDivisor));
			outputStream.write("Total base count: " + totalBaseCount + "\n");
			
		} catch (IOException ioe) {
			ioe.printStackTrace();
		}
		
	}
	
	
	/**
	 * Decide whether a character read from the human reference sequence file should be treated as a non-base.
	 * 
	 * A character is a base only if it is one of A, T, G, C or N, with two exceptions that are treated as non-bases:
	 * a G directly preceded by ':' and a C directly preceded by 'R' (both occur in the header text ":GRCh37").
	 * 
	 * @param currentCharStr - the current human reference sequence file character as a String.
	 * @param previousCharStr - the previous human reference sequence file character as a String.
	 * @return isNonBase - true if the char is not a base, else false.
	 */
	public static boolean isNonBase(String currentCharStr, String previousCharStr) {
		
		//G and C are bases unless they follow the character that precedes them in the header text.
		if (currentCharStr.equals("G")) {
			return previousCharStr.equals(":");
		}
		if (currentCharStr.equals("C")) {
			return previousCharStr.equals("R");
		}
		
		//The remaining base letters are always bases; anything else is a non-base.
		boolean isOtherBaseLetter = currentCharStr.equals("A") || currentCharStr.equals("T") || currentCharStr.equals("N");
		return !isOtherBaseLetter;
		
	}
	
	/**
	 * Make a String containing summary statistics for a region of nucleobases in the human reference sequence file.
	 * 
	 * Each count is divided by divisor and the five quotients are joined with commas and terminated by a newline.
	 * Note that the output order is A, C, G, N, T, which differs from the parameter order. A divisor of 0 is not
	 * rejected: the floating-point division then yields NaN or Infinity in the output.
	 * 
	 * @param ACount - number of A nucleobases in the region.
	 * @param TCount - number of T nucleobases in the region.
	 * @param CCount - number of C nucleobases in the region.
	 * @param GCount - number of G nucleobases in the region.
	 * @param NCount - number of N nucleobases in the region.
	 * @param divisor - size of the region in number of nucleobases.
	 * @return countProportionsString - the count proportions for the different possible nucleobases in a region in the human
	 *  reference sequence file.
	 */
	public static String makeCountProportionsString(double ACount, double TCount, double CCount, double GCount, 
			double NCount, double divisor) {
		
		//Counts arranged in the order in which they are reported: A, C, G, N, T.
		double[] countsInOutputOrder = {ACount, CCount, GCount, NCount, TCount};
		StringBuilder countProportionsString = new StringBuilder();
		for (int i = 0; i < countsInOutputOrder.length; i++) {
			if (i > 0) {
				countProportionsString.append(",");
			}
			countProportionsString.append(countsInOutputOrder[i] / divisor);
		}
		countProportionsString.append("\n");
		return countProportionsString.toString();
		
	}
	
	
}