File: ArtificialFastqGenerationUtils.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 4,916 kB
  • sloc: java: 1,481; sh: 39; makefile: 11
file content (214 lines) | stat: -rw-r--r-- 7,738 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
package artificialFastqGenerator;

import java.util.Random;

/**
 * ArtificialFastqGenerationUtils consists exclusively of static utility methods that are used in
 * artificial fastq generation.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 * 
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or (at your
 * option) any later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 * 
 * You should have received a copy of the GNU Public License along with this
 * program. If not, see <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class ArtificialFastqGenerationUtils {

	public static Random randNumGenerator = new Random();
	
	/**
	 * Check if a genotype (represented as a char) is A/C/G/T.
	 * 
	 * @param genotype - the genotype whose type we are checking.
	 * @return isACGT - true if the genotype is A/C/G/T, else false.
	 */
	
	public static boolean isACGT(char genotype) {
		
		boolean isACGT = false;
		if (genotype == 'A' || genotype == 'T' || isGC(genotype)) {
			isACGT = true;
		}
		return isACGT;
	}
	
	/**
	 * Check if a genotype (represented as a char) is G/C.
	 * 
	 * @param genotype - the genotype whose type we are checking.
	 * @return isGC - true if the genotype is G/C, else false.
	 */
	
	public static boolean isGC(char genotype) {
		
		boolean isGC = false;
		if (genotype == 'C' || genotype == 'G') {
			isGC = true;
		}
		return isGC;
		
	}
	
	/**
	 * Return a char array containing the alternate genotypes.
	 * 
	 * @param genotype - genotype for which we want the alternatives.
	 * @return alternateGenotypes - the alternative genotypes.
	 */
	
	public static char[] getAlternateGenotypes(char genotype) {
		
		char[] alternateGenotypes = new char[3];
		int i = 0;
		if (genotype != 'A') {
			alternateGenotypes[i] = 'A';
			i++;}
		if (genotype != 'C') {
			alternateGenotypes[i] = 'C';
			i++;}
		if (genotype != 'G') {
			alternateGenotypes[i] = 'G';
			i++;}
		if (genotype != 'T') {
			alternateGenotypes[i] = 'T';
		}
		return alternateGenotypes;
		
	}
	
	/**
	 * Generates a pseudo-random number from a normal distribution, but with a minimum and maximum value.
	 * 
	 * @param mean - the population mean.
	 * @param standardDeviation - the population standard deviation.
	 * @param minimum - the minimum possible value.
	 * @param maximum - the maximum possible value.
	 * @return randomIntWithinRange - the integer from the bounded normal distribution.
	 */
	
	public static int getRandomIntWithinRange(int mean, double standardDeviation, int minimum, int maximum) {
		
		int randomIntWithinRange = (int) Math.round(randNumGenerator.nextGaussian()*standardDeviation+mean);
		if (randomIntWithinRange > maximum) {
			randomIntWithinRange = maximum;
		} else if (randomIntWithinRange < minimum) {
			randomIntWithinRange = minimum;
		}
		
		return randomIntWithinRange;
		//DEBUGGING
		//return mean;
		
	}
	
	/**
	 * This method calculates a region's target coverage from its GC content. It uses a normal distribution to model target
	 * coverage levels for a given GC content. Hence it first calculates the mean and standard deviation of this
	 * distribution, and then samples from it. It calculates the mean via a Gaussian function f(x) = ae^-((x-b)^2/2c^2) where
	 * a is the height of the curve's peak (coverage mean peak), b is the position of the centre of the peak (coverage mean
	 * peak GC content), and c controls the width of the bell (coverage mean GC content spread). It calculates the standard
	 * deviation as a multiple of the mean.  
	 * 
	 * @param GCcontent
	 * @param coverageMeanPeak
	 * @param coverageMeanPeakGCcontent
	 * @param coverageMeanGCcontentSpread
	 * @param coverageSDdividedByMean
	 * @return targetCoverage
	 */
	
	public static int getTargetCoverageViaGaussFuncOfGCcont(double GCcontent, double coverageMeanPeak, 
			double coverageMeanPeakGCcontent, double coverageMeanGCcontentSpread, double coverageSDdividedByMean) {
		
		double coverageMean = coverageMeanPeak * Math.exp(-1.0*Math.pow(GCcontent-coverageMeanPeakGCcontent,2.0)/(2.0*Math.pow(coverageMeanGCcontentSpread,2.0)));
		double coverageSD = coverageMean * coverageSDdividedByMean;
		int targetCoverage = (int) Math.round(randNumGenerator.nextGaussian()*coverageSD+coverageMean);
		//System.out.println("GCcontent: " + GCcontent + " coverageMean: " + coverageMean + " coverageSD: " + coverageSD + " targetCoverage: " + targetCoverage);
		return targetCoverage;
	}
	
	/**
	 * Generates a random double between 0 and 1.
	 * 
	 * @return randomNumberBtwn0And1 - a pseudo-random double between 0 and 1.
	 */
	
	public static double generateRandomDoubleBetween0And1() {
		
		//Generate a random number from 1 to 10000.
		int maxInt = 10000;
		double randomNumberBtwn0And1 = (double) randNumGenerator.nextInt(maxInt); 
		randomNumberBtwn0And1 = randomNumberBtwn0And1/((double) maxInt);
		return randomNumberBtwn0And1;
		
	}
	
	/**
	 * Generate a stack trace string for an exception object.
	 * 
	 * @param e - the exception object.
	 * @return stackTraceString - the stack trace string for e.
	 */
	
	public static String getStackTraceString(Exception e) {
		String stackTraceString = e.getClass().getName() + ": " + e.getMessage() + "\n";
		StackTraceElement[] stackTraceElements = e.getStackTrace();
		for (int i=0; i<stackTraceElements.length; i++) {
			stackTraceString = stackTraceString + "\tat " + stackTraceElements[i] + "\n";
		}
		return stackTraceString;
	}
	
	/**
	 *
	 * Set the sequence field of Read and BabyRead objects to null and then also set the object to null. This is for helping
	 * Java to garbage collect redundant Read and BabyRead objects.
	 * 
	 * @param redundantRead
	 */
	
	public static <T extends AbstractRead> void setRedundantReadToNull(T redundantRead) {
		redundantRead.setSequence(null);
		redundantRead = null;
	}
	
	@Deprecated
	/**
	 * Coverage calculated as function of GCContent based on numbers in "Enrichment of sequencing of targets from the human
	 * genome by solution hybridization" by Tewhey et al: coverage peaks at about 1.3 when GC content is 0.45, and is about
	 * 0.1 when GC content is 0. Hence we want to calculate mean coverage based on distance from 0.45. Define 2 simultaneous
	 * equations: 0A + B = 1.3 & 0.45A + B = 0.1, and solve to give A = -8/3 and B = 1.3 . Hence meanCoverage = -8/3 * 
	 * |.45-GCContent| + 1.3, then plug this into nextGaussian to get the coverage for this region. The standard deviation
	 * is calculated as a multiple of the mean.
	 * 
	 * @param GcContent - the GC content for this capture probe region.
	 * @return targetCoverage - the target coverage for nucleobases in the capture probe region.
	 */
	
	public static int calculateTargetCoverageFromGCcontent(double GCcontent) {
	
		double coverageMean = -8.0/3.0 * Math.abs(.45-GCcontent) + 1.3;
		double coverageSD = coverageMean * Main.coverageSD;
		int targetCoverage = (int) Math.round(randNumGenerator.nextGaussian()*coverageSD+coverageMean);
		
		return targetCoverage;
		//DEBUGGING
		//return Main.targetAverageCoverage;
		
	}
	
}