File: CoverageStatisticsCalculator.java

package info (click to toggle)
artfastqgenerator 0.0.20150519-1~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 4,916 kB
  • sloc: java: 1,481; sh: 39; makefile: 11
file content (197 lines) | stat: -rw-r--r-- 6,942 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
package artificialFastqGenerator;

import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Instances of this class calculate regional and overall coverage statistics for nucleobases.
 * 
 * Copyright (C) 2012 The Institute of Cancer Research (ICR).
 *
 * This file is part of ArtificialFastqGenerator v1.0.0.
 * 
 * ArtificialFastqGenerator is free software: you can redistribute it and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Public License along with this program. If not, see
 * <http://www.gnu.org/licenses/>
 * 
 * Authour's contact email: Matthew.Frampton@icr.ac.uk
 */

public class CoverageStatisticsCalculator {

	//Fields for summary stats
	private double regionTotalCov, regionACGTTotalCov, overallTotalCov, overallACGTTotalCov;
	private long regionNumBasesProcessed;
	private int regionMinCov, regionMaxCov;
	private int regionNumACGTsProcessed, regionACGTMinCov, regionACGTMaxCov;
	private long overallNumBasesProcessed;
	private int overallMinCov, overallMaxCov;
	private int overallNumACGTsProcessed, overallACGTMinCov, overallACGTMaxCov;
	private static final int multipleOfMeanForInitMinCov = 1000000;
	private boolean overallStatsUpdatedFromRegionStatsAtLeastOnce;
	private Logger logger;
	
	/**
	 * Initialise a new CoverageStatisticsCalculator object.
	 */
	
	public CoverageStatisticsCalculator() {
		
		regionTotalCov = 0.0;
		regionACGTTotalCov = 0.0;
		overallTotalCov = 0.0;
		overallACGTTotalCov = 0.0;
		
		regionNumBasesProcessed = 0;
		regionMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		regionMaxCov = 0;
		
		regionNumACGTsProcessed = 0;
		regionACGTMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		regionACGTMaxCov = 0;
		
		overallNumBasesProcessed = 0;
		overallMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		overallMaxCov = 0;
		
		overallNumACGTsProcessed = 0;
		overallACGTMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		overallACGTMaxCov = 0;	
		
		overallStatsUpdatedFromRegionStatsAtLeastOnce = false;
		
		logger = Main.logger;
		
	}
	
	/**
	 * Log the coverage statistics header.
	 */
	
	public void logCoverageStatsHeader() {

		logger.log(Level.INFO,"ALL_SO_FAR_NUM,ALL_AV_COV,ALL_MIN_COV,ALL_MAX_COV,ACGT_NUM,ACGT_AV_COV,ACGT_MIN_COV," +
				"ACGT_MAX_COV");
		
	}
	
	/**
	 * Update the region coverage statistics from the coverage statistics for 1
	 * nucleobase in that region.
	 * 
	 * @param nucleobase - nucleobase from which to update region stats.
	 */
	
	public void updateRegionStatsFrom1Nucleobase(Nucleobase nucleobase) {
		
		//Stats for all bases i.e. including Ns:
		int numReads = nucleobase.getCoverage();
		if (numReads < regionMinCov) {regionMinCov = numReads;}
		else if (numReads > regionMaxCov) {regionMaxCov = numReads;}
		regionTotalCov = regionTotalCov + numReads;
		regionNumBasesProcessed = regionNumBasesProcessed + 1;
		//Stats for A/C/G/Ts:
		if (ArtificialFastqGenerationUtils.isACGT(nucleobase.getGenotype())) {
			if (numReads < regionACGTMinCov) {regionACGTMinCov = numReads;}
			if (numReads > regionACGTMaxCov) {regionACGTMaxCov = numReads;}
			//else if (numReads > regionACGTMaxCov) {regionACGTMaxCov = numReads;}
			regionACGTTotalCov = regionACGTTotalCov + numReads;
			regionNumACGTsProcessed = regionNumACGTsProcessed + 1;
		}
		
	}
	
	/**
	 * Update the overall coverage statistics from region coverage statistics.
	 */
	
	private void updateOverallStatsFromRegionStats() {
		
		overallStatsUpdatedFromRegionStatsAtLeastOnce = true;
		//Update overall stats for all bases i.e. including Ns:
		overallNumBasesProcessed = overallNumBasesProcessed + regionNumBasesProcessed;
		overallTotalCov = overallTotalCov + regionTotalCov;
		if (regionMinCov < overallMinCov) {overallMinCov = regionMinCov;}
		if (regionMaxCov > overallMaxCov) {overallMaxCov = regionMaxCov;}
		//Update overall stats for A/C/G/Ts:
		overallNumACGTsProcessed = overallNumACGTsProcessed + regionNumACGTsProcessed;
		overallACGTTotalCov = overallACGTTotalCov + regionACGTTotalCov;
		if (regionACGTMinCov < overallACGTMinCov) {overallACGTMinCov = regionACGTMinCov;}
		if (regionACGTMaxCov > overallACGTMaxCov) {overallACGTMaxCov = regionACGTMaxCov;}
		
	}

	/**
	 * Log the region coverage statistics.
	 */
	
	public void logRegionStats() {
	
		updateOverallStatsFromRegionStats();
		
		double regionAverageCoverage = regionTotalCov/regionNumBasesProcessed;
		double regionACGTAverageCoverage = regionACGTTotalCov/regionNumACGTsProcessed;
		if (regionMaxCov == 0) {
			regionMinCov = 0;
			regionAverageCoverage = 0;}
		if (regionACGTMaxCov == 0) {
			regionACGTMinCov = 0;
			regionACGTAverageCoverage = 0;}
			
		logger.log(Level.INFO, overallNumBasesProcessed + "," + regionAverageCoverage + "," + regionMinCov + "," + 
		regionMaxCov + "," + regionNumACGTsProcessed + "," + regionACGTAverageCoverage + "," + regionACGTMinCov + "," +
				regionACGTMaxCov);
		resetRegionStats();
		
	}
	
	/**
	 * Reset the region coverage statistics.
	 */
	
	private void resetRegionStats() {
		
		regionTotalCov = 0.0;
		regionNumBasesProcessed = 0;
		regionMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		regionMaxCov = 0;
		regionACGTTotalCov = 0.0;
		regionNumACGTsProcessed = 0;
		regionACGTMinCov = (int) Main.coverageMeanPeak*multipleOfMeanForInitMinCov;
		regionACGTMaxCov = 0;
		
	}
	
	/**
	 * Log the overall coverage and error statistics.
	 */
	
	
	public void logOverallStats() {
		
		if (overallStatsUpdatedFromRegionStatsAtLeastOnce == false) {
			updateOverallStatsFromRegionStats();
		}
		
		logger.log(Level.INFO,"");
		logger.log(Level.INFO,"OVERALL STATISTICS:");
		logger.log(Level.INFO,"# nucleobases processed=" + overallNumBasesProcessed);
		logger.log(Level.INFO,"Average coverage=" + overallTotalCov/overallNumBasesProcessed);
		logger.log(Level.INFO,"Minimum coverage=" + overallMinCov);
		logger.log(Level.INFO,"Maximum coverage=" + overallMaxCov);
		logger.log(Level.INFO,"# ACGTs processed=" + overallNumACGTsProcessed);
		if (overallNumACGTsProcessed != 0.0) {
			logger.log(Level.INFO,"ACGT average coverage=" + overallACGTTotalCov/overallNumACGTsProcessed);
			logger.log(Level.INFO,"ACGT minimum coverage=" + overallACGTMinCov);
			logger.log(Level.INFO,"ACGT maximum coverage=" + overallACGTMaxCov);
		}
		
	}
	
}