File: kmc.cpp

package info (click to toggle)
kmc 3.2.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,716 kB
  • sloc: cpp: 38,308; python: 664; makefile: 216; perl: 179; sh: 34
file content (459 lines) | stat: -rw-r--r-- 16,297 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
#define _CRT_SECURE_NO_WARNINGS
#include "../kmc_core/kmc_runner.h"
#include <cstring>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <iomanip>
using namespace std;

struct CLIParams
{
	std::string jsonSummaryFileName;
	std::string estimatedHistogramFileName;
};

struct Params
{
	KMC::Stage1Params stage1Params;
	KMC::Stage2Params stage2Params;
	CLIParams cliParams;
};
//----------------------------------------------------------------------------------
// Show execution options of the software
void usage()
{
	cout << "K-Mer Counter (KMC) ver. " << KMC::CfgConsts::kmc_ver << " (" << KMC::CfgConsts::kmc_date << ")\n"
		<< "Usage:\n kmc [options] <input_file_name> <output_file_name> <working_directory>\n"
		<< " kmc [options] <@input_file_names> <output_file_name> <working_directory>\n"
		<< "Parameters:\n"
		<< "  input_file_name - single file in specified (-f switch) format (gziped or not)\n"
		<< "  @input_file_names - file name with list of input files in specified (-f switch) format (gziped or not)\n"
		<< "Options:\n"
		<< "  -v - verbose mode (shows all parameter settings); default: false\n"
		<< "  -k<len> - k-mer length (k from " << KMC::CfgConsts::min_k<< " to " << KMC::CfgConsts::max_k << "; default: 25)\n"
		<< "  -m<size> - max amount of RAM in GB (from 1 to 1024); default: 12\n"
		<< "  -sm - use strict memory mode (memory limit from -m<n> switch will not be exceeded)\n"
		<< "  -hc - count homopolymer compressed k-mers (approximate and experimental)\n"
		<< "  -p<par> - signature length (5, 6, 7, 8, 9, 10, 11); default: 9\n"
		<< "  -f<a/q/m/bam/kmc> - input in FASTA format (-fa), FASTQ format (-fq), multi FASTA (-fm) or BAM (-fbam) or KMC(-fkmc); default: FASTQ\n"
		<< "  -ci<value> - exclude k-mers occurring less than <value> times (default: 2)\n"
		<< "  -cs<value> - maximal value of a counter (default: 255)\n"
		<< "  -cx<value> - exclude k-mers occurring more of than <value> times (default: 1e9)\n"
		<< "  -b - turn off transformation of k-mers into canonical form\n"
		<< "  -r - turn on RAM-only mode \n"
		<< "  -n<value> - number of bins \n"
		<< "  -t<value> - total number of threads (default: no. of CPU cores)\n"
		<< "  -sf<value> - number of FASTQ reading threads\n"
		<< "  -sp<value> - number of splitting threads\n"
		<< "  -sr<value> - number of threads for 2nd stage\n"
		<< "  -j<file_name> - file name with execution summary in JSON format\n"
		<< "  -w - without output\n"
		<< "  -o<kmc/kff> - output in KMC of KFF format; default: KMC\n"
		<< "  -hp - hide percentage progress (default: false)\n"
		<< "  -e - only estimate histogram of k-mers occurrences instead of exact k-mer counting\n"
		<< "  --opt-out-size - optimize output database size (may increase running time)\n"
		<< "Example:\n"
		<< "kmc -k27 -m24 NA19238.fastq NA.res /data/kmc_tmp_dir/\n"
		<< "kmc -k27 -m24 @files.lst NA.res /data/kmc_tmp_dir/\n";
}

//----------------------------------------------------------------------------------
// Check if --help or --version was used
bool help_or_version(int argc, char** argv)
{
	const string version = "--version";
	const string help = "--help";
	for (int i = 1; i < argc; ++i)
	{
		if (argv[i] == version || argv[i] == help)
			return true;
	}
	return false;
}

bool CanCreateFile(const string& path)
{
	FILE* f = fopen(path.c_str(), "wb");
	if (!f)
		return false;
	fclose(f);
	remove(path.c_str());
	return true;
}

bool CanCreateFileInPath(const string& path)
{
	static const string name = "kmc_test.bin"; //Some random name
	if (path.back() == '\\' || path.back() == '/')
		return CanCreateFile(path + name);
	else
		return CanCreateFile(path + '/' + name);
}
//----------------------------------------------------------------------------------
// Parse the parameters
bool parse_parameters(int argc, char* argv[], Params& params)
{
	KMC::Stage1Params& stage1Params = params.stage1Params;
	KMC::Stage2Params& stage2Params = params.stage2Params;
	CLIParams& cliParams = params.cliParams;
	int i;
	int tmp;

	bool was_sm = false;
	bool was_r = false;	

	bool was_e = false;
	bool was_opt_out_size = false;
	if (argc < 4)
		return false;

	for (i = 1; i < argc; ++i)
	{
		if (argv[i][0] != '-')
			break;
		// Number of threads
		if (strncmp(argv[i], "-t", 2) == 0)
		{
			auto nThreads = atoi(&argv[i][2]);
			stage1Params.SetNThreads(nThreads); //TODO: what with stage2 in this case?
			stage2Params.SetNThreads(nThreads);
		}
		// k-mer length
		else if (strncmp(argv[i], "-k", 2) == 0)
			stage1Params.SetKmerLen(atoi(&argv[i][2]));
		// Memory limit
		else if (strncmp(argv[i], "-m", 2) == 0)
		{
			tmp = atoi(&argv[i][2]);
			stage1Params.SetMaxRamGB(tmp);
			stage2Params.SetMaxRamGB(tmp);
		}
		// Minimum counter threshold
		else if (strncmp(argv[i], "-ci", 3) == 0)
			stage2Params.SetCutoffMin(atoi(&argv[i][3]));
		// Maximum counter threshold
		else if (strncmp(argv[i], "-cx", 3) == 0)
			stage2Params.SetCutoffMax(atoll(&argv[i][3]));
		// Maximal counter value
		else if (strncmp(argv[i], "-cs", 3) == 0)
			stage2Params.SetCounterMax(atoll(&argv[i][3]));
		// Set p1
		else if (strncmp(argv[i], "-p", 2) == 0)
			stage1Params.SetSignatureLen(atoi(&argv[i][2]));
		//output type
		else if (strncmp(argv[i], "-o", 2) == 0)
		{
			if (strncmp(argv[i] + 2, "kff", 3) == 0)
				stage2Params.SetOutputFileType(KMC::OutputFileType::KFF);
			else if (strncmp(argv[i] + 2, "kmc", 3) == 0)
				stage2Params.SetOutputFileType(KMC::OutputFileType::KMC);
			else
			{
				std::cerr << "Error: unsupported output type: " << argv[i] << " (use -okff or -okmc)\n";
				exit(1);
			}
		}
		// FASTA input files
		else if (strncmp(argv[i], "-fa", 3) == 0)
			stage1Params.SetInputFileType(KMC::InputFileType::FASTA);
		// FASTQ input files
		else if (strncmp(argv[i], "-fq", 3) == 0)
			stage1Params.SetInputFileType(KMC::InputFileType::FASTQ);
		else if (strncmp(argv[i], "-fm", 3) == 0)
			stage1Params.SetInputFileType(KMC::InputFileType::MULTILINE_FASTA);
		else if (strncmp(argv[i], "-fbam", 5) == 0)
			stage1Params.SetInputFileType(KMC::InputFileType::BAM);
		else if (strncmp(argv[i], "-fkmc", 5) == 0)
			stage1Params.SetInputFileType(KMC::InputFileType::KMC);
#ifdef DEVELOP_MODE
		else if (strncmp(argv[i], "-vl", 3) == 0)
			stage1Params.SetDevelopVerbose(true);
#endif
		else if (strncmp(argv[i], "-v", 2) == 0)
		{
			static KMC::CerrVerboseLogger logger;
			stage1Params.SetVerboseLogger(&logger);			
		}
		else if (strncmp(argv[i], "-sm", 3) == 0 && strlen(argv[i]) == 3)
		{
			was_sm = true;
			stage2Params.SetStrictMemoryMode(true);
		}
		else if (strncmp(argv[i], "-hp", 3) == 0 && strlen(argv[i]) == 3)
		{
			static KMC::NullPercentProgressObserver nullPercentProgressObserver;
			static KMC::NullProgressObserver nullProgressObserver;
			stage1Params.SetPercentProgressObserver(&nullPercentProgressObserver);
			stage1Params.SetProgressObserver(&nullProgressObserver);
		}
		else if (strncmp(argv[i], "-hc", 3) == 0 && strlen(argv[i]) == 3)
			stage1Params.SetHomopolymerCompressed(true);
		else if (strncmp(argv[i], "-r", 2) == 0)
		{
			stage1Params.SetRamOnlyMode(true);
			was_r = true;
		}
		else if (strncmp(argv[i], "-b", 2) == 0)
			stage1Params.SetCanonicalKmers(false);
		// Number of reading threads
		else if (strncmp(argv[i], "-sf", 3) == 0)
			stage1Params.SetNReaders(atoi(&argv[i][3]));
		// Number of splitting threads
		else if (strncmp(argv[i], "-sp", 3) == 0)
			stage1Params.SetNSplitters(atoi(&argv[i][3]));
		// Number of internal threads per 2nd stage
		else if (strncmp(argv[i], "-sr", 3) == 0)
			stage2Params.SetNThreads(atoi(&argv[i][3]));
		else if (strncmp(argv[i], "-n", 2) == 0)
			stage1Params.SetNBins(atoi(&argv[i][2]));
		else if (strncmp(argv[i], "-j", 2) == 0)
		{
			cliParams.jsonSummaryFileName = &argv[i][2];
			if (cliParams.jsonSummaryFileName == "")
				cerr << "Warning: file name for json summary file missed (-j switch)\n";
		}
		else if (strncmp(argv[i], "-e", 2) == 0)
		{
			was_e = true;
			stage1Params.SetEstimateHistogramCfg(KMC::EstimateHistogramCfg::ONLY_ESTIMATE);
		}
		else if (strcmp(argv[i], "--opt-out-size") == 0)
		{
			was_opt_out_size = true;
			if (stage1Params.GetEstimateHistogramCfg() != KMC::EstimateHistogramCfg::ONLY_ESTIMATE) //ONLY_ESTIMATE has priority over estimate and count
				stage1Params.SetEstimateHistogramCfg(KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS);
		}
		else if (strncmp(argv[i], "-w", 2) == 0)
			stage2Params.SetWithoutOutput(true);			

		if (strncmp(argv[i], "-smso", 5) == 0)
			stage2Params.SetStrictMemoryNSortingThreadsPerSorters(atoi(&argv[i][5]));		
		
		if (strncmp(argv[i], "-smun", 5) == 0)		
			stage2Params.SetStrictMemoryNUncompactors(atoi(&argv[i][5]));		
		if (strncmp(argv[i], "-smme", 5) == 0)		
			stage2Params.SetStrictMemoryNMergers(atoi(&argv[i][5]));		
	}

	if (argc - i < 3)
		return false;

	string input_file_name = string(argv[i++]);
	if (stage1Params.GetEstimateHistogramCfg() == KMC::EstimateHistogramCfg::ONLY_ESTIMATE)
		cliParams.estimatedHistogramFileName = argv[i++];
	else
		stage2Params.SetOutputFileName(argv[i++]);
	stage1Params.SetTmpPath(argv[i++]);

	std::vector<std::string> input_file_names;	
	if (input_file_name[0] != '@')
		input_file_names.push_back(input_file_name);
	else
	{
		ifstream in(input_file_name.c_str() + 1);
		if (!in.good())
		{
			cerr << "Error: No " << input_file_name.c_str() + 1 << " file\n";
			return false;
		}

		string s;
		while (getline(in, s))
			if (s != "")
				input_file_names.push_back(s);

		in.close();
		std::random_shuffle(input_file_names.begin(), input_file_names.end());
	}
	stage1Params.SetInputFiles(input_file_names);

	//Validate and resolve conflicts in parameters
	if (was_e && was_opt_out_size)
	{
		std::cerr << "Warning: --opt-out-size is ignored because -e was used\n";
	}

	if (was_sm && was_r)
	{
		cerr << "Error: -sm can not be used with -r\n";
		return false;
	}
	
	//Check if output files may be created and if it is possible to create file in specified tmp location
	if (!stage2Params.GetWithoutOutput())
	{		
		string pre_file_name = stage2Params.GetOutputFileName() + ".kmc_pre";
		string suff_file_name = stage2Params.GetOutputFileName() + ".kmc_suf";
		if (!CanCreateFile(pre_file_name))
		{
			cerr << "Error: Cannot create file: " << pre_file_name << "\n";
			return false;
		}
		if (!CanCreateFile(suff_file_name))
		{
			cerr << "Error: Cannot create file: " << suff_file_name << "\n";
			return false;
		}
	}
	if (!CanCreateFileInPath(stage1Params.GetTmpPath()))
	{
		cerr << "Error: Cannot create file in specified working directory: " << stage1Params.GetTmpPath() << "\n";
		return false;
	}
	return true;
}

void save_estimated_histogram(const std::string& fileName, const std::vector<uint64_t>& estimatedHistogram)
{
	if (fileName == "")
		return;

	std::ofstream out(fileName);
	if (!out)
	{
		std::cerr << "Warning: Cannot open file " << fileName << " to store estimated histogram";
		return;
	}
	for (uint32_t i = 1; i < estimatedHistogram.size(); ++i)
		out << i << "\t" << estimatedHistogram[i] << "\n";
}

void save_stats_in_json_file(const Params& params, const KMC::Stage1Results& stage1Results,	const KMC::Stage2Results& stage2Results)
{
	if (params.cliParams.jsonSummaryFileName == "")
		return;

	ofstream stats(params.cliParams.jsonSummaryFileName);

	if (!stats)
	{		
		std::cerr << "Warning: Cannot open file " << params.cliParams.jsonSummaryFileName << " to store summary of execution in JSON format";		
		return;
	}

	double time1 = stage1Results.time;
	double time2 = stage2Results.time;
	double time3 = stage2Results.timeStrictMem;

	bool display_strict_mem_stats = params.stage2Params.GetStrictMemoryMode() && stage1Results.wasSmallKOptUsed;

	stats << "{\n"
		<< "\t\"1st_stage\": \"" << time1 << "s\",\n"
		<< "\t\"2nd_stage\": \"" << time2 << "s\",\n";
	if (display_strict_mem_stats)
	{
		stats << "\t\"3rd_stage\": \"" << time3 << "s\",\n";
		stats << "\t\"Total\": \"" << (time1 + time2 + time3) << "s\",\n";
	}

	else
		stats << "\t\"Total\": \"" << (time1 + time2) << "s\",\n";


	if (display_strict_mem_stats)
	{
		stats << "\t\"Tmp_size\": \"" << stage1Results.tmpSize / 1000000 << "MB\",\n"
			<< "\t\"Tmp_size_strict_memory\": \"" << stage2Results.tmpSizeStrictMemory / 1000000 << "MB\",\n"
			<< "\t\"Tmp_total\": \"" << stage2Results.maxDiskUsage / 1000000 << "MB\",\n";
	}
	else
		stats << "\t\"Tmp_size\": \"" << stage1Results.tmpSize / 1000000 << "MB\",\n";

	stats << "\t\"Stats\": {\n";

	stats << "\t\t\"#k-mers_below_min_threshold\": " << stage2Results.nBelowCutoffMin << ",\n"
		<< "\t\t\"#k-mers_above_max_threshold\": " << stage2Results.nAboveCutoffMax << ",\n"
		<< "\t\t\"#Unique_k-mers\": " << stage2Results.nUniqueKmers << ",\n"
		<< "\t\t\"#Unique_counted_k-mers\": " << stage2Results.nUniqueKmers - stage2Results.nBelowCutoffMin - stage2Results.nAboveCutoffMax << ",\n"
		<< "\t\t\"#Total no. of k-mers\": " << stage2Results.nTotalKmers << ",\n";
	if(params.stage1Params.GetInputFileType() != KMC::InputFileType::MULTILINE_FASTA)	
		stats << "\t\t\"#Total_reads\": " << stage1Results.nSeqences << ",\n";
	else
		stats << "\t\t\"#Total_sequences\": " << stage1Results.nSeqences << ",\n";
	stats << "\t\t\"#Total_super-k-mers\": " << stage1Results.nTotalSuperKmers << "\n";

	stats << "\t}\n";
	stats << "}\n";
	stats.close();
}

void print_summary(
	const Params& params,	
	const KMC::Stage1Results& stage1Results,
	const KMC::Stage2Results& stage2Results)
{
	const KMC::Stage1Params& stage1Params = params.stage1Params;
	const KMC::Stage2Params& stage2Params = params.stage2Params;

	cout << "1st stage: " << stage1Results.time << "s\n"
		<< "2nd stage: " << stage2Results.time << "s\n";

	bool display_strict_mem_stats = stage2Params.GetStrictMemoryMode() && !stage1Results.wasSmallKOptUsed;
	if (display_strict_mem_stats)
	{
		cout << "3rd stage: " << stage2Results.timeStrictMem << "s\n";
		cout << "Total    : " << (stage1Results.time + stage2Results.time + stage2Results.timeStrictMem) << "s\n";
	}
	else
		cout << "Total    : " << (stage1Results.time + stage2Results.time) << "s\n";
	if (display_strict_mem_stats)
	{
		cout << "Tmp size : " << stage1Results.tmpSize / 1000000 << "MB\n"
			<< "Tmp size strict memory : " << stage2Results.tmpSizeStrictMemory / 1000000 << "MB\n"
			<< "Tmp total: " << stage2Results.maxDiskUsage / 1000000 << "MB\n";
	}
	else
		cout << "Tmp size : " << stage1Results.tmpSize / 1000000 << "MB\n";
	cout << "\nStats:\n"
		<< "   No. of k-mers below min. threshold : " << setw(12) << stage2Results.nBelowCutoffMin << "\n"
		<< "   No. of k-mers above max. threshold : " << setw(12) << stage2Results.nAboveCutoffMax << "\n"
		<< "   No. of unique k-mers               : " << setw(12) << stage2Results.nUniqueKmers << "\n"
		<< "   No. of unique counted k-mers       : " << setw(12) << stage2Results.nUniqueKmers - stage2Results.nBelowCutoffMin - stage2Results.nAboveCutoffMax << "\n"
		<< "   Total no. of k-mers                : " << setw(12) << stage2Results.nTotalKmers << "\n";
	if (stage1Params.GetInputFileType() != KMC::InputFileType::MULTILINE_FASTA)
		cout << "   Total no. of reads                 : " << setw(12) << stage1Results.nSeqences << "\n";
	else
		cout << "   Total no. of sequences             : " << setw(12) << stage1Results.nSeqences << "\n";
	cout << "   Total no. of super-k-mers          : " << setw(12) << stage1Results.nTotalSuperKmers << "\n";
}

//----------------------------------------------------------------------------------
// Main function
int main(int argc, char** argv)
{
	if (argc == 1 || help_or_version(argc, argv))
	{
		usage();
		return 0;
	}	
	try
	{
		Params params;		
		if (!parse_parameters(argc, argv, params))
		{
			usage();
			return 0;
		}

		KMC::Runner runner;
		auto stage1Results = runner.RunStage1(params.stage1Params);
		save_estimated_histogram(params.cliParams.estimatedHistogramFileName, stage1Results.estimatedHistogram);
		auto stage2Results = runner.RunStage2(params.stage2Params);
		print_summary(params, stage1Results, stage2Results);
		save_stats_in_json_file(params, stage1Results, stage2Results);
	}
	catch (const std::exception& err)
	{
		std::cerr << "Error: " << err.what() << "\n";
		return 1;
	}
	catch (...)
	{
		std::cerr << "Error: unknown exception\n";
		return 1;
	}
	
	return 0;

}