1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
|
#define _CRT_SECURE_NO_WARNINGS
#include "../kmc_core/kmc_runner.h"
#include <cstring>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <iomanip>
using namespace std;
struct CLIParams
{
std::string jsonSummaryFileName;
std::string estimatedHistogramFileName;
};
struct Params
{
KMC::Stage1Params stage1Params;
KMC::Stage2Params stage2Params;
CLIParams cliParams;
};
//----------------------------------------------------------------------------------
// Show execution options of the software
void usage()
{
cout << "K-Mer Counter (KMC) ver. " << KMC::CfgConsts::kmc_ver << " (" << KMC::CfgConsts::kmc_date << ")\n"
<< "Usage:\n kmc [options] <input_file_name> <output_file_name> <working_directory>\n"
<< " kmc [options] <@input_file_names> <output_file_name> <working_directory>\n"
<< "Parameters:\n"
<< " input_file_name - single file in specified (-f switch) format (gziped or not)\n"
<< " @input_file_names - file name with list of input files in specified (-f switch) format (gziped or not)\n"
<< "Options:\n"
<< " -v - verbose mode (shows all parameter settings); default: false\n"
<< " -k<len> - k-mer length (k from " << KMC::CfgConsts::min_k<< " to " << KMC::CfgConsts::max_k << "; default: 25)\n"
<< " -m<size> - max amount of RAM in GB (from 1 to 1024); default: 12\n"
<< " -sm - use strict memory mode (memory limit from -m<n> switch will not be exceeded)\n"
<< " -hc - count homopolymer compressed k-mers (approximate and experimental)\n"
<< " -p<par> - signature length (5, 6, 7, 8, 9, 10, 11); default: 9\n"
<< " -f<a/q/m/bam/kmc> - input in FASTA format (-fa), FASTQ format (-fq), multi FASTA (-fm) or BAM (-fbam) or KMC(-fkmc); default: FASTQ\n"
<< " -ci<value> - exclude k-mers occurring less than <value> times (default: 2)\n"
<< " -cs<value> - maximal value of a counter (default: 255)\n"
<< " -cx<value> - exclude k-mers occurring more of than <value> times (default: 1e9)\n"
<< " -b - turn off transformation of k-mers into canonical form\n"
<< " -r - turn on RAM-only mode \n"
<< " -n<value> - number of bins \n"
<< " -t<value> - total number of threads (default: no. of CPU cores)\n"
<< " -sf<value> - number of FASTQ reading threads\n"
<< " -sp<value> - number of splitting threads\n"
<< " -sr<value> - number of threads for 2nd stage\n"
<< " -j<file_name> - file name with execution summary in JSON format\n"
<< " -w - without output\n"
<< " -o<kmc/kff> - output in KMC of KFF format; default: KMC\n"
<< " -hp - hide percentage progress (default: false)\n"
<< " -e - only estimate histogram of k-mers occurrences instead of exact k-mer counting\n"
<< " --opt-out-size - optimize output database size (may increase running time)\n"
<< "Example:\n"
<< "kmc -k27 -m24 NA19238.fastq NA.res /data/kmc_tmp_dir/\n"
<< "kmc -k27 -m24 @files.lst NA.res /data/kmc_tmp_dir/\n";
}
//----------------------------------------------------------------------------------
// Check if --help or --version was used
bool help_or_version(int argc, char** argv)
{
const string version = "--version";
const string help = "--help";
for (int i = 1; i < argc; ++i)
{
if (argv[i] == version || argv[i] == help)
return true;
}
return false;
}
bool CanCreateFile(const string& path)
{
FILE* f = fopen(path.c_str(), "wb");
if (!f)
return false;
fclose(f);
remove(path.c_str());
return true;
}
bool CanCreateFileInPath(const string& path)
{
static const string name = "kmc_test.bin"; //Some random name
if (path.back() == '\\' || path.back() == '/')
return CanCreateFile(path + name);
else
return CanCreateFile(path + '/' + name);
}
//----------------------------------------------------------------------------------
// Parse the parameters
bool parse_parameters(int argc, char* argv[], Params& params)
{
KMC::Stage1Params& stage1Params = params.stage1Params;
KMC::Stage2Params& stage2Params = params.stage2Params;
CLIParams& cliParams = params.cliParams;
int i;
int tmp;
bool was_sm = false;
bool was_r = false;
bool was_e = false;
bool was_opt_out_size = false;
if (argc < 4)
return false;
for (i = 1; i < argc; ++i)
{
if (argv[i][0] != '-')
break;
// Number of threads
if (strncmp(argv[i], "-t", 2) == 0)
{
auto nThreads = atoi(&argv[i][2]);
stage1Params.SetNThreads(nThreads); //TODO: what with stage2 in this case?
stage2Params.SetNThreads(nThreads);
}
// k-mer length
else if (strncmp(argv[i], "-k", 2) == 0)
stage1Params.SetKmerLen(atoi(&argv[i][2]));
// Memory limit
else if (strncmp(argv[i], "-m", 2) == 0)
{
tmp = atoi(&argv[i][2]);
stage1Params.SetMaxRamGB(tmp);
stage2Params.SetMaxRamGB(tmp);
}
// Minimum counter threshold
else if (strncmp(argv[i], "-ci", 3) == 0)
stage2Params.SetCutoffMin(atoi(&argv[i][3]));
// Maximum counter threshold
else if (strncmp(argv[i], "-cx", 3) == 0)
stage2Params.SetCutoffMax(atoll(&argv[i][3]));
// Maximal counter value
else if (strncmp(argv[i], "-cs", 3) == 0)
stage2Params.SetCounterMax(atoll(&argv[i][3]));
// Set p1
else if (strncmp(argv[i], "-p", 2) == 0)
stage1Params.SetSignatureLen(atoi(&argv[i][2]));
//output type
else if (strncmp(argv[i], "-o", 2) == 0)
{
if (strncmp(argv[i] + 2, "kff", 3) == 0)
stage2Params.SetOutputFileType(KMC::OutputFileType::KFF);
else if (strncmp(argv[i] + 2, "kmc", 3) == 0)
stage2Params.SetOutputFileType(KMC::OutputFileType::KMC);
else
{
std::cerr << "Error: unsupported output type: " << argv[i] << " (use -okff or -okmc)\n";
exit(1);
}
}
// FASTA input files
else if (strncmp(argv[i], "-fa", 3) == 0)
stage1Params.SetInputFileType(KMC::InputFileType::FASTA);
// FASTQ input files
else if (strncmp(argv[i], "-fq", 3) == 0)
stage1Params.SetInputFileType(KMC::InputFileType::FASTQ);
else if (strncmp(argv[i], "-fm", 3) == 0)
stage1Params.SetInputFileType(KMC::InputFileType::MULTILINE_FASTA);
else if (strncmp(argv[i], "-fbam", 5) == 0)
stage1Params.SetInputFileType(KMC::InputFileType::BAM);
else if (strncmp(argv[i], "-fkmc", 5) == 0)
stage1Params.SetInputFileType(KMC::InputFileType::KMC);
#ifdef DEVELOP_MODE
else if (strncmp(argv[i], "-vl", 3) == 0)
stage1Params.SetDevelopVerbose(true);
#endif
else if (strncmp(argv[i], "-v", 2) == 0)
{
static KMC::CerrVerboseLogger logger;
stage1Params.SetVerboseLogger(&logger);
}
else if (strncmp(argv[i], "-sm", 3) == 0 && strlen(argv[i]) == 3)
{
was_sm = true;
stage2Params.SetStrictMemoryMode(true);
}
else if (strncmp(argv[i], "-hp", 3) == 0 && strlen(argv[i]) == 3)
{
static KMC::NullPercentProgressObserver nullPercentProgressObserver;
static KMC::NullProgressObserver nullProgressObserver;
stage1Params.SetPercentProgressObserver(&nullPercentProgressObserver);
stage1Params.SetProgressObserver(&nullProgressObserver);
}
else if (strncmp(argv[i], "-hc", 3) == 0 && strlen(argv[i]) == 3)
stage1Params.SetHomopolymerCompressed(true);
else if (strncmp(argv[i], "-r", 2) == 0)
{
stage1Params.SetRamOnlyMode(true);
was_r = true;
}
else if (strncmp(argv[i], "-b", 2) == 0)
stage1Params.SetCanonicalKmers(false);
// Number of reading threads
else if (strncmp(argv[i], "-sf", 3) == 0)
stage1Params.SetNReaders(atoi(&argv[i][3]));
// Number of splitting threads
else if (strncmp(argv[i], "-sp", 3) == 0)
stage1Params.SetNSplitters(atoi(&argv[i][3]));
// Number of internal threads per 2nd stage
else if (strncmp(argv[i], "-sr", 3) == 0)
stage2Params.SetNThreads(atoi(&argv[i][3]));
else if (strncmp(argv[i], "-n", 2) == 0)
stage1Params.SetNBins(atoi(&argv[i][2]));
else if (strncmp(argv[i], "-j", 2) == 0)
{
cliParams.jsonSummaryFileName = &argv[i][2];
if (cliParams.jsonSummaryFileName == "")
cerr << "Warning: file name for json summary file missed (-j switch)\n";
}
else if (strncmp(argv[i], "-e", 2) == 0)
{
was_e = true;
stage1Params.SetEstimateHistogramCfg(KMC::EstimateHistogramCfg::ONLY_ESTIMATE);
}
else if (strcmp(argv[i], "--opt-out-size") == 0)
{
was_opt_out_size = true;
if (stage1Params.GetEstimateHistogramCfg() != KMC::EstimateHistogramCfg::ONLY_ESTIMATE) //ONLY_ESTIMATE has priority over estimate and count
stage1Params.SetEstimateHistogramCfg(KMC::EstimateHistogramCfg::ESTIMATE_AND_COUNT_KMERS);
}
else if (strncmp(argv[i], "-w", 2) == 0)
stage2Params.SetWithoutOutput(true);
if (strncmp(argv[i], "-smso", 5) == 0)
stage2Params.SetStrictMemoryNSortingThreadsPerSorters(atoi(&argv[i][5]));
if (strncmp(argv[i], "-smun", 5) == 0)
stage2Params.SetStrictMemoryNUncompactors(atoi(&argv[i][5]));
if (strncmp(argv[i], "-smme", 5) == 0)
stage2Params.SetStrictMemoryNMergers(atoi(&argv[i][5]));
}
if (argc - i < 3)
return false;
string input_file_name = string(argv[i++]);
if (stage1Params.GetEstimateHistogramCfg() == KMC::EstimateHistogramCfg::ONLY_ESTIMATE)
cliParams.estimatedHistogramFileName = argv[i++];
else
stage2Params.SetOutputFileName(argv[i++]);
stage1Params.SetTmpPath(argv[i++]);
std::vector<std::string> input_file_names;
if (input_file_name[0] != '@')
input_file_names.push_back(input_file_name);
else
{
ifstream in(input_file_name.c_str() + 1);
if (!in.good())
{
cerr << "Error: No " << input_file_name.c_str() + 1 << " file\n";
return false;
}
string s;
while (getline(in, s))
if (s != "")
input_file_names.push_back(s);
in.close();
std::random_shuffle(input_file_names.begin(), input_file_names.end());
}
stage1Params.SetInputFiles(input_file_names);
//Validate and resolve conflicts in parameters
if (was_e && was_opt_out_size)
{
std::cerr << "Warning: --opt-out-size is ignored because -e was used\n";
}
if (was_sm && was_r)
{
cerr << "Error: -sm can not be used with -r\n";
return false;
}
//Check if output files may be created and if it is possible to create file in specified tmp location
if (!stage2Params.GetWithoutOutput())
{
string pre_file_name = stage2Params.GetOutputFileName() + ".kmc_pre";
string suff_file_name = stage2Params.GetOutputFileName() + ".kmc_suf";
if (!CanCreateFile(pre_file_name))
{
cerr << "Error: Cannot create file: " << pre_file_name << "\n";
return false;
}
if (!CanCreateFile(suff_file_name))
{
cerr << "Error: Cannot create file: " << suff_file_name << "\n";
return false;
}
}
if (!CanCreateFileInPath(stage1Params.GetTmpPath()))
{
cerr << "Error: Cannot create file in specified working directory: " << stage1Params.GetTmpPath() << "\n";
return false;
}
return true;
}
void save_estimated_histogram(const std::string& fileName, const std::vector<uint64_t>& estimatedHistogram)
{
if (fileName == "")
return;
std::ofstream out(fileName);
if (!out)
{
std::cerr << "Warning: Cannot open file " << fileName << " to store estimated histogram";
return;
}
for (uint32_t i = 1; i < estimatedHistogram.size(); ++i)
out << i << "\t" << estimatedHistogram[i] << "\n";
}
void save_stats_in_json_file(const Params& params, const KMC::Stage1Results& stage1Results, const KMC::Stage2Results& stage2Results)
{
if (params.cliParams.jsonSummaryFileName == "")
return;
ofstream stats(params.cliParams.jsonSummaryFileName);
if (!stats)
{
std::cerr << "Warning: Cannot open file " << params.cliParams.jsonSummaryFileName << " to store summary of execution in JSON format";
return;
}
double time1 = stage1Results.time;
double time2 = stage2Results.time;
double time3 = stage2Results.timeStrictMem;
bool display_strict_mem_stats = params.stage2Params.GetStrictMemoryMode() && stage1Results.wasSmallKOptUsed;
stats << "{\n"
<< "\t\"1st_stage\": \"" << time1 << "s\",\n"
<< "\t\"2nd_stage\": \"" << time2 << "s\",\n";
if (display_strict_mem_stats)
{
stats << "\t\"3rd_stage\": \"" << time3 << "s\",\n";
stats << "\t\"Total\": \"" << (time1 + time2 + time3) << "s\",\n";
}
else
stats << "\t\"Total\": \"" << (time1 + time2) << "s\",\n";
if (display_strict_mem_stats)
{
stats << "\t\"Tmp_size\": \"" << stage1Results.tmpSize / 1000000 << "MB\",\n"
<< "\t\"Tmp_size_strict_memory\": \"" << stage2Results.tmpSizeStrictMemory / 1000000 << "MB\",\n"
<< "\t\"Tmp_total\": \"" << stage2Results.maxDiskUsage / 1000000 << "MB\",\n";
}
else
stats << "\t\"Tmp_size\": \"" << stage1Results.tmpSize / 1000000 << "MB\",\n";
stats << "\t\"Stats\": {\n";
stats << "\t\t\"#k-mers_below_min_threshold\": " << stage2Results.nBelowCutoffMin << ",\n"
<< "\t\t\"#k-mers_above_max_threshold\": " << stage2Results.nAboveCutoffMax << ",\n"
<< "\t\t\"#Unique_k-mers\": " << stage2Results.nUniqueKmers << ",\n"
<< "\t\t\"#Unique_counted_k-mers\": " << stage2Results.nUniqueKmers - stage2Results.nBelowCutoffMin - stage2Results.nAboveCutoffMax << ",\n"
<< "\t\t\"#Total no. of k-mers\": " << stage2Results.nTotalKmers << ",\n";
if(params.stage1Params.GetInputFileType() != KMC::InputFileType::MULTILINE_FASTA)
stats << "\t\t\"#Total_reads\": " << stage1Results.nSeqences << ",\n";
else
stats << "\t\t\"#Total_sequences\": " << stage1Results.nSeqences << ",\n";
stats << "\t\t\"#Total_super-k-mers\": " << stage1Results.nTotalSuperKmers << "\n";
stats << "\t}\n";
stats << "}\n";
stats.close();
}
void print_summary(
const Params& params,
const KMC::Stage1Results& stage1Results,
const KMC::Stage2Results& stage2Results)
{
const KMC::Stage1Params& stage1Params = params.stage1Params;
const KMC::Stage2Params& stage2Params = params.stage2Params;
cout << "1st stage: " << stage1Results.time << "s\n"
<< "2nd stage: " << stage2Results.time << "s\n";
bool display_strict_mem_stats = stage2Params.GetStrictMemoryMode() && !stage1Results.wasSmallKOptUsed;
if (display_strict_mem_stats)
{
cout << "3rd stage: " << stage2Results.timeStrictMem << "s\n";
cout << "Total : " << (stage1Results.time + stage2Results.time + stage2Results.timeStrictMem) << "s\n";
}
else
cout << "Total : " << (stage1Results.time + stage2Results.time) << "s\n";
if (display_strict_mem_stats)
{
cout << "Tmp size : " << stage1Results.tmpSize / 1000000 << "MB\n"
<< "Tmp size strict memory : " << stage2Results.tmpSizeStrictMemory / 1000000 << "MB\n"
<< "Tmp total: " << stage2Results.maxDiskUsage / 1000000 << "MB\n";
}
else
cout << "Tmp size : " << stage1Results.tmpSize / 1000000 << "MB\n";
cout << "\nStats:\n"
<< " No. of k-mers below min. threshold : " << setw(12) << stage2Results.nBelowCutoffMin << "\n"
<< " No. of k-mers above max. threshold : " << setw(12) << stage2Results.nAboveCutoffMax << "\n"
<< " No. of unique k-mers : " << setw(12) << stage2Results.nUniqueKmers << "\n"
<< " No. of unique counted k-mers : " << setw(12) << stage2Results.nUniqueKmers - stage2Results.nBelowCutoffMin - stage2Results.nAboveCutoffMax << "\n"
<< " Total no. of k-mers : " << setw(12) << stage2Results.nTotalKmers << "\n";
if (stage1Params.GetInputFileType() != KMC::InputFileType::MULTILINE_FASTA)
cout << " Total no. of reads : " << setw(12) << stage1Results.nSeqences << "\n";
else
cout << " Total no. of sequences : " << setw(12) << stage1Results.nSeqences << "\n";
cout << " Total no. of super-k-mers : " << setw(12) << stage1Results.nTotalSuperKmers << "\n";
}
//----------------------------------------------------------------------------------
// Main function
int main(int argc, char** argv)
{
if (argc == 1 || help_or_version(argc, argv))
{
usage();
return 0;
}
try
{
Params params;
if (!parse_parameters(argc, argv, params))
{
usage();
return 0;
}
KMC::Runner runner;
auto stage1Results = runner.RunStage1(params.stage1Params);
save_estimated_histogram(params.cliParams.estimatedHistogramFileName, stage1Results.estimatedHistogram);
auto stage2Results = runner.RunStage2(params.stage2Params);
print_summary(params, stage1Results, stage2Results);
save_stats_in_json_file(params, stage1Results, stage2Results);
}
catch (const std::exception& err)
{
std::cerr << "Error: " << err.what() << "\n";
return 1;
}
catch (...)
{
std::cerr << "Error: unknown exception\n";
return 1;
}
return 0;
}
|