1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
|
#ifndef ASSEMBLY_COVERAGEALGORITHM_H
#define ASSEMBLY_COVERAGEALGORITHM_H 1
#include "Common/Histogram.h"
#include "Common/IOUtil.h"
#include "Common/Options.h" // for opt::rank
#include <fstream>
namespace AssemblyAlgorithms {
/** Return the k-mer coverage histogram. */
static inline
Histogram coverageHistogram(const SequenceCollectionHash& c)
{
typedef SequenceCollectionHash Graph;
Histogram h;
for (Graph::const_iterator it = c.begin();
it != c.end(); ++it) {
if (it->second.deleted())
continue;
h.insert(it->second.getMultiplicity());
}
return h;
}
/** Calculate a k-mer coverage threshold from the given k-mer coverage
* histogram. */
static inline
float calculateCoverageThreshold(const Histogram& h)
{
float cov = h.firstLocalMinimum();
if (opt::rank <= 0) {
if (cov == 0)
std::cout << "Unable to determine minimum k-mer coverage\n";
else
std::cout << "Minimum k-mer coverage is " << cov << std::endl;
}
for (unsigned iteration = 0; iteration < 100; iteration++) {
Histogram trimmed = h.trimLow((unsigned)roundf(cov));
if (opt::rank <= 0)
logger(1) << "Coverage: " << cov << "\t"
"Reconstruction: " << trimmed.size() << std::endl;
unsigned median = trimmed.median();
float cov1 = sqrt(median);
if (cov1 == cov) {
// The coverage threshold has converged.
if (opt::rank <= 0)
std::cout << "Using a coverage threshold of "
<< (unsigned)roundf(cov) << "...\n"
"The median k-mer coverage is " << median << "\n"
"The reconstruction is " << trimmed.size()
<< std::endl;
if (!opt::db.empty()) {
addToDb("coverageThreshold", (unsigned)roundf(cov));
addToDb("medianKcoverage", median);
addToDb("restruction", trimmed.size());
}
return cov;
}
cov = cov1;
}
if (opt::rank <= 0)
std::cerr << "warning: coverage threshold did not converge"
<< std::endl;
return 0;
}
/** Set the coverage-related parameters e and c from the given k-mer
* coverage histogram. */
static inline
void setCoverageParameters(const Histogram& h)
{
if (!opt::coverageHistPath.empty() && opt::rank <= 0) {
std::ofstream histFile(opt::coverageHistPath.c_str());
assert_good(histFile, opt::coverageHistPath);
histFile << h;
assert(histFile.good());
}
float minCov = calculateCoverageThreshold(h);
if (opt::rank <= 0) {
if (minCov == 0)
std::cout << "Unable to determine the "
"k-mer coverage threshold" << std::endl;
else
std::cout << "The k-mer coverage threshold is " << minCov
<< std::endl;
}
if (minCov < 2)
minCov = 2;
if ((int)opt::erode < 0) {
opt::erode = (unsigned)roundf(minCov);
if (opt::rank <= 0)
std::cout << "Setting parameter e (erode) to "
<< opt::erode << std::endl;
}
if ((int)opt::erodeStrand < 0) {
opt::erodeStrand = minCov <= 2 ? 0 : 1;
if (opt::rank <= 0)
std::cout << "Setting parameter E (erodeStrand) to "
<< opt::erodeStrand << std::endl;
}
if (opt::coverage < 0) {
opt::coverage = minCov;
if (opt::rank <= 0)
std::cout << "Setting parameter c (coverage) to "
<< opt::coverage << std::endl;
}
}
/** Remove all k-mers with multiplicity lower than the given threshold */
static inline
size_t applyKmerCoverageThreshold(SequenceCollectionHash& c, unsigned kc)
{
if (kc == 0)
return 0;
for (SequenceCollectionHash::iterator it = c.begin();
it != c.end(); ++it) {
if (it->second.getMultiplicity() < kc)
it->second.setFlag(SF_DELETE);
}
return c.cleanup();
}
} // namespace AssemblyAlgorithms
#endif
|