1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192
|
#ifndef RRESOLVER_RALGORITHMS_SHORT_H
#define RRESOLVER_RALGORITHMS_SHORT_H 1
#include "BloomFilters.h"
#include "Common/Histogram.h"
#include "Contigs.h"
#include <cassert>
#include <climits>
#include <limits>
#include <cmath>
#include <fstream>
#include <string>
#include <vector>
const int MIN_MARGIN = 2;
const int R_HEURISTIC = 60;
const double R_HEURISTIC_A = 1.0;
const double R_HEURISTIC_B = 0.0;
const int MAX_SUBITERATIONS = 2;
const long HIST_SAMPLE_SIZE = LONG_MAX;
const long REPEAT_CASES_LIMIT = LONG_MAX;
const long READ_STATS_SAMPLE_SIZE = 100000;
const double READ_BATCH_FRACTION_THRESHOLD = 0.1;
const long PATH_COMBINATIONS_MULTITHREAD_THRESHOLD = 5000;
const double SUPPORTED_PATHS_MIN = 0.15;
const double COV_APPROX_FORMULA_FACTOR = 4.00;
const double SPACED_SEEDS_SNP_FRACTION = 1.00;
namespace opt {
/** Read Bloom filter size in bytes. */
extern size_t bloomSize;
/** The number of parallel threads */
extern int threads;
/** Prefix for the histogram files */
extern std::string histPrefix;
/** Name of the file to write resulting graph to */
extern std::string outputGraphPath;
/** Name of the file to write resulting graph to */
extern std::string outputContigsPath;
/** Number of kmers required to be found for a path to be supported */
extern int threshold;
/** Number of Rmers to extract per read */
extern int extract;
/** Minimum number of sliding window moves */
extern int minTests;
/** Maximum number of sliding window moves */
extern int maxTests;
/** Maximum number of branching paths */
extern int branching;
/** Explicitly specified r values. */
extern std::vector<int> rValues;
/** Explicitly set coverage approximation factors. */
extern std::vector<double> covApproxFactors;
/** Base pair quality threshold that large k-mers bases should have at minimum. */
extern int readQualityThreshold;
/** Flag indicating whether error correction is enabled */
extern int errorCorrection;
/** Upper limit on read size to consider for use with RResolver. */
extern unsigned maxReadSize;
/** factor to multiply Bloom filter memory budget with in order to stay within similar memory usage as the rest of the pipeline. */
extern double bfMemFactor;
/** Name of the file to write supported paths to */
extern std::string outputSupportedPathsPath;
/** Name of the file to write unsupported paths to */
extern std::string outputUnsupportedPathsPath;
extern unsigned k; // used by ContigProperties
extern int format; // used by ContigProperties
}
class Support
{
public:
enum class UnknownReason : uint8_t {
UNDETERMINED = 0, // Not yet processed
TOO_MANY_COMBINATIONS, // Branching out exploded beyond a threshold
OVER_MAX_TESTS, // Planned tests was above the threshold
POSSIBLE_TESTS_LT_PLANNED, // Planned tests could not be carried out due to low coverage
WINDOW_NOT_LONG_ENOUGH, // Window too small / repeat too large to all the planned tests
HEAD_SHORTER_THAN_MARGIN, // One of the branches to the left was too short for planned tests
TAIL_SHORTER_THAN_MARGIN, // One of the branches to the right was too short for planned tests
DIFFERENT_CULPRIT // The path was fine, but another path in this repeat was unknown so all the paths
// for this repeat became unknown
};
Support() = default;
Support(UnknownReason unknownReason): unknownReason(unknownReason) {};
Support(int calculatedTests, UnknownReason unknownReason)
: found(-1)
, tests(-1)
, unknownReason(unknownReason)
{
assert(calculatedTests >= 0);
if (calculatedTests > std::numeric_limits<decltype(this->calculatedTests)>::max()) { this->calculatedTests = std::numeric_limits<decltype(this->calculatedTests)>::max(); } else {
this->calculatedTests = calculatedTests;
}
}
Support(int8_t found, int8_t tests)
: found(found)
, tests(tests)
{
assert(found >= 0);
assert(tests > 0);
assert(calculatedTests == -1);
}
Support(int8_t found, int8_t tests, int calculatedTests)
: found(found)
, tests(tests)
{
assert(found >= 0);
assert(tests > 0);
assert(calculatedTests >= 0);
if (calculatedTests > std::numeric_limits<decltype(this->calculatedTests)>::max()) { this->calculatedTests = std::numeric_limits<decltype(this->calculatedTests)>::max(); } else {
this->calculatedTests = calculatedTests;
}
}
bool unknown() const { return tests == -1; }
void reset()
{
found = -1;
tests = -1;
}
bool good() const { return unknown() || found >= opt::threshold; }
bool operator>(const Support& s) { return found > s.found; }
bool operator<(const Support& s) { return found < s.found; }
int8_t found = -1;
int8_t tests = -1;
int8_t calculatedTests = -1;
UnknownReason unknownReason = UnknownReason::UNDETERMINED;
};
class ReadSize
{
public:
ReadSize(int size)
: size(size)
{}
double getFractionOfTotal() const { return double(sampleCount) / double(readsSampleSize); }
int size;
std::set<int> sizeAndMergedSizes;
std::vector<int> rValues;
//Histogram qualThresholdPositions;
long sampleCount = 0;
double covApproxFactor = COV_APPROX_FORMULA_FACTOR;
static long readsSampleSize;
static std::vector<ReadSize> readSizes;
static ReadSize current;
};
void
resolveShort(
const std::vector<std::string>& readFilepaths,
ImaginaryContigPaths& supportedPaths,
ImaginaryContigPaths& unsupportedPaths);
#endif
|