File: RAlgorithmsShort.h

package info (click to toggle)
abyss 2.3.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,284 kB
  • sloc: cpp: 78,182; ansic: 6,512; makefile: 2,252; perl: 672; sh: 509; haskell: 412; python: 4
file content (192 lines) | stat: -rw-r--r-- 5,353 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#ifndef RRESOLVER_RALGORITHMS_SHORT_H
#define RRESOLVER_RALGORITHMS_SHORT_H 1

#include "BloomFilters.h"
#include "Common/Histogram.h"
#include "Contigs.h"

#include <cassert>
#include <climits>
#include <limits>
#include <cmath>
#include <fstream>
#include <string>
#include <vector>

const int MIN_MARGIN = 2;
const int R_HEURISTIC = 60;
const double R_HEURISTIC_A = 1.0;
const double R_HEURISTIC_B = 0.0;
const int MAX_SUBITERATIONS = 2;
const long HIST_SAMPLE_SIZE = LONG_MAX;
const long REPEAT_CASES_LIMIT = LONG_MAX;
const long READ_STATS_SAMPLE_SIZE = 100000;
const double READ_BATCH_FRACTION_THRESHOLD = 0.1;
const long PATH_COMBINATIONS_MULTITHREAD_THRESHOLD = 5000;
const double SUPPORTED_PATHS_MIN = 0.15;
const double COV_APPROX_FORMULA_FACTOR = 4.00;
const double SPACED_SEEDS_SNP_FRACTION = 1.00;

namespace opt {

/** Read Bloom filter size in bytes. */
extern size_t bloomSize;

/** The number of parallel threads */
extern int threads;

/** Prefix for the histogram files */
extern std::string histPrefix;

/** Name of the file to write resulting graph to */
extern std::string outputGraphPath;

/** Name of the file to write resulting graph to */
extern std::string outputContigsPath;

/** Number of kmers required to be found for a path to be supported */
extern int threshold;

/** Number of Rmers to extract per read */
extern int extract;

/** Minimum number of sliding window moves */
extern int minTests;

/** Maximum number of sliding window moves */
extern int maxTests;

/** Maximum number of branching paths */
extern int branching;

/** Explicitly specified r values. */
extern std::vector<int> rValues;

/** Explicitly set coverage approximation factors. */
extern std::vector<double> covApproxFactors;

/** Base pair quality threshold that large k-mers bases should have at minimum. */
extern int readQualityThreshold;

/** Flag indicating whether error correction is enabled */
extern int errorCorrection;

/** Upper limit on read size to consider for use with RResolver. */
extern unsigned maxReadSize;

/** factor to multiply Bloom filter memory budget with in order to stay within similar memory usage as the rest of the pipeline. */
extern double bfMemFactor;

/** Name of the file to write supported paths to */
extern std::string outputSupportedPathsPath;

/** Name of the file to write unsupported paths to */
extern std::string outputUnsupportedPathsPath;

extern unsigned k; // used by ContigProperties

extern int format; // used by ContigProperties

}

class Support
{
  public:

	enum class UnknownReason : uint8_t {
		UNDETERMINED = 0, // Not yet processed
		TOO_MANY_COMBINATIONS, // Branching out exploded beyond a threshold
		OVER_MAX_TESTS, // Planned tests was above the threshold
		POSSIBLE_TESTS_LT_PLANNED, // Planned tests could not be carried out due to low coverage
		WINDOW_NOT_LONG_ENOUGH, // Window too small / repeat too large to all the planned tests
		HEAD_SHORTER_THAN_MARGIN, // One of the branches to the left was too short for planned tests
		TAIL_SHORTER_THAN_MARGIN, // One of the branches to the right was too short for planned tests
		DIFFERENT_CULPRIT // The path was fine, but another path in this repeat was unknown so all the paths
		                  // for this repeat became unknown
	};

	Support() = default;

	Support(UnknownReason unknownReason): unknownReason(unknownReason) {};

	Support(int calculatedTests, UnknownReason unknownReason)
	  : found(-1)
	  , tests(-1)
		, unknownReason(unknownReason)
	{
		assert(calculatedTests >= 0);
		if (calculatedTests > std::numeric_limits<decltype(this->calculatedTests)>::max()) { this->calculatedTests = std::numeric_limits<decltype(this->calculatedTests)>::max(); } else {
			this->calculatedTests = calculatedTests;
		}
	}

	Support(int8_t found, int8_t tests)
	  : found(found)
	  , tests(tests)
	{
		assert(found >= 0);
		assert(tests > 0);
		assert(calculatedTests == -1);
	}

	Support(int8_t found, int8_t tests, int calculatedTests)
	  : found(found)
	  , tests(tests)
	{
		assert(found >= 0);
		assert(tests > 0);
		assert(calculatedTests >= 0);
		if (calculatedTests > std::numeric_limits<decltype(this->calculatedTests)>::max()) { this->calculatedTests = std::numeric_limits<decltype(this->calculatedTests)>::max(); } else {
			this->calculatedTests = calculatedTests;
		}
	}

	bool unknown() const { return tests == -1; }

	void reset()
	{
		found = -1;
		tests = -1;
	}

	bool good() const { return unknown() || found >= opt::threshold; }

	bool operator>(const Support& s) { return found > s.found; }

	bool operator<(const Support& s) { return found < s.found; }

	int8_t found = -1;
	int8_t tests = -1;
	int8_t calculatedTests = -1;
	UnknownReason unknownReason = UnknownReason::UNDETERMINED;
};

class ReadSize
{

  public:
	ReadSize(int size)
	  : size(size)
	{}

	double getFractionOfTotal() const { return double(sampleCount) / double(readsSampleSize); }

	int size;
	std::set<int> sizeAndMergedSizes;
	std::vector<int> rValues;
	//Histogram qualThresholdPositions;
	long sampleCount = 0;
	double covApproxFactor = COV_APPROX_FORMULA_FACTOR;

	static long readsSampleSize;
	static std::vector<ReadSize> readSizes;
	static ReadSize current;
};

void
resolveShort(
    const std::vector<std::string>& readFilepaths,
    ImaginaryContigPaths& supportedPaths,
    ImaginaryContigPaths& unsupportedPaths);

#endif