File: SalmonUtils.hpp

package info (click to toggle)
salmon 1.10.2%2Bds1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 35,580 kB
  • sloc: cpp: 199,285; ansic: 171,082; sh: 859; python: 792; makefile: 235
file content (346 lines) | stat: -rw-r--r-- 10,738 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#ifndef __SALMON_UTILS_HPP__
#define __SALMON_UTILS_HPP__

extern "C" {
#include "io_lib/os.h"
#include "io_lib/scram.h"
#undef min
#undef max
}

#include <algorithm>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <iostream>
#include <memory>
#include <random>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <Eigen/Dense>

#include "oneapi/tbb/task_arena.h"

#include "cereal/archives/json.hpp"
#include "spdlog/fmt/fmt.h"

#include "SalmonMath.hpp"
#include "SalmonOpts.hpp"

#include "GenomicFeature.hpp"
#include "LibraryFormat.hpp"
#include "pufferfish/Util.hpp"
#include "ReadLibrary.hpp"
#include "SalmonConfig.hpp"
#include "TranscriptGeneMap.hpp"

template <typename EqBuilderT> class ReadExperiment;
class LibraryFormat;
class FragmentLengthDistribution;
class Transcript;

namespace salmon {
namespace utils {

using std::string;
using NameVector = std::vector<string>;
using IndexVector = std::vector<size_t>;
using KmerVector = std::vector<uint64_t>;
using MateStatus = pufferfish::util::MateStatus;

// Keep track of the type of mapping that was obtained for this read
enum class MappingType : uint8_t {
  UNMAPPED = 0,
  LEFT_ORPHAN = 1,
  RIGHT_ORPHAN = 2,
  BOTH_ORPHAN = 3,
  PAIRED_MAPPED = 4,
  SINGLE_MAPPED = 5,
  DECOY = 6
};

enum class DuplicateTargetStatus : uint8_t { 
  UNKNOWN = 0, 
  RETAINED_DUPLICATES = 1, 
  REMOVED_DUPLICATES = 2 
};

std::string str(const MappingType& mt);

// To keep track of short fragments (shorter than the k-mer length)
// on which the index was built.
struct ShortFragStats {
  size_t numTooShort{0};
  size_t shortest{std::numeric_limits<size_t>::max()};
};

// An enum class for direction to avoid potential errors
// with keeping everything as a bool
enum class Direction { FORWARD = 0, REVERSE_COMPLEMENT = 1, REVERSE = 2 };

// Returns FORWARD if isFwd is true and REVERSE_COMPLEMENT otherwise
constexpr inline Direction boolToDirection(bool isFwd) {
  return isFwd ? Direction::FORWARD : Direction::REVERSE_COMPLEMENT;
}

// Returns a uint64_t where the upper 32-bits
// contain tid and the lower 32-bits contain offset
uint64_t encode(uint64_t tid, uint64_t offset);

// Given a uin64_t generated by encode(), return the
// transcript id --- upper 32-bits
uint32_t transcript(uint64_t enc);

// Given a uin64_t generated by encode(), return the
// offset --- lower 32-bits
uint32_t offset(uint64_t enc);

LibraryFormat parseLibraryFormatStringNew(std::string& fmt);

std::vector<ReadLibrary>
extractReadLibraries(boost::program_options::parsed_options& orderedOptions);

LibraryFormat parseLibraryFormatString(std::string& fmt);

bool peekBAMIsPaired(const boost::filesystem::path& fname);

size_t numberOfReadsInFastaFile(const std::string& fname);

bool readKmerOrder(const std::string& fname, std::vector<uint64_t>& kmers);

template <template <typename> class S, typename T>
bool overlap(const S<T>& a, const S<T>& b);

template <typename T>
TranscriptGeneMap
transcriptToGeneMapFromFeatures(std::vector<GenomicFeature<T>>& feats);

TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
                                           std::string key = "gene_id");

TranscriptGeneMap readTranscriptToGeneMap(std::ifstream& ifile);

TranscriptGeneMap
transcriptToGeneMapFromFasta(const std::string& transcriptsFile);

bool readEquivCounts(boost::filesystem::path& eqFilePathString,
                     std::vector<string>& tnames,
                     std::vector<double>& tlens,
                     std::vector<std::vector<uint32_t>>& eqclasses,
                     std::vector<std::vector<double>>& auxs_vals,
                     std::vector<uint32_t>& eqclass_counts );

/**
 * @param sopt : The salmon options object that tells us the relevant files and contains the pointer 
 *              to the logger object
 * @param transcripts : The list of transcript objects
 * 
 * If the auxTargetFile is not empty (i.e. if the file exists)
 **/
void markAuxiliaryTargets(SalmonOpts& sopt, std::vector<Transcript>& transcripts);

/*
template <typename AbundanceVecT, typename ReadExpT>
Eigen::VectorXd updateEffectiveLengths(
        SalmonOpts& sopt,
        ReadExpT& readExp,
        Eigen::VectorXd& effLensIn,
        AbundanceVecT& alphas,
    bool finalRound = false);
*/

template <typename AbundanceVecT, typename ReadExpT>
Eigen::VectorXd
updateEffectiveLengths(oneapi::tbb::task_arena& arena, SalmonOpts& sopt, ReadExpT& readExp,
                       Eigen::VectorXd& effLensIn, AbundanceVecT& alphas,
                       std::vector<bool>& available, bool finalRound = false);

/*
 * Use atomic compare-and-swap to update val to
 * val + inc (*in log-space*).  Update occurs in a loop in case other
 * threads update in the meantime.
 */
inline void incLoopLog(std::atomic<double>& val, double inc) {
  double oldMass = val.load();
  double newMass;
  do {
    newMass = salmon::math::logAdd(oldMass, inc);
  } while (! val.compare_exchange_strong(oldMass, newMass));
}

/*
 * Same as above, but overloaded for "plain" doubles
 */
inline void incLoop(double& val, double inc) { val += inc; }

/*
 * Use atomic compare-and-swap to update val to
 * val + inc.  Update occurs in a loop in case other
 * threads update in the meantime.
 */

inline void incLoop(std::atomic<double>& val, double inc) {
  double oldMass = val.load();
  double newMass;
  do {
    newMass = oldMass + inc;
  } while (!val.compare_exchange_strong(oldMass, newMass));
}

std::string getCurrentTimeAsString();

// encodes the heuristic for guessing how threads should
// be allocated based on the available reads
// returns true if input was modified and false otherwise.
bool configure_parsing(size_t nfiles,            // input param
                       size_t& worker_threads,   // input/output param
                       uint32_t& parse_threads    // input/output param
);

bool validateOptionsAlignment_(SalmonOpts& sopt);
bool validateOptionsMapping_(SalmonOpts& sopt);

bool createAuxMapLoggers_(SalmonOpts& sopt,
                          boost::program_options::variables_map& vm);

bool processQuantOptions(SalmonOpts& sopt,
                         boost::program_options::variables_map& vm,
                         int32_t numBiasSamples);

template <typename OrderedOptionsT>
bool writeCmdInfo(SalmonOpts& sopt, OrderedOptionsT& orderedOptions) {
  namespace bfs = boost::filesystem;
  bfs::path cmdInfoPath = sopt.outputDirectory / "cmd_info.json";
  std::ofstream os(cmdInfoPath.string());
  cereal::JSONOutputArchive oa(os);
  oa(cereal::make_nvp("salmon_version", std::string(salmon::version)));
  for (auto& opt : orderedOptions.options) {
    if (opt.value.size() == 1) {
      oa(cereal::make_nvp(opt.string_key, opt.value.front()));
    } else {
      oa(cereal::make_nvp(opt.string_key, opt.value));
    }
  }
  // explicitly output the aux directory as well
  oa(cereal::make_nvp("auxDir", sopt.auxDir));
  return true;
}

void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
                                   boost::filesystem::path& inputPath);

// NOTE: Throws an invalid_argument exception of the quant or
// quant_bias_corrected files do not exist!
void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
                                boost::filesystem::path& estDir);

enum class OrphanStatus : uint8_t {
  LeftOrphan = 0,
  RightOrphan = 1,
  Paired = 2
};

bool headersAreConsistent(SAM_hdr* h1, SAM_hdr* h2);

bool headersAreConsistent(std::vector<SAM_hdr*>&& headers);

inline void reverseComplement(const char* s, int32_t l, std::string& o) {
  if (static_cast<decltype(o.size())>(l) > o.size()) {
    o.resize(l, 'A');
  }
  int32_t j = 0;
  for (int32_t i = l - 1; i >= 0; --i, ++j) {
    switch (s[i]) {
    case 'A':
    case 'a':
      o[j] = 'T';
      break;
    case 'C':
    case 'c':
      o[j] = 'G';
      break;
    case 'T':
    case 't':
      o[j] = 'A';
      break;
    case 'G':
    case 'g':
      o[j] = 'C';
      break;
    default:
      o[j] = 'N';
      break;
    }
  }
}

inline std::string reverseComplement(const char* s, int32_t l) {
  std::string o(l, 'A');
  reverseComplement(s, l, o);
  return o;
}

template <typename AlnLibT>
void writeAbundances(const SalmonOpts& sopt, AlnLibT& alnLib,
                     boost::filesystem::path& fname,
                     std::string headerComments = "");

template <typename AlnLibT>
void writeAbundancesFromCollapsed(const SalmonOpts& sopt, AlnLibT& alnLib,
                                  boost::filesystem::path& fname,
                                  std::string headerComments = "");

template <typename AlnLibT>
void normalizeAlphas(const SalmonOpts& sopt, AlnLibT& alnLib);

bool isCompatible(const LibraryFormat observed, const LibraryFormat expected,
                  int32_t start, bool isForward, MateStatus ms);

double logAlignFormatProb(const LibraryFormat observed,
                          const LibraryFormat expected, int32_t start,
                          bool isForward, MateStatus ms,
                          double incompatPrior);

bool compatibleHit(const LibraryFormat expected, int32_t start, bool isForward,
                   MateStatus ms);
bool compatibleHit(const LibraryFormat expected, const LibraryFormat observed);

std::ostream& operator<<(std::ostream& os, OrphanStatus s);
/**
 *  Given the information about the position and strand from which a paired-end
 *  read originated, return the library format with which it is compatible.
 */
LibraryFormat hitType(int32_t end1Start, bool end1Fwd, int32_t end2Start,
                      bool end2Fwd);
LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1,
                      int32_t end2Start, bool end2Fwd, uint32_t len2,
                      bool canDovetail);
/**
 *  Given the information about the position and strand from which the
 *  single-end read originated, return the library format with which it
 *  is compatible.
 */
LibraryFormat hitType(int32_t readStart, bool isForward);

double compute_1_edit_threshold(int32_t l, const SalmonOpts& sopt);

//std::random_device get_random_device();

/**
 *  Cache the mappings provided in an efficient binary format
 *  to the provided file handle.
 *  NOTE: This function assumes the file handle is unique to
 *  the calling thread, no attempt is made to synchronize
 *  output between multiple threads.
 */
  //template <typename AlnT>
  //using AlnGroupVecRange = core::range<typename AlnGroupVec<AlnT>::iterator>;
  //void cacheMappings(AlnGroupVecRange<QuasiAlignment> hits, std::ofstream& ofile);


} // namespace utils
} // namespace salmon

#endif // __SALMON_UTILS_HPP__