1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
|
#ifndef __SALMON_UTILS_HPP__
#define __SALMON_UTILS_HPP__
extern "C" {
#include "io_lib/os.h"
#include "io_lib/scram.h"
#undef min
#undef max
}
#include <algorithm>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <iostream>
#include <memory>
#include <random>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <Eigen/Dense>
#include "oneapi/tbb/task_arena.h"
#include "cereal/archives/json.hpp"
#include "spdlog/fmt/fmt.h"
#include "SalmonMath.hpp"
#include "SalmonOpts.hpp"
#include "GenomicFeature.hpp"
#include "LibraryFormat.hpp"
#include "pufferfish/Util.hpp"
#include "ReadLibrary.hpp"
#include "SalmonConfig.hpp"
#include "TranscriptGeneMap.hpp"
template <typename EqBuilderT> class ReadExperiment;
class LibraryFormat;
class FragmentLengthDistribution;
class Transcript;
namespace salmon {
namespace utils {
using std::string;
using NameVector = std::vector<string>;
using IndexVector = std::vector<size_t>;
using KmerVector = std::vector<uint64_t>;
using MateStatus = pufferfish::util::MateStatus;
// Keep track of the type of mapping that was obtained for this read
enum class MappingType : uint8_t {
UNMAPPED = 0,
LEFT_ORPHAN = 1,
RIGHT_ORPHAN = 2,
BOTH_ORPHAN = 3,
PAIRED_MAPPED = 4,
SINGLE_MAPPED = 5,
DECOY = 6
};
enum class DuplicateTargetStatus : uint8_t {
UNKNOWN = 0,
RETAINED_DUPLICATES = 1,
REMOVED_DUPLICATES = 2
};
std::string str(const MappingType& mt);
// To keep track of short fragments (shorter than the k-mer length)
// on which the index was built.
struct ShortFragStats {
size_t numTooShort{0};
size_t shortest{std::numeric_limits<size_t>::max()};
};
// An enum class for direction to avoid potential errors
// with keeping everything as a bool
enum class Direction { FORWARD = 0, REVERSE_COMPLEMENT = 1, REVERSE = 2 };
// Returns FORWARD if isFwd is true and REVERSE_COMPLEMENT otherwise
constexpr inline Direction boolToDirection(bool isFwd) {
return isFwd ? Direction::FORWARD : Direction::REVERSE_COMPLEMENT;
}
// Returns a uint64_t where the upper 32-bits
// contain tid and the lower 32-bits contain offset
uint64_t encode(uint64_t tid, uint64_t offset);
// Given a uin64_t generated by encode(), return the
// transcript id --- upper 32-bits
uint32_t transcript(uint64_t enc);
// Given a uin64_t generated by encode(), return the
// offset --- lower 32-bits
uint32_t offset(uint64_t enc);
LibraryFormat parseLibraryFormatStringNew(std::string& fmt);
std::vector<ReadLibrary>
extractReadLibraries(boost::program_options::parsed_options& orderedOptions);
LibraryFormat parseLibraryFormatString(std::string& fmt);
bool peekBAMIsPaired(const boost::filesystem::path& fname);
size_t numberOfReadsInFastaFile(const std::string& fname);
bool readKmerOrder(const std::string& fname, std::vector<uint64_t>& kmers);
template <template <typename> class S, typename T>
bool overlap(const S<T>& a, const S<T>& b);
template <typename T>
TranscriptGeneMap
transcriptToGeneMapFromFeatures(std::vector<GenomicFeature<T>>& feats);
TranscriptGeneMap transcriptGeneMapFromGTF(const std::string& fname,
std::string key = "gene_id");
TranscriptGeneMap readTranscriptToGeneMap(std::ifstream& ifile);
TranscriptGeneMap
transcriptToGeneMapFromFasta(const std::string& transcriptsFile);
bool readEquivCounts(boost::filesystem::path& eqFilePathString,
std::vector<string>& tnames,
std::vector<double>& tlens,
std::vector<std::vector<uint32_t>>& eqclasses,
std::vector<std::vector<double>>& auxs_vals,
std::vector<uint32_t>& eqclass_counts );
/**
* @param sopt : The salmon options object that tells us the relevant files and contains the pointer
* to the logger object
* @param transcripts : The list of transcript objects
*
* If the auxTargetFile is not empty (i.e. if the file exists)
**/
void markAuxiliaryTargets(SalmonOpts& sopt, std::vector<Transcript>& transcripts);
/*
template <typename AbundanceVecT, typename ReadExpT>
Eigen::VectorXd updateEffectiveLengths(
SalmonOpts& sopt,
ReadExpT& readExp,
Eigen::VectorXd& effLensIn,
AbundanceVecT& alphas,
bool finalRound = false);
*/
template <typename AbundanceVecT, typename ReadExpT>
Eigen::VectorXd
updateEffectiveLengths(oneapi::tbb::task_arena& arena, SalmonOpts& sopt, ReadExpT& readExp,
Eigen::VectorXd& effLensIn, AbundanceVecT& alphas,
std::vector<bool>& available, bool finalRound = false);
/*
* Use atomic compare-and-swap to update val to
* val + inc (*in log-space*). Update occurs in a loop in case other
* threads update in the meantime.
*/
inline void incLoopLog(std::atomic<double>& val, double inc) {
double oldMass = val.load();
double newMass;
do {
newMass = salmon::math::logAdd(oldMass, inc);
} while (! val.compare_exchange_strong(oldMass, newMass));
}
/*
* Same as above, but overloaded for "plain" doubles
*/
inline void incLoop(double& val, double inc) { val += inc; }
/*
* Use atomic compare-and-swap to update val to
* val + inc. Update occurs in a loop in case other
* threads update in the meantime.
*/
inline void incLoop(std::atomic<double>& val, double inc) {
double oldMass = val.load();
double newMass;
do {
newMass = oldMass + inc;
} while (!val.compare_exchange_strong(oldMass, newMass));
}
std::string getCurrentTimeAsString();
// encodes the heuristic for guessing how threads should
// be allocated based on the available reads
// returns true if input was modified and false otherwise.
bool configure_parsing(size_t nfiles, // input param
size_t& worker_threads, // input/output param
uint32_t& parse_threads // input/output param
);
bool validateOptionsAlignment_(SalmonOpts& sopt);
bool validateOptionsMapping_(SalmonOpts& sopt);
bool createAuxMapLoggers_(SalmonOpts& sopt,
boost::program_options::variables_map& vm);
bool processQuantOptions(SalmonOpts& sopt,
boost::program_options::variables_map& vm,
int32_t numBiasSamples);
template <typename OrderedOptionsT>
bool writeCmdInfo(SalmonOpts& sopt, OrderedOptionsT& orderedOptions) {
namespace bfs = boost::filesystem;
bfs::path cmdInfoPath = sopt.outputDirectory / "cmd_info.json";
std::ofstream os(cmdInfoPath.string());
cereal::JSONOutputArchive oa(os);
oa(cereal::make_nvp("salmon_version", std::string(salmon::version)));
for (auto& opt : orderedOptions.options) {
if (opt.value.size() == 1) {
oa(cereal::make_nvp(opt.string_key, opt.value.front()));
} else {
oa(cereal::make_nvp(opt.string_key, opt.value));
}
}
// explicitly output the aux directory as well
oa(cereal::make_nvp("auxDir", sopt.auxDir));
return true;
}
void aggregateEstimatesToGeneLevel(TranscriptGeneMap& tgm,
boost::filesystem::path& inputPath);
// NOTE: Throws an invalid_argument exception of the quant or
// quant_bias_corrected files do not exist!
void generateGeneLevelEstimates(boost::filesystem::path& geneMapPath,
boost::filesystem::path& estDir);
enum class OrphanStatus : uint8_t {
LeftOrphan = 0,
RightOrphan = 1,
Paired = 2
};
bool headersAreConsistent(SAM_hdr* h1, SAM_hdr* h2);
bool headersAreConsistent(std::vector<SAM_hdr*>&& headers);
inline void reverseComplement(const char* s, int32_t l, std::string& o) {
if (static_cast<decltype(o.size())>(l) > o.size()) {
o.resize(l, 'A');
}
int32_t j = 0;
for (int32_t i = l - 1; i >= 0; --i, ++j) {
switch (s[i]) {
case 'A':
case 'a':
o[j] = 'T';
break;
case 'C':
case 'c':
o[j] = 'G';
break;
case 'T':
case 't':
o[j] = 'A';
break;
case 'G':
case 'g':
o[j] = 'C';
break;
default:
o[j] = 'N';
break;
}
}
}
inline std::string reverseComplement(const char* s, int32_t l) {
std::string o(l, 'A');
reverseComplement(s, l, o);
return o;
}
template <typename AlnLibT>
void writeAbundances(const SalmonOpts& sopt, AlnLibT& alnLib,
boost::filesystem::path& fname,
std::string headerComments = "");
template <typename AlnLibT>
void writeAbundancesFromCollapsed(const SalmonOpts& sopt, AlnLibT& alnLib,
boost::filesystem::path& fname,
std::string headerComments = "");
template <typename AlnLibT>
void normalizeAlphas(const SalmonOpts& sopt, AlnLibT& alnLib);
bool isCompatible(const LibraryFormat observed, const LibraryFormat expected,
int32_t start, bool isForward, MateStatus ms);
double logAlignFormatProb(const LibraryFormat observed,
const LibraryFormat expected, int32_t start,
bool isForward, MateStatus ms,
double incompatPrior);
bool compatibleHit(const LibraryFormat expected, int32_t start, bool isForward,
MateStatus ms);
bool compatibleHit(const LibraryFormat expected, const LibraryFormat observed);
std::ostream& operator<<(std::ostream& os, OrphanStatus s);
/**
* Given the information about the position and strand from which a paired-end
* read originated, return the library format with which it is compatible.
*/
LibraryFormat hitType(int32_t end1Start, bool end1Fwd, int32_t end2Start,
bool end2Fwd);
LibraryFormat hitType(int32_t end1Start, bool end1Fwd, uint32_t len1,
int32_t end2Start, bool end2Fwd, uint32_t len2,
bool canDovetail);
/**
* Given the information about the position and strand from which the
* single-end read originated, return the library format with which it
* is compatible.
*/
LibraryFormat hitType(int32_t readStart, bool isForward);
double compute_1_edit_threshold(int32_t l, const SalmonOpts& sopt);
//std::random_device get_random_device();
/**
* Cache the mappings provided in an efficient binary format
* to the provided file handle.
* NOTE: This function assumes the file handle is unique to
* the calling thread, no attempt is made to synchronize
* output between multiple threads.
*/
//template <typename AlnT>
//using AlnGroupVecRange = core::range<typename AlnGroupVec<AlnT>::iterator>;
//void cacheMappings(AlnGroupVecRange<QuasiAlignment> hits, std::ofstream& ofile);
} // namespace utils
} // namespace salmon
#endif // __SALMON_UTILS_HPP__
|