1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
|
#ifndef __DISTRIBUTION_UTILS__
#define __DISTRIBUTION_UTILS__
#include <cstddef>
#include <cstdint>
#include <vector>
// Utilities for working with fragment length distributions: drawing samples
// and computing effective-length correction factors.
class FragmentLengthDistribution;
class Transcript;
namespace distribution_utils {
/**
* Identifies the numeric space in which the values of a distribution are
* expressed: log space (LOG) or linear space (LINEAR).
* The underlying type is uint8_t, with LOG = 0 and LINEAR = 1.
*/
enum class DistributionSpace : uint8_t { LOG = 0, LINEAR = 1 };
/**
* Draw samples from the provided fragment length distribution.
*
* \param fld A pointer to the FragmentLengthDistribution from which
* samples will be drawn.
* \param numSamples The number of samples to draw.
* \returns A vector of int32_t values; presumably the sampled fragment
* lengths, one per requested sample (TODO: confirm against the definition).
*
* NOTE(review): the function name suggests that the distribution's PMF is read
* in log space; confirm against the definition.
*/
std::vector<int32_t> samplesFromLogPMF(FragmentLengthDistribution* fld,
int32_t numSamples);
/**
* The following two functions compute conditional means of the empirical fragment length
* distribution, and apply them to the transcripts to compute the effective lengths.
* To the best of our knowledge, this particular approach to effective length correction was
* first introduced in kallisto [1].
*
* [1] Bray, Nicolas L., et al. "Near-optimal probabilistic RNA-seq quantification."
* Nature Biotechnology 34.5 (2016): 525-527.
*/
/**
* Compute the conditional mean fragment length for every length
* in the input fragment length distribution. For each length i,
* the conditional mean assumes that it is not possible to sample fragments
* of length > i, and so the probability mass is normalized by the
* probability of all lengths <= i.
*
* \param mass The input fragment length distribution. This should contain a number
* for each fragment length that is proportional to the probability of
* drawing a fragment of that length. The input need not be normalized.
* \param inputSpace A DistributionSpace parameter that determines whether mass should
* be interpreted as existing in log space or linear space.
* \returns The conditional means for each fragment length.
*
* NOTE(review): mass is taken by non-const reference; whether the function
* modifies it is not visible from this declaration. Confirm against the definition.
*/
std::vector<double> correctionFactorsFromMass(std::vector<double>& mass,
DistributionSpace inputSpace);
/**
* Populate the effective lengths of the input transcripts based on the conditional
* means.
*
* \sa correctionFactorsFromMass()
* \param maxLength The maximum fragment length.
* \param transcripts The transcripts whose lengths are to be corrected.
* \param correctionFactors The conditional means (computed with correctionFactorsFromMass())
* \param outputSpace A DistributionSpace value; presumably selects whether the
* effective lengths are produced in log space or linear space
* (TODO: confirm against the definition).
*
* NOTE(review): nothing is returned; results are presumably written into the
* elements of transcripts. correctionFactors is taken by non-const reference,
* and whether it is modified is not visible from this declaration.
*/
void computeSmoothedEffectiveLengths(size_t maxLength,
std::vector<Transcript>& transcripts,
std::vector<double>& correctionFactors,
DistributionSpace outputSpace);
}
#endif // __DISTRIBUTION_UTILS__
|