File: DistributionUtils.hpp

package info (click to toggle)
salmon 0.7.2%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 4,352 kB
  • ctags: 5,243
  • sloc: cpp: 42,341; ansic: 6,252; python: 228; makefile: 207; sh: 190
file content (66 lines) | stat: -rw-r--r-- 2,817 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#ifndef __DISTRIBUTION_UTILS__
#define __DISTRIBUTION_UTILS__

#include <cstddef>
#include <cstdint>
#include <vector>

// Utilities for sampling from, and computing length corrections with,
// fragment length distributions.
class FragmentLengthDistribution;
class Transcript;

namespace distribution_utils {
/**
 * Tag indicating the numeric space in which a distribution's values are
 * expressed: log space or linear space.
 */
enum class DistributionSpace : std::uint8_t {
  LOG = 0,    ///< Values are to be interpreted as existing in log space.
  LINEAR = 1  ///< Values are to be interpreted as existing in linear space.
};

  /**
   *  Draw samples from the provided fragment length distribution.
   *
   *  \param fld A pointer to the FragmentLengthDistribution from which
   *             samples will be drawn.
   *  \param numSamples  The number of samples to draw.
   *  \returns The drawn samples.
   *           NOTE(review): presumably these are fragment lengths and the
   *           vector holds numSamples entries; the name also suggests the
   *           distribution is read as a log-space PMF -- confirm against
   *           the definition.
   */
std::vector<int32_t> samplesFromLogPMF(FragmentLengthDistribution* fld,
                                       int32_t numSamples);

  /**
   * The following two functions compute conditional means of the empirical fragment length 
   * distribution, and apply them to the transcripts to compute the effective lengths.
   * To the best of our knowledge, this particular approach to effective length correction was
   * first introduced in Kallisto[1].
   * [1] Bray, Nicolas L., et al. "Near-optimal probabilistic RNA-seq quantification." Nature biotechnology 34.5 (2016): 525-527.
   **/


  /**
   * Compute the conditional mean fragment length for every length
   * in the input fragment length distribution.  For each length i,
   * the conditional mean assumes that it is not possible to sample fragments
   * of length > i, and so the probability mass is normalized by the
   * probability of all lengths <= i.
   *
   * \param mass The input fragment length distribution.  This should contain a number
   *             for each fragment length that is proportional to the probability of
   *             drawing a fragment of that length.  The input need not be normalized.
   *             NOTE(review): passed by non-const reference; whether the function
   *             modifies it is not visible from this declaration -- confirm against
   *             the definition.
   * \param inputSpace A DistributionSpace parameter that determines whether mass should
   *                   be interpreted as existing in log space or linear space.
   * \returns The conditional means for each fragment length.
   *          NOTE(review): presumably indexed by fragment length, with one entry per
   *          element of mass -- confirm against the definition.
   */
std::vector<double> correctionFactorsFromMass(std::vector<double>& mass,
                                              DistributionSpace inputSpace);

  /**
   * Populate the effective lengths of the input transcripts based on the conditional
   * means.  The function returns nothing; the transcripts are taken by non-const
   * reference.
   *
   * \sa correctionFactorsFromMass()
   * \param maxLength The maximum fragment length.
   * \param transcripts The transcripts whose lengths are to be corrected.
   * \param correctionFactors The conditional means (computed with correctionFactorsFromMass())
   * \param outputSpace A DistributionSpace value.
   *                    NOTE(review): this parameter was previously undocumented;
   *                    presumably it selects whether the effective lengths are
   *                    stored in log space or linear space -- confirm against
   *                    the definition.
   */
void computeSmoothedEffectiveLengths(size_t maxLength,
                                     std::vector<Transcript>& transcripts,
                                     std::vector<double>& correctionFactors,
                                     DistributionSpace outputSpace);

}

#endif // __DISTRIBUTION_UTILS__