/**
 * lengthdistribution.h
 * express
 *
 * Created by Adam Roberts on 1/30/13.
 * Copyright 2013 Adam Roberts. All rights reserved.
 */
// Modifications by Rob Patro; 2014
#ifndef FRAGMENT_LENGTH_DISTRIBUTION
#define FRAGMENT_LENGTH_DISTRIBUTION
#include "tbb/atomic.h"
#include <atomic>
#include <vector>
#include <string>
#include <mutex>
/**
 * The FragmentLengthDistribution class keeps track of the observed length
 * distribution. It is initialized with a Gaussian prior with parameters
 * specified by the arguments to the constructor. An argument-specified
 * binomial kernel is then added for each observation. All mass values and
 * probabilities are stored and returned in log space (except in toString).
 *
 * NOTE(review): the class holds a mutex and several atomics, so it looks like
 * it is meant to be updated from multiple threads; the exact thread-safety
 * contract of each member function lives in the implementation file and
 * should be confirmed there.
 */
class FragmentLengthDistribution {
  /**
   * A private vector that stores the (logged) kernel values.
   */
  std::vector<double> kernel_;

  /**
   * A private vector that stores the observed (logged) mass for each length.
   * NOTE(review): tbb::atomic is deprecated in recent TBB releases; verify
   * against the TBB version this project builds with.
   */
  std::vector<tbb::atomic<double>> hist_;

  /**
   * A private vector that holds the cached (logged) cumulative mass function.
   * NOTE(review): presumably filled by cacheCMF() and only meaningful once
   * haveCachedCMF_ is true -- confirm against the implementation file.
   */
  std::vector<double> cachedCMF_;

  /**
   * A private flag recording whether cachedCMF_ has been populated.
   * This is a std::atomic<bool> rather than a volatile bool: volatile gives
   * neither atomicity nor inter-thread ordering, so it must not be used to
   * publish state between threads. Defaults to false unless the constructor's
   * initializer list sets it explicitly.
   */
  std::atomic<bool> haveCachedCMF_{false};

  /**
   * A private mutex owned by the distribution.
   * NOTE(review): presumably guards construction of the cached CMF -- confirm
   * against the implementation file.
   */
  std::mutex fldMut_;

  /**
   * A private double that stores the total observed (logged) mass.
   */
  tbb::atomic<double> totMass_;

  /**
   * A private double that stores the (logged) sum of the product of observed
   * lengths and masses for quick mean calculations.
   */
  tbb::atomic<double> sum_;

  /**
   * A private size_t that stores the minimum observed length.
   */
  std::atomic<size_t> min_;

  /**
   * A size for internal binning of the lengths in the distribution.
   */
  size_t binSize_;

public:
  /**
   * FragmentLengthDistribution Constructor.
   * @param alpha double that sets the average pseudo-counts (logged).
   * @param max_val an integer that sets the maximum allowable length.
   * @param prior_mu a size_t for the mean of the prior gaussian distribution.
   *        If 0, a uniform distribution is used instead.
   * @param prior_sigma a size_t for the standard deviation of the prior
   *        gaussian distribution.
   * @param kernel_n a size_t specifying the number of trials in the kernel
   *        binomial distribution. Must be odd.
   * @param kernel_p a double specifying the success probability for the
   *        kernel binomial distribution.
   * @param bin_size a size_t specifying the size of bins to use internally to
   *        reduce the number of parameters in the distribution (default 1).
   */
  FragmentLengthDistribution(double alpha, size_t max_val, size_t prior_mu,
                             size_t prior_sigma, size_t kernel_n,
                             double kernel_p, size_t bin_size = 1);

  /**
   * An accessor for the maximum allowed length.
   * @return Max allowed length.
   */
  size_t maxVal() const;

  /**
   * An accessor for the minimum observed length.
   * NOTE(review): previously documented as "1 initially"; the initial value
   * is set by the constructor in the implementation file -- confirm there.
   * @return Minimum observed length.
   */
  size_t minVal() const;

  /**
   * An accessor for the mean length in the distribution.
   * @return Mean observed length.
   */
  double mean() const;

  /**
   * A member function that updates the distribution based on a new length
   * observation.
   * @param len an integer for the observed length.
   * @param mass a double for the mass (logged) to add.
   */
  void addVal(size_t len, double mass);

  /**
   * An accessor for the (logged) probability of a given length.
   * @param len an integer for the length to return the probability of.
   * @return (logged) probability of observing the given length.
   */
  double pmf(size_t len) const;

  /**
   * A member function that returns a (logged) cumulative mass for a given
   * length.
   * @param len an integer for the length to return the cmf value of.
   * @return (Logged) cmf value of length.
   */
  double cmf(size_t len) const;

  /**
   * A member function that caches the cumulative mass function into
   * a vector. This should be called (once), when the fld will no
   * longer be updated. It will make future calls to cmf(len)
   * much faster.
   */
  void cacheCMF();

  /**
   * A member function that returns a vector containing the (logged)
   * cumulative mass function *for the bins*.
   * @return (Logged) cmf of bins.
   */
  std::vector<double> cmf() const;

  /**
   * A member function that fills in a vector containing the (logged)
   * probability mass function *for the bins*, and the min and max values.
   * Nothing is returned; all results are delivered through the out-parameters.
   * @param pmfOut vector that receives the (logged) pmf of the bins.
   * @param minV receives the min value.
   * @param maxV receives the max value.
   */
  void dumpPMF(std::vector<double>& pmfOut, size_t& minV, size_t& maxV) const;

  /**
   * An accessor for the (logged) observation mass (including pseudo-counts).
   * @return Total observation mass.
   */
  double totMass() const;

  /**
   * A member function that returns a string containing the current
   * distribution.
   * @return Space-separated string of probabilities ordered from length 0 to
   *         max_val (non-logged).
   */
  std::string toString() const;
  //std::string to_string() const;

  /**
   * (Currently disabled -- the declaration below is commented out.)
   * A member function that appends the LengthDistribution parameters to the
   * end of the given file.
   * @param outfile the file to append to.
   * @param length_type a string specifying the type of length the
   *        distribution is of (ie. "Fragment" or "Target") to be included in
   *        the header.
   */
  //void append_output(std::ofstream& outfile, std::string length_type) const;
};
#endif
|