1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
|
// --------------------------------------------------------------------------
// OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of any author or any participating institution
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: David Wojnar $
// $Authors: David Wojnar $
// --------------------------------------------------------------------------
#ifndef OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H
#define OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H
#include <OpenMS/DATASTRUCTURES/DPosition.h>
#include <OpenMS/MATH/STATISTICS/GumbelDistributionFitter.h>
#include <OpenMS/MATH/STATISTICS/GaussFitter.h>
#include <OpenMS/DATASTRUCTURES/DefaultParamHandler.h>
#include <vector>
namespace OpenMS
{
class String;
class TextFile;
namespace Math
{
/**
@brief Implements a mixture model of the inverse gumbel and the gauss distribution or a gaussian mixture.
This class fits either a Gumbel distribution and a Gauss distribution to a set of data points or two Gaussian distributions using the EM algorithm.
One can output the fit as a gnuplot formula using getGumbelGnuplotFormula() and getGaussGnuplotFormula() after fitting.
@note All paremters are stored in GaussFitResult. In the case of the gumbel distribution x0 and sigma represent the local parameter alpha and the scale parameter beta, respectively.
@htmlinclude OpenMS_Math::PosteriorErrorProbabilityModel.parameters
@ingroup Math
*/
class OPENMS_DLLAPI PosteriorErrorProbabilityModel :
public DefaultParamHandler
{
public:
///default constructor
PosteriorErrorProbabilityModel();
///Destructor
virtual ~PosteriorErrorProbabilityModel();
/**
@brief fits the distributions to the data points(search_engine_scores). Estimated parameters for the distributions are saved in member variables. computeProbability can be used afterwards.
@param search_engine_scores a vector which holds the data points
@return true if algorithm has run thourgh. Else false will be returned. In that case no plot and no probabilities are calculated.
@note the vector is sorted from smallest to biggest value!
*/
bool fit(std::vector<double> & search_engine_scores);
/**
@brief fits the distributions to the data points(search_engine_scores) and writes the computed probabilites into the given vector (the second one).
@param search_engine_scores a vector which holds the data points
@param probabilities a vector which holds the probability for each data point after running this function. If it has some content it will be overwritten.
@return true if algorithm has run thourgh. Else false will be returned. In that case no plot and no probabilities are calculated.
@note the vectors are sorted from smallest to biggest value!
*/
bool fit(std::vector<double> & search_engine_scores, std::vector<double> & probabilities);
///Writes the distributions densities into the two vectors for a set of scores. Incorrect_densities represent the incorreclty assigned seqeuences.
void fillDensities(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///computes the Maximum Likelihood with a log-likelihood funciotn.
DoubleReal computeMaxLikelihood(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///sums (1 - posterior porbabilities)
DoubleReal one_minus_sum_post(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///sums posterior porbabilities
DoubleReal sum_post(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///helper function for the EM algorithm (for fitting)
DoubleReal sum_pos_x0(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///helper function for the EM algorithm (for fitting)
DoubleReal sum_neg_x0(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
///helper function for the EM algorithm (for fitting)
DoubleReal sum_pos_sigma(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density, DoubleReal positive_mean);
///helper function for the EM algorithm (for fitting)
DoubleReal sum_neg_sigma(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density, DoubleReal positive_mean);
///returns estimated parameters for correctly assigned sequences. Fit should be used before.
GaussFitter::GaussFitResult getCorrectlyAssignedFitResult() const
{
return correctly_assigned_fit_param_;
}
///returns estimated parameters for correctly assigned sequences. Fit should be used before.
GaussFitter::GaussFitResult getIncorrectlyAssignedFitResult() const
{
return incorrectly_assigned_fit_param_;
}
///returns the estimated negative prior probability.
DoubleReal getNegativePrior() const
{
return negative_prior_;
}
///computes the gaussian density at position x with parameters params.
DoubleReal getGauss(DoubleReal x, const GaussFitter::GaussFitResult & params)
{
return params.A * exp(-1.0 * pow(x - params.x0, 2) / (2 * pow(params.sigma, 2)));
}
///computes the gumbel density at position x with parameters params.
DoubleReal getGumbel(DoubleReal x, const GaussFitter::GaussFitResult & params)
{
DoubleReal z = exp((params.x0 - x) / params.sigma);
return (z * exp(-1 * z)) / params.sigma;
}
/**
Returns the computed posterior error probability for a given score.
@note: fit has to be used before using this function. Otherwise this function will compute nonsense.
*/
DoubleReal computeProbability(DoubleReal score);
///initializes the plots
TextFile * InitPlots(std::vector<double> & x_scores);
/// returns the gnuplot formula of the fitted gumbel distribution. Only x0 and sigma are used as local parameter alpha and scale parameter beta, respectively.
const String getGumbelGnuplotFormula(const GaussFitter::GaussFitResult & params) const;
/// returns the gnuplot formula of the fitted gauss distribution.
const String getGaussGnuplotFormula(const GaussFitter::GaussFitResult & params) const;
/// returns the gnuplot formula of the fitted mixture distribution.
const String getBothGnuplotFormula(const GaussFitter::GaussFitResult & incorrect, const GaussFitter::GaussFitResult & correct) const;
///plots the estimated distribution against target and decoy hits
void plotTargetDecoyEstimation(std::vector<double> & target, std::vector<double> & decoy);
/// returns the smallest score used in the last fit
inline DoubleReal getSmallestScore()
{
return smallest_score_;
}
private:
/// assignment operator (not implemented)
PosteriorErrorProbabilityModel & operator=(const PosteriorErrorProbabilityModel & rhs);
///Copy constructor (not implemented)
PosteriorErrorProbabilityModel(const PosteriorErrorProbabilityModel & rhs);
///stores parameters for incorrectly assigned sequences. If gumbel fit was used, A can be ignored. Furthermore, in this case, x0 and sigma are the local parameter alpha and scale parameter beta, respectively.
GaussFitter::GaussFitResult incorrectly_assigned_fit_param_;
///stores gauss parameters
GaussFitter::GaussFitResult correctly_assigned_fit_param_;
///stores final prior probability for negative peptides
DoubleReal negative_prior_;
///peak of the incorrectly assigned sequences distribution
DoubleReal max_incorrectly_;
///peak of the gauss distribution (correctly assigned sequences)
DoubleReal max_correctly_;
///smallest score which was used for fitting the model
DoubleReal smallest_score_;
///points to getGauss
DoubleReal (PosteriorErrorProbabilityModel::* calc_incorrect_)(DoubleReal x, const GaussFitter::GaussFitResult & params);
///points either to getGumbel or getGauss depending on whether on uses the gumbel or th gausian distribution for incorrectly assigned sequences.
DoubleReal (PosteriorErrorProbabilityModel::* calc_correct_)(DoubleReal x, const GaussFitter::GaussFitResult & params);
///points either to getGumbelGnuplotFormula or getGaussGnuplotFormula depending on whether on uses the gumbel or th gausian distribution for incorrectly assigned sequences.
const String (PosteriorErrorProbabilityModel::* getNegativeGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;
///points to getGumbelGnuplotFormula
const String (PosteriorErrorProbabilityModel::* getPositiveGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;
};
}
}
#endif // OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H
|