File: PosteriorErrorProbabilityModel.h

package info (click to toggle)
openms 1.11.1-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 436,688 kB
  • ctags: 150,907
  • sloc: cpp: 387,126; xml: 71,547; python: 7,764; ansic: 2,626; php: 2,499; sql: 737; ruby: 342; sh: 325; makefile: 128
file content (197 lines) | stat: -rw-r--r-- 11,049 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
// --------------------------------------------------------------------------
//                   OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of any author or any participating institution
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: David Wojnar $
// $Authors: David Wojnar $
// --------------------------------------------------------------------------

#ifndef OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H
#define OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H

#include <OpenMS/DATASTRUCTURES/DPosition.h>
#include <OpenMS/MATH/STATISTICS/GumbelDistributionFitter.h>
#include <OpenMS/MATH/STATISTICS/GaussFitter.h>
#include <OpenMS/DATASTRUCTURES/DefaultParamHandler.h>
#include <vector>

namespace OpenMS
{
  class String;
  class TextFile;
  namespace Math
  {


    /**
        @brief Implements a mixture model of the inverse gumbel and the gauss distribution or a gaussian mixture.

        This class fits either a Gumbel distribution and a Gauss distribution to a set of data points or two Gaussian distributions using the EM algorithm.
        One can output the fit as a gnuplot formula using getGumbelGnuplotFormula() and getGaussGnuplotFormula() after fitting.
        @note All paremters are stored in GaussFitResult. In the case of the gumbel distribution x0 and sigma represent the local parameter alpha and the scale parameter beta, respectively.

  @htmlinclude OpenMS_Math::PosteriorErrorProbabilityModel.parameters

        @ingroup Math
    */
    class OPENMS_DLLAPI PosteriorErrorProbabilityModel :
      public DefaultParamHandler
    {
public:

      ///default constructor
      PosteriorErrorProbabilityModel();

      ///Destructor
      virtual ~PosteriorErrorProbabilityModel();

      /**
          @brief fits the distributions to the data points(search_engine_scores). Estimated parameters for the distributions are saved in member variables. computeProbability can be used afterwards.
          @param search_engine_scores a vector which holds the data points
          @return true if algorithm has run thourgh. Else false will be returned. In that case no plot and no probabilities are calculated.
          @note the vector is sorted from smallest to biggest value!
      */
      bool fit(std::vector<double> & search_engine_scores);

      /**
          @brief fits the distributions to the data points(search_engine_scores) and writes the computed probabilites into the given vector (the second one).
          @param search_engine_scores a vector which holds the data points
          @param probabilities a vector which holds the probability for each data point after running this function. If it has some content it will be overwritten.
          @return true if algorithm has run thourgh. Else false will be returned. In that case no plot and no probabilities are calculated.
          @note the vectors are sorted from smallest to biggest value!
      */
      bool fit(std::vector<double> & search_engine_scores, std::vector<double> & probabilities);

      ///Writes the distributions densities into the two vectors for a set of scores. Incorrect_densities represent the incorreclty assigned seqeuences.
      void fillDensities(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///computes the Maximum Likelihood with a log-likelihood funciotn.
      DoubleReal computeMaxLikelihood(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///sums (1 - posterior porbabilities)
      DoubleReal one_minus_sum_post(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///sums  posterior porbabilities
      DoubleReal sum_post(std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///helper function for the EM algorithm (for fitting)
      DoubleReal sum_pos_x0(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///helper function for the EM algorithm (for fitting)
      DoubleReal sum_neg_x0(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density);
      ///helper function for the EM algorithm (for fitting)
      DoubleReal sum_pos_sigma(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density, DoubleReal positive_mean);
      ///helper function for the EM algorithm (for fitting)
      DoubleReal sum_neg_sigma(std::vector<double> & x_scores, std::vector<DoubleReal> & incorrect_density, std::vector<DoubleReal> & correct_density, DoubleReal positive_mean);


      ///returns estimated parameters for correctly assigned sequences. Fit should be used before.
      GaussFitter::GaussFitResult getCorrectlyAssignedFitResult() const
      {
        return correctly_assigned_fit_param_;
      }

      ///returns estimated parameters for correctly assigned sequences. Fit should be used before.
      GaussFitter::GaussFitResult getIncorrectlyAssignedFitResult() const
      {
        return incorrectly_assigned_fit_param_;
      }

      ///returns the estimated negative prior probability.
      DoubleReal getNegativePrior() const
      {
        return negative_prior_;
      }

      ///computes the gaussian density at position x with parameters params.
      DoubleReal getGauss(DoubleReal x, const GaussFitter::GaussFitResult & params)
      {
        return params.A * exp(-1.0 * pow(x - params.x0, 2) / (2 * pow(params.sigma, 2)));
      }

      ///computes the gumbel density at position x with parameters params.
      DoubleReal getGumbel(DoubleReal x, const GaussFitter::GaussFitResult & params)
      {
        DoubleReal z = exp((params.x0 - x) / params.sigma);
        return (z * exp(-1 * z)) / params.sigma;
      }

      /**
          Returns the computed posterior error probability for a given score.
          @note: fit has to be used before using this function. Otherwise this function will compute nonsense.
      */
      DoubleReal computeProbability(DoubleReal score);

      ///initializes the plots
      TextFile * InitPlots(std::vector<double> & x_scores);

      /// returns the gnuplot formula of the fitted gumbel distribution. Only x0 and sigma are used as local parameter alpha and scale parameter beta, respectively.
      const String getGumbelGnuplotFormula(const GaussFitter::GaussFitResult & params) const;

      /// returns the gnuplot formula of the fitted gauss distribution.
      const String getGaussGnuplotFormula(const GaussFitter::GaussFitResult & params) const;

      /// returns the gnuplot formula of the fitted mixture distribution.
      const String getBothGnuplotFormula(const GaussFitter::GaussFitResult & incorrect, const GaussFitter::GaussFitResult & correct) const;

      ///plots the estimated distribution against target and decoy hits
      void plotTargetDecoyEstimation(std::vector<double> & target, std::vector<double> & decoy);

      /// returns the smallest score used in the last fit
      inline DoubleReal getSmallestScore()
      {
        return smallest_score_;
      }

private:
      /// assignment operator (not implemented)
      PosteriorErrorProbabilityModel & operator=(const PosteriorErrorProbabilityModel & rhs);
      ///Copy constructor (not implemented)
      PosteriorErrorProbabilityModel(const PosteriorErrorProbabilityModel & rhs);
      ///stores parameters for incorrectly assigned sequences. If gumbel fit was used, A can be ignored. Furthermore, in this case, x0 and sigma are the local parameter alpha and scale parameter beta, respectively.
      GaussFitter::GaussFitResult incorrectly_assigned_fit_param_;
      ///stores gauss parameters
      GaussFitter::GaussFitResult correctly_assigned_fit_param_;
      ///stores final prior probability for negative peptides
      DoubleReal negative_prior_;
      ///peak of the incorrectly assigned sequences distribution
      DoubleReal max_incorrectly_;
      ///peak of the gauss distribution (correctly assigned sequences)
      DoubleReal max_correctly_;
      ///smallest score which was used for fitting the model
      DoubleReal smallest_score_;
      ///points to getGauss
      DoubleReal (PosteriorErrorProbabilityModel::* calc_incorrect_)(DoubleReal x, const GaussFitter::GaussFitResult & params);
      ///points either to getGumbel or getGauss depending on whether on uses the gumbel or th gausian distribution for incorrectly assigned sequences.
      DoubleReal (PosteriorErrorProbabilityModel::* calc_correct_)(DoubleReal x, const GaussFitter::GaussFitResult & params);
      ///points either to getGumbelGnuplotFormula or getGaussGnuplotFormula depending on whether on uses the gumbel or th gausian distribution for incorrectly assigned sequences.
      const String (PosteriorErrorProbabilityModel::* getNegativeGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;
      ///points to getGumbelGnuplotFormula
      const String (PosteriorErrorProbabilityModel::* getPositiveGnuplotFormula_)(const GaussFitter::GaussFitResult & params) const;

    };
  }
}

#endif // OPENMS_MATH_STATISTICS_POSTERIORERRORPROBABILITYMODEL_H