File: IDPosteriorErrorProbability.C

package info (click to toggle)
openms 1.11.1-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 436,688 kB
  • ctags: 150,907
  • sloc: cpp: 387,126; xml: 71,547; python: 7,764; ansic: 2,626; php: 2,499; sql: 737; ruby: 342; sh: 325; makefile: 128
file content (426 lines) | stat: -rw-r--r-- 18,939 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
// --------------------------------------------------------------------------
//                   OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of any author or any participating institution
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: David Wojnar $
// $Authors: David Wojnar $
// --------------------------------------------------------------------------


#include <OpenMS/APPLICATIONS/TOPPBase.h>
#include <OpenMS/MATH/STATISTICS/PosteriorErrorProbabilityModel.h>
#include <OpenMS/FORMAT/IdXMLFile.h>
#include <OpenMS/CONCEPT/Exception.h>
#include <vector>

using namespace OpenMS;
using namespace Math; //PosteriorErrorProbabilityModel
using namespace std;

//-------------------------------------------------------------
//Doxygen docu
//-------------------------------------------------------------

/**
    @page TOPP_IDPosteriorErrorProbability IDPosteriorErrorProbability

    @brief  Tool to estimate the probability of peptide hits to be incorrectly assigned.

    <CENTER>
    <table>
        <tr>
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> potential predecessor tools </td>
            <td VALIGN="middle" ROWSPAN=2> \f$ \longrightarrow \f$ IDPosteriorErrorProbability \f$ \longrightarrow \f$</td>
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> potential successor tools </td>
        </tr>
        <tr>
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_MascotAdapter (or other ID engines) </td>
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_ConsensusID </td>
        </tr>
    </table>
    </CENTER>

    @experimental This tool has not been tested thoroughly and might behave not as expected!

    By default an estimation is performed using the (inverse) Gumbel distribution for incorrectly assigned sequences
    and a Gaussian distribution for correctly assigned sequences. The probabilities are calculated by using Bayes' law, similar to PeptideProphet.
    Alternatively, a second Gaussian distribution can be used for incorrectly assigned sequences.
    At the moment, IDPosteriorErrorProbability is able to handle X!Tandem, Mascot, MyriMatch and OMSSA scores.

  No target/decoy information needs to be provided, since the model fits are done on the mixed distribution.

    In order to validate the computed probabilities one can adjust the fit_algorithm subsection.

    There are three parameters for the plot:
    The parameter 'output_plots' is by default false. If set to true the plot will be created.
    The scores are plotted in form of bins. Each bin represents a set of scores in a range of (highest_score - smallest_score)/number_of_bins (if all scores have positive values).
    The midpoint of the bin is the mean of the scores it represents.
    Finally, the parameter output_name should be used to give the plot a unique name. Two files are created. One with the binned scores and one with all steps of the estimation.
    If top_hits_only is set, only the top hits of each PeptideIndentification are used for the estimation process.
    Additionally, if 'top_hits_only' is set, target_decoy information are available and a False Discovery Rate run was performed before, an additional plot will be plotted with target and decoy bins(output_plot must be true in fit_algorithm subsection).
    A peptide hit is assumed to be a target if its q-value is smaller than fdr_for_targets_smaller.

    Actually, the plots are saved as a gnuplot file. Therefore, to visualize the plots one has to use gnuplot, e.g. gnuplot file_name. This should output a postscript file which contains all steps of the estimation.

    <B>The command line parameters of this tool are:</B>
    @verbinclude TOPP_IDPosteriorErrorProbability.cli
    <B>INI file documentation of this tool:</B>
    @htmlinclude TOPP_IDPosteriorErrorProbability.html

    For the parameters of the algorithm section see the algorithms documentation: @n
        @ref OpenMS::Math::PosteriorErrorProbabilityModel "fit_algorithm" @n

*/

// We do not want this class to show up in the docu:
/// @cond TOPPCLASSES


class TOPPIDPosteriorErrorProbability :
  public TOPPBase
{
public:
  TOPPIDPosteriorErrorProbability() :
    TOPPBase("IDPosteriorErrorProbability", "Estimates probabilities for incorrectly assigned peptide sequences and a set of search engine scores using a mixture model.")
  {

  }

protected:
  void registerOptionsAndFlags_()
  {
    registerInputFile_("in", "<file>", "", "input file ");
    setValidFormats_("in", StringList::create("idXML"));
    registerOutputFile_("out", "<file>", "", "output file ");
    setValidFormats_("out", StringList::create("idXML"));
    registerOutputFile_("output_name", "<file>", "", "gnuplot file as txt");
    setValidFormats_("output_name", StringList::create("txt"));

    registerDoubleOption_("smallest_e_value", "<value>", 10e-20, "This value gives a lower bound to E-Values. It should not be 0, as transformation in a real number (log of E-value) is not possible for certain values then.", false, true);
    registerFlag_("split_charge", "The search engine scores are split by charge if this flag is set. Thus, for each charge state a new model will be computed.");
    registerFlag_("top_hits_only", "If set only the top hits of every PeptideIdentification will be used");
    registerDoubleOption_("fdr_for_targets_smaller", "<value>", 0.05, "Only used, when top_hits_only set. Additionally, target_decoy information should be available. The score_type must be q-value from an previous False Discovery Rate run.", false, true);
    registerFlag_("ignore_bad_data", "If set errors will be written but ignored. Useful for pipelines with many datasets where only a few are bad, but the pipeline should run through.");
    registerFlag_("prob_correct", "If set scores will be calculated as 1-ErrorProbabilities and can be interpreted as probabilities for correct identifications.");
    registerSubsection_("fit_algorithm", "Algorithm parameter subsection");
    addEmptyLine_();
  }

  //there is only one parameter at the moment
  Param getSubsectionDefaults_(const String& /*section*/) const
  {
    PosteriorErrorProbabilityModel pepm;
    return pepm.getParameters();
  }

  double get_score_(String& engine, const PeptideHit& hit)
  {
    if (engine == "OMSSA")
    {
      return (-1) * log10(max(hit.getScore(), smallest_e_value_));
    }
    else if (engine == "MyriMatch")
    {
      //double e_val = exp(-hit.getScore());
      //double score_val = ((-1)* log10(max(e_val,smallest_e_value_)));
      //printf("myri score: %e ; e_val: %e ; score_val: %e\n",hit.getScore(),e_val,score_val);
      //return score_val;
      return hit.getScore();
    }
    else if (engine.compare("XTandem") == 0)
    {
      return (-1) * log10(max((DoubleReal)hit.getMetaValue("E-Value"), smallest_e_value_));
    }
    else if (engine == "MASCOT")
    {
      if (hit.metaValueExists("EValue"))
      {
        return (-1) * log10(max((DoubleReal)hit.getMetaValue("EValue"), smallest_e_value_));
      }
      if (hit.metaValueExists("expect"))
      {
        return (-1) * log10(max((DoubleReal)hit.getMetaValue("expect"), smallest_e_value_));
      }
    }
    else if (engine == "SpectraST")
    {
      return 100 * hit.getScore(); // SpectraST f-val
    }
    else
    {
      throw Exception::UnableToFit(__FILE__, __LINE__, __PRETTY_FUNCTION__, "No parameters for chosen search engine", "The chosen search engine is currently not supported");
    }

    // avoid compiler warning (every code path must return a value, even if there is a throw() somewhere)
    return std::numeric_limits<double>::max();
  }

  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------

    String inputfile_name = getStringOption_("in");
    String outputfile_name = getStringOption_("out");
    smallest_e_value_ = getDoubleOption_("smallest_e_value");
    Param fit_algorithm = getParam_().copy("fit_algorithm:", true);
    bool split_charge = getFlag_("split_charge");
    bool top_hits_only = getFlag_("top_hits_only");
    DoubleReal fdr_for_targets_smaller = getDoubleOption_("fdr_for_targets_smaller");
    bool target_decoy_available = false;
    bool ignore_bad_data = getFlag_("ignore_bad_data");
    bool prob_correct = getFlag_("prob_correct");
    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------
    IdXMLFile file;
    vector<ProteinIdentification> protein_ids;
    vector<PeptideIdentification> peptide_ids;
    file.load(inputfile_name, protein_ids, peptide_ids);
    vector<double> scores;
    vector<double> decoy;
    vector<double> target;
    vector<Int> charges;
    PosteriorErrorProbabilityModel PEP_model;
    PEP_model.setParameters(fit_algorithm);
    StringList search_engines = StringList::create("XTandem,OMSSA,MASCOT,SpectraST,MyriMatch");
    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------
    if (split_charge)
    {
      for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
      {
        vector<PeptideHit> hits = it->getHits();
        for (std::vector<PeptideHit>::iterator  hit  = hits.begin(); hit < hits.end(); ++hit)
        {
          if (charges.end() == find(charges.begin(), charges.end(), hit->getCharge()))
          {
            charges.push_back(hit->getCharge());
          }
        }
      }
      if (charges.empty())
      {
        throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "no charges found!");
      }
    }
    for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
    {
      if (!it->getHits().empty())
      {
        target_decoy_available = (it->getScoreType() == "q-value" &&  it->getHits()[0].getMetaValue("target_decoy") != DataValue::EMPTY);
        break;
      }
    }

    vector<Int>::iterator charge = charges.begin(); // charges can be empty, no problem if split_charge is not set
    if (split_charge && charges.empty())
    {
      throw Exception::Precondition(__FILE__, __LINE__, __PRETTY_FUNCTION__, "split_charge is set and the list of charge states is empty but should not be!");
    }
    map<String, vector<vector<double> > > all_scores;
    char splitter = ','; //to split the engine from the charge state later on
    do
    {
      for (StringList::iterator engine = search_engines.begin(); engine < search_engines.end(); ++engine)
      {
        for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter)
        {
          String searchengine_toUpper =  prot_iter->getSearchEngine();
          searchengine_toUpper.toUpper();
          if (*engine == prot_iter->getSearchEngine() || *engine == searchengine_toUpper)
          {
            for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
            {
              if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0)
              {
                vector<PeptideHit> hits = it->getHits();
                if (top_hits_only)
                {
                  if (!hits.empty() && (!split_charge || hits[0].getCharge() == *charge))
                  {
                    scores.push_back(get_score_(*engine, hits[0]));
                    if (target_decoy_available)
                    {
                      if (hits[0].getScore() < fdr_for_targets_smaller)
                      {
                        target.push_back(get_score_(*engine, hits[0]));
                      }
                      else
                      {
                        decoy.push_back(get_score_(*engine, hits[0]));
                      }
                    }
                  }
                }
                else
                {
                  for (std::vector<PeptideHit>::iterator  hit  = hits.begin(); hit < hits.end(); ++hit)
                  {
                    if (!split_charge || hit->getCharge() == *charge)
                    {
                      scores.push_back(get_score_(*engine, *hit));
                    }
                  }
                }
              }
            }
          }
        }
        if (scores.size() > 2)
        {
          vector<vector<double> > tmp;
          tmp.push_back(scores);
          tmp.push_back(target);
          tmp.push_back(decoy);
          if (split_charge)
          {
            String engine_with_charge_state = *engine + String(splitter) + String(*charge);
            all_scores.insert(make_pair(engine_with_charge_state, tmp));
          }
          else
          {
            all_scores.insert(make_pair(*engine, tmp));
          }
        }

        scores.clear();
        target.clear();
        decoy.clear();
      }

      if (split_charge) ++charge;

    }
    while (charge < charges.end());

    if (all_scores.empty())
    {
      writeLog_("No data collected. Check whether search engine is supported.");
      if (!ignore_bad_data) return INPUT_FILE_EMPTY;
    }
    for (map<String, vector<vector<double> > >::iterator it = all_scores.begin(); it != all_scores.end(); ++it)
    {
      vector<String> engine_info;
      it->first.split(splitter, engine_info);
      String engine = engine_info[0];
      Int charge = -1;
      if (engine_info.size() == 2)
      {
        charge = engine_info[1].toInt();
      }
      if (split_charge)
      {
        String output_name  = fit_algorithm.getValue("output_name");
        fit_algorithm.setValue("output_name", output_name + "_charge_" + String(charge), "...", StringList::create("advanced,output file"));
        PEP_model.setParameters(fit_algorithm);
      }

      const bool return_value = PEP_model.fit(it->second[0]);
      if (!return_value) writeLog_("unable to fit data. Algorithm did not run through for the following search engine: " + engine);
      if (!return_value && !ignore_bad_data) return UNEXPECTED_RESULT;

      if (return_value)
      {
        //plot target_decoy
        if (target_decoy_available && it->second[0].size() > 0)
        {
          PEP_model.plotTargetDecoyEstimation(it->second[1], it->second[2]); //target, decoy
        }

        bool unable_to_fit_data = true;
        bool data_might_not_be_well_fit = true;
        for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter)
        {
          String searchengine_toUpper =  prot_iter->getSearchEngine();
          searchengine_toUpper.toUpper();

          if (engine == prot_iter->getSearchEngine() || engine == searchengine_toUpper)
          {
            for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
            {
              if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0)
              {
                String score_type = it->getScoreType() + "_score";
                vector<PeptideHit> hits = it->getHits();
                for (std::vector<PeptideHit>::iterator  hit  = hits.begin(); hit < hits.end(); ++hit)
                {
                  if (!split_charge || hit->getCharge() == charge)
                  {
                    DoubleReal score;
                    hit->setMetaValue(score_type, hit->getScore());
                    score = PEP_model.computeProbability(get_score_(engine, *hit));
                    if (score > 0 && score < 1) unable_to_fit_data = false;  //only if all it->second[0] are 0 or 1 unable_to_fit_data stays true
                    if (score > 0.2 && score < 0.8) data_might_not_be_well_fit = false;  //same as above
                    hit->setScore(score);
                    if (prob_correct)
                    {
                      hit->setScore(1 - score);
                    }
                    else
                    {
                      hit->setScore(score);
                    }
                  }
                }
                it->setHits(hits);
              }
              it->setScoreType("Posterior Error Probability");
              it->setHigherScoreBetter(false);
            }
          }
        }
        if (unable_to_fit_data) writeLog_(String("unable to fit data for search engine: ") + engine);
        if (unable_to_fit_data && !ignore_bad_data) return UNEXPECTED_RESULT;

        if (data_might_not_be_well_fit) writeLog_(String("data might not be well fitted for search engine: ") + engine);
      }
    }
    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------
    file.store(outputfile_name, protein_ids, peptide_ids);
    return EXECUTION_OK;
  }

  //Used in several functions
  DoubleReal smallest_e_value_;
};


int main(int argc, const char** argv)
{
  TOPPIDPosteriorErrorProbability tool;

  return tool.main(argc, argv);
}

/// @endcond