1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426
|
// --------------------------------------------------------------------------
// OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of any author or any participating institution
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: David Wojnar $
// $Authors: David Wojnar $
// --------------------------------------------------------------------------
#include <OpenMS/APPLICATIONS/TOPPBase.h>
#include <OpenMS/MATH/STATISTICS/PosteriorErrorProbabilityModel.h>
#include <OpenMS/FORMAT/IdXMLFile.h>
#include <OpenMS/CONCEPT/Exception.h>
#include <vector>
using namespace OpenMS;
using namespace Math; //PosteriorErrorProbabilityModel
using namespace std;
//-------------------------------------------------------------
//Doxygen docu
//-------------------------------------------------------------
/**
@page TOPP_IDPosteriorErrorProbability IDPosteriorErrorProbability
@brief Tool to estimate the probability of peptide hits to be incorrectly assigned.
<CENTER>
<table>
<tr>
<td ALIGN = "center" BGCOLOR="#EBEBEB"> potential predecessor tools </td>
<td VALIGN="middle" ROWSPAN=2> \f$ \longrightarrow \f$ IDPosteriorErrorProbability \f$ \longrightarrow \f$</td>
<td ALIGN = "center" BGCOLOR="#EBEBEB"> potential successor tools </td>
</tr>
<tr>
<td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_MascotAdapter (or other ID engines) </td>
<td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_ConsensusID </td>
</tr>
</table>
</CENTER>
@experimental This tool has not been tested thoroughly and might behave not as expected!
By default an estimation is performed using the (inverse) Gumbel distribution for incorrectly assigned sequences
and a Gaussian distribution for correctly assigned sequences. The probabilities are calculated by using Bayes' law, similar to PeptideProphet.
Alternatively, a second Gaussian distribution can be used for incorrectly assigned sequences.
At the moment, IDPosteriorErrorProbability is able to handle X!Tandem, Mascot, MyriMatch and OMSSA scores.
No target/decoy information needs to be provided, since the model fits are done on the mixed distribution.
In order to validate the computed probabilities one can adjust the fit_algorithm subsection.
There are three parameters for the plot:
The parameter 'output_plots' is by default false. If set to true the plot will be created.
The scores are plotted in form of bins. Each bin represents a set of scores in a range of (highest_score - smallest_score)/number_of_bins (if all scores have positive values).
The midpoint of the bin is the mean of the scores it represents.
Finally, the parameter output_name should be used to give the plot a unique name. Two files are created. One with the binned scores and one with all steps of the estimation.
If top_hits_only is set, only the top hits of each PeptideIndentification are used for the estimation process.
Additionally, if 'top_hits_only' is set, target_decoy information are available and a False Discovery Rate run was performed before, an additional plot will be plotted with target and decoy bins(output_plot must be true in fit_algorithm subsection).
A peptide hit is assumed to be a target if its q-value is smaller than fdr_for_targets_smaller.
Actually, the plots are saved as a gnuplot file. Therefore, to visualize the plots one has to use gnuplot, e.g. gnuplot file_name. This should output a postscript file which contains all steps of the estimation.
<B>The command line parameters of this tool are:</B>
@verbinclude TOPP_IDPosteriorErrorProbability.cli
<B>INI file documentation of this tool:</B>
@htmlinclude TOPP_IDPosteriorErrorProbability.html
For the parameters of the algorithm section see the algorithms documentation: @n
@ref OpenMS::Math::PosteriorErrorProbabilityModel "fit_algorithm" @n
*/
// We do not want this class to show up in the docu:
/// @cond TOPPCLASSES
class TOPPIDPosteriorErrorProbability :
public TOPPBase
{
public:
TOPPIDPosteriorErrorProbability() :
TOPPBase("IDPosteriorErrorProbability", "Estimates probabilities for incorrectly assigned peptide sequences and a set of search engine scores using a mixture model.")
{
}
protected:
void registerOptionsAndFlags_()
{
registerInputFile_("in", "<file>", "", "input file ");
setValidFormats_("in", StringList::create("idXML"));
registerOutputFile_("out", "<file>", "", "output file ");
setValidFormats_("out", StringList::create("idXML"));
registerOutputFile_("output_name", "<file>", "", "gnuplot file as txt");
setValidFormats_("output_name", StringList::create("txt"));
registerDoubleOption_("smallest_e_value", "<value>", 10e-20, "This value gives a lower bound to E-Values. It should not be 0, as transformation in a real number (log of E-value) is not possible for certain values then.", false, true);
registerFlag_("split_charge", "The search engine scores are split by charge if this flag is set. Thus, for each charge state a new model will be computed.");
registerFlag_("top_hits_only", "If set only the top hits of every PeptideIdentification will be used");
registerDoubleOption_("fdr_for_targets_smaller", "<value>", 0.05, "Only used, when top_hits_only set. Additionally, target_decoy information should be available. The score_type must be q-value from an previous False Discovery Rate run.", false, true);
registerFlag_("ignore_bad_data", "If set errors will be written but ignored. Useful for pipelines with many datasets where only a few are bad, but the pipeline should run through.");
registerFlag_("prob_correct", "If set scores will be calculated as 1-ErrorProbabilities and can be interpreted as probabilities for correct identifications.");
registerSubsection_("fit_algorithm", "Algorithm parameter subsection");
addEmptyLine_();
}
//there is only one parameter at the moment
Param getSubsectionDefaults_(const String& /*section*/) const
{
PosteriorErrorProbabilityModel pepm;
return pepm.getParameters();
}
double get_score_(String& engine, const PeptideHit& hit)
{
if (engine == "OMSSA")
{
return (-1) * log10(max(hit.getScore(), smallest_e_value_));
}
else if (engine == "MyriMatch")
{
//double e_val = exp(-hit.getScore());
//double score_val = ((-1)* log10(max(e_val,smallest_e_value_)));
//printf("myri score: %e ; e_val: %e ; score_val: %e\n",hit.getScore(),e_val,score_val);
//return score_val;
return hit.getScore();
}
else if (engine.compare("XTandem") == 0)
{
return (-1) * log10(max((DoubleReal)hit.getMetaValue("E-Value"), smallest_e_value_));
}
else if (engine == "MASCOT")
{
if (hit.metaValueExists("EValue"))
{
return (-1) * log10(max((DoubleReal)hit.getMetaValue("EValue"), smallest_e_value_));
}
if (hit.metaValueExists("expect"))
{
return (-1) * log10(max((DoubleReal)hit.getMetaValue("expect"), smallest_e_value_));
}
}
else if (engine == "SpectraST")
{
return 100 * hit.getScore(); // SpectraST f-val
}
else
{
throw Exception::UnableToFit(__FILE__, __LINE__, __PRETTY_FUNCTION__, "No parameters for chosen search engine", "The chosen search engine is currently not supported");
}
// avoid compiler warning (every code path must return a value, even if there is a throw() somewhere)
return std::numeric_limits<double>::max();
}
ExitCodes main_(int, const char**)
{
//-------------------------------------------------------------
// parsing parameters
//-------------------------------------------------------------
String inputfile_name = getStringOption_("in");
String outputfile_name = getStringOption_("out");
smallest_e_value_ = getDoubleOption_("smallest_e_value");
Param fit_algorithm = getParam_().copy("fit_algorithm:", true);
bool split_charge = getFlag_("split_charge");
bool top_hits_only = getFlag_("top_hits_only");
DoubleReal fdr_for_targets_smaller = getDoubleOption_("fdr_for_targets_smaller");
bool target_decoy_available = false;
bool ignore_bad_data = getFlag_("ignore_bad_data");
bool prob_correct = getFlag_("prob_correct");
//-------------------------------------------------------------
// reading input
//-------------------------------------------------------------
IdXMLFile file;
vector<ProteinIdentification> protein_ids;
vector<PeptideIdentification> peptide_ids;
file.load(inputfile_name, protein_ids, peptide_ids);
vector<double> scores;
vector<double> decoy;
vector<double> target;
vector<Int> charges;
PosteriorErrorProbabilityModel PEP_model;
PEP_model.setParameters(fit_algorithm);
StringList search_engines = StringList::create("XTandem,OMSSA,MASCOT,SpectraST,MyriMatch");
//-------------------------------------------------------------
// calculations
//-------------------------------------------------------------
if (split_charge)
{
for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
{
vector<PeptideHit> hits = it->getHits();
for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit)
{
if (charges.end() == find(charges.begin(), charges.end(), hit->getCharge()))
{
charges.push_back(hit->getCharge());
}
}
}
if (charges.empty())
{
throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "no charges found!");
}
}
for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
{
if (!it->getHits().empty())
{
target_decoy_available = (it->getScoreType() == "q-value" && it->getHits()[0].getMetaValue("target_decoy") != DataValue::EMPTY);
break;
}
}
vector<Int>::iterator charge = charges.begin(); // charges can be empty, no problem if split_charge is not set
if (split_charge && charges.empty())
{
throw Exception::Precondition(__FILE__, __LINE__, __PRETTY_FUNCTION__, "split_charge is set and the list of charge states is empty but should not be!");
}
map<String, vector<vector<double> > > all_scores;
char splitter = ','; //to split the engine from the charge state later on
do
{
for (StringList::iterator engine = search_engines.begin(); engine < search_engines.end(); ++engine)
{
for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter)
{
String searchengine_toUpper = prot_iter->getSearchEngine();
searchengine_toUpper.toUpper();
if (*engine == prot_iter->getSearchEngine() || *engine == searchengine_toUpper)
{
for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
{
if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0)
{
vector<PeptideHit> hits = it->getHits();
if (top_hits_only)
{
if (!hits.empty() && (!split_charge || hits[0].getCharge() == *charge))
{
scores.push_back(get_score_(*engine, hits[0]));
if (target_decoy_available)
{
if (hits[0].getScore() < fdr_for_targets_smaller)
{
target.push_back(get_score_(*engine, hits[0]));
}
else
{
decoy.push_back(get_score_(*engine, hits[0]));
}
}
}
}
else
{
for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit)
{
if (!split_charge || hit->getCharge() == *charge)
{
scores.push_back(get_score_(*engine, *hit));
}
}
}
}
}
}
}
if (scores.size() > 2)
{
vector<vector<double> > tmp;
tmp.push_back(scores);
tmp.push_back(target);
tmp.push_back(decoy);
if (split_charge)
{
String engine_with_charge_state = *engine + String(splitter) + String(*charge);
all_scores.insert(make_pair(engine_with_charge_state, tmp));
}
else
{
all_scores.insert(make_pair(*engine, tmp));
}
}
scores.clear();
target.clear();
decoy.clear();
}
if (split_charge) ++charge;
}
while (charge < charges.end());
if (all_scores.empty())
{
writeLog_("No data collected. Check whether search engine is supported.");
if (!ignore_bad_data) return INPUT_FILE_EMPTY;
}
for (map<String, vector<vector<double> > >::iterator it = all_scores.begin(); it != all_scores.end(); ++it)
{
vector<String> engine_info;
it->first.split(splitter, engine_info);
String engine = engine_info[0];
Int charge = -1;
if (engine_info.size() == 2)
{
charge = engine_info[1].toInt();
}
if (split_charge)
{
String output_name = fit_algorithm.getValue("output_name");
fit_algorithm.setValue("output_name", output_name + "_charge_" + String(charge), "...", StringList::create("advanced,output file"));
PEP_model.setParameters(fit_algorithm);
}
const bool return_value = PEP_model.fit(it->second[0]);
if (!return_value) writeLog_("unable to fit data. Algorithm did not run through for the following search engine: " + engine);
if (!return_value && !ignore_bad_data) return UNEXPECTED_RESULT;
if (return_value)
{
//plot target_decoy
if (target_decoy_available && it->second[0].size() > 0)
{
PEP_model.plotTargetDecoyEstimation(it->second[1], it->second[2]); //target, decoy
}
bool unable_to_fit_data = true;
bool data_might_not_be_well_fit = true;
for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter)
{
String searchengine_toUpper = prot_iter->getSearchEngine();
searchengine_toUpper.toUpper();
if (engine == prot_iter->getSearchEngine() || engine == searchengine_toUpper)
{
for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it)
{
if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0)
{
String score_type = it->getScoreType() + "_score";
vector<PeptideHit> hits = it->getHits();
for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit)
{
if (!split_charge || hit->getCharge() == charge)
{
DoubleReal score;
hit->setMetaValue(score_type, hit->getScore());
score = PEP_model.computeProbability(get_score_(engine, *hit));
if (score > 0 && score < 1) unable_to_fit_data = false; //only if all it->second[0] are 0 or 1 unable_to_fit_data stays true
if (score > 0.2 && score < 0.8) data_might_not_be_well_fit = false; //same as above
hit->setScore(score);
if (prob_correct)
{
hit->setScore(1 - score);
}
else
{
hit->setScore(score);
}
}
}
it->setHits(hits);
}
it->setScoreType("Posterior Error Probability");
it->setHigherScoreBetter(false);
}
}
}
if (unable_to_fit_data) writeLog_(String("unable to fit data for search engine: ") + engine);
if (unable_to_fit_data && !ignore_bad_data) return UNEXPECTED_RESULT;
if (data_might_not_be_well_fit) writeLog_(String("data might not be well fitted for search engine: ") + engine);
}
}
//-------------------------------------------------------------
// writing output
//-------------------------------------------------------------
file.store(outputfile_name, protein_ids, peptide_ids);
return EXECUTION_OK;
}
//Used in several functions
DoubleReal smallest_e_value_;
};
int main(int argc, const char** argv)
{
TOPPIDPosteriorErrorProbability tool;
return tool.main(argc, argv);
}
/// @endcond
|