File: IDMapper.C

package info (click to toggle)
openms 1.11.1-5
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 436,688 kB
  • ctags: 150,907
  • sloc: cpp: 387,126; xml: 71,547; python: 7,764; ansic: 2,626; php: 2,499; sql: 737; ruby: 342; sh: 325; makefile: 128
file content (267 lines) | stat: -rw-r--r-- 14,497 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
// --------------------------------------------------------------------------
//                   OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2013.
//
// This software is released under a three-clause BSD license:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of any author or any participating institution
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: Andreas Bertsch $
// $Authors: Marc Sturm, Hendrik Weisser $
// --------------------------------------------------------------------------

#include <OpenMS/config.h>

#include <OpenMS/FORMAT/IdXMLFile.h>
#include <OpenMS/FORMAT/FeatureXMLFile.h>
#include <OpenMS/FORMAT/ConsensusXMLFile.h>
#include <OpenMS/FORMAT/FileHandler.h>
#include <OpenMS/FORMAT/FileTypes.h>
#include <OpenMS/FORMAT/MzQuantMLFile.h>
#include <OpenMS/METADATA/MSQuantifications.h>

#include <OpenMS/ANALYSIS/ID/IDMapper.h>
#include <OpenMS/APPLICATIONS/TOPPBase.h>
#include <OpenMS/CONCEPT/LogStream.h>

using namespace OpenMS;
using namespace std;

//-------------------------------------------------------------
//Doxygen docu
//-------------------------------------------------------------

/**
    @page TOPP_IDMapper IDMapper

    @brief Assigns protein/peptide identifications to features or consensus features.

    <CENTER>
    <table>
        <tr>
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> potential predecessor tools </td>
            <td VALIGN="middle" ROWSPAN=3> \f$ \longrightarrow \f$ IDMapper \f$ \longrightarrow \f$</td>
            <td ALIGN = "center" BGCOLOR="#EBEBEB"> potential successor tools </td>
        </tr>
        <tr>
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_MascotAdapter (or other ID engines) </td>
            <td VALIGN="middle" ALIGN = "center" ROWSPAN=1> @ref TOPP_ConsensusID </td>
        </tr>
        <tr>
          <td VALIGN="middle" ALIGN="center" ROWSPAN=1> @ref TOPP_IDFilter </td>
          <td VALIGN="middle" ALIGN="center" ROWSPAN=1> @ref TOPP_MapAlignerIdentification </td>
        </tr>
    </table>
    </CENTER>

    The mapping is based on retention times and mass-to-charge values. Roughly, a peptide identification is assigned to a (consensus) feature if its position lies within the boundaries of the feature or close enough to the feature centroid.
    Peptide identifications that don't match anywhere are still recorded in the resulting map, as "unassigned peptides". Protein identifications are annotated to the whole map, i.e. not to any particular (consensus) feature.

    In all cases, tolerance in RT and m/z dimension is applied according to the parameters @p rt_tolerance and @p mz_tolerance. Tolerance is understood as "plus or minus x", so the matching range is actually increased by twice the tolerance value.

    If several features or consensus features overlap the position of a peptide identification (taking the allowed tolerances into account), the identification is annotated to all of them.

    <B>Annotation of feature maps (featureXML input):</B>\n
    If @em all features have at least one convex hull, peptide positions are matched against the bounding boxes of the convex hulls (of individual mass traces, if available) by default.
    If not, the positions of the feature centroids are used. The respective coordinates of the centroids are also used for matching (in place of the corresponding ranges from the bounding boxes)
    if @p feature:use_centroid_rt or @p feature:use_centroid_mz are true.

    <B>Annotation of consensus maps (consensusXML input):</B>\n
    Peptide positions are always matched against centroid positions. By default, the consensus centroids are used. However, if @p consensus:use_subelements is set, the centroids of sub-features are considered instead.
    In this case, a peptide identification is mapped to a consensus feature if any of its sub-features matches.

    <B>The command line parameters of this tool are:</B>
    @verbinclude TOPP_IDMapper.cli
    <B>INI file documentation of this tool:</B>
    @htmlinclude TOPP_IDMapper.html

    On the peptide side, two sources for m/z values are possible (see parameter @p mz_reference): 1. m/z of the precursor of the MS2 spectrum that gave rise to the peptide identification;
    2. theoretical masses computed from the amino acid sequences of peptide hits.
    (When using theoretical masses, make sure that peptide modifications were identified correctly. OpenMS currently "forgets" mass shifts that it can't assign to modifications - if that 
    happens, masses computed from peptide sequences will be off.)

    @deprecated The parameter handling of this tool has been reworked. For greater consistency with other tools, the parameters @p rt_delta and @p mz_delta have been renamed to @p rt_tolerance 
    and @p mz_tolerance. The possible values of the @p mz_reference parameter have also been renamed. The default value of @p mz_tolerance has been increased from 1 ppm to a more realistic 20 ppm.\n
    Most importantly, the @p use_centroids parameter from previous versions has been split into two parameters, @p feature:use_centroid_rt and @p feature:use_centroid_mz. In OpenMS 1.6, peptide
    identifications would be matched only against monoisotopic mass traces of features if @p mz_reference was @p PeptideMass; otherwise, all mass traces would be used. This implicit behaviour has
    been abandoned, you can now explicitly control it with the @p feature:use_centroid_mz parameter. @p feature:use_centroid_mz does not take into account m/z deviations in the monoisotopic mass trace, but this can be compensated by increasing @p mz_tolerance. The new implementation should work correctly even if the monoisotopic mass trace itself was not detected.

*/

// We do not want this class to show up in the docu:
/// @cond TOPPCLASSES

class TOPPIDMapper :
  public TOPPBase
{

public:

  TOPPIDMapper() :
    TOPPBase("IDMapper", "Assigns protein/peptide identifications to features or consensus features.")
  {
  }

protected:

  void registerOptionsAndFlags_()
  {
    registerInputFile_("id", "<file>", "", "Protein/peptide identifications file");
    setValidFormats_("id", StringList::create("idXML"));
    registerInputFile_("in", "<file>", "", "Feature map/consensus map file");
    setValidFormats_("in", StringList::create("featureXML,consensusXML,mzq"));
    registerOutputFile_("out", "<file>", "", "Output file (the format depends on the input file format).");
    setValidFormats_("out", StringList::create("featureXML,consensusXML,mzq"));

    addEmptyLine_();
    IDMapper mapper;
    Param p = mapper.getParameters();
    registerDoubleOption_("rt_tolerance", "<value>", p.getValue("rt_tolerance"), "RT tolerance (in seconds) for the matching of peptide identifications and (consensus) features.\nTolerance is understood as 'plus or minus x', so the matching range increases by twice the given value.", false);
    setMinFloat_("rt_tolerance", 0.0);
    registerDoubleOption_("mz_tolerance", "<value>", p.getValue("mz_tolerance"), "m/z tolerance (in ppm or Da) for the matching of peptide identifications and (consensus) features.\nTolerance is understood as 'plus or minus x', so the matching range increases by twice the given value.", false);
    setMinFloat_("mz_tolerance", 0.0);
    registerStringOption_("mz_measure", "<choice>", p.getEntry("mz_measure").valid_strings[0], "Unit of 'mz_tolerance'.", false);
    setValidStrings_("mz_measure", p.getEntry("mz_measure").valid_strings);
    registerStringOption_("mz_reference", "<choice>", p.getEntry("mz_reference").valid_strings[0], "Source of m/z values for peptide identifications. If 'precursor', the precursor-m/z from the idXML is used. If 'peptide',\nmasses are computed from the sequences of peptide hits; in this case, an identification matches if any of its hits matches.\n('peptide' should be used together with 'feature:use_centroid_mz' to avoid false-positive matches.)", false);
    setValidStrings_("mz_reference", p.getEntry("mz_reference").valid_strings);
    registerFlag_("ignore_charge", "For feature/consensus maps: Assign an ID independently of whether its charge state matches that of the (consensus) feature.");

    addEmptyLine_();
    registerTOPPSubsection_("feature", "Additional options for featureXML input");
    registerFlag_("feature:use_centroid_rt", "Use the RT coordinates of the feature centroids for matching, instead of the RT ranges of the features/mass traces.");
    registerFlag_("feature:use_centroid_mz", "Use the m/z coordinates of the feature centroids for matching, instead of the m/z ranges of the features/mass traces.\n(If you choose 'peptide' as 'mz_reference', you should usually set this flag to avoid false-positive matches.)");

    addEmptyLine_();
    registerTOPPSubsection_("consensus", "Additional options for consensusXML input");
    registerFlag_("consensus:use_subelements", "Match using RT and m/z of sub-features instead of consensus RT and m/z. A consensus feature matches if any of its sub-features matches.");
  }

  ExitCodes main_(int, const char **)
  {
    // LOG_DEBUG << "Starting..." << endl;
    String in = getStringOption_("in");
    FileTypes::Type in_type = FileHandler::getType(in);
    String out = getStringOption_("out");

    //----------------------------------------------------------------
    // load idXML
    //----------------------------------------------------------------
    // LOG_DEBUG << "Loading idXML..." << endl;
    vector<ProteinIdentification> protein_ids;
    vector<PeptideIdentification> peptide_ids;
    String document_id;
    IdXMLFile().load(getStringOption_("id"), protein_ids, peptide_ids, document_id);

    //----------------------------------------------------------------
    //create mapper
    //----------------------------------------------------------------
    // LOG_DEBUG << "Creating mapper..." << endl;
    IDMapper mapper;
    Param p = mapper.getParameters();
    p.setValue("rt_tolerance", getDoubleOption_("rt_tolerance"));
    p.setValue("mz_tolerance", getDoubleOption_("mz_tolerance"));
    p.setValue("mz_measure", getStringOption_("mz_measure"));
    p.setValue("mz_reference", getStringOption_("mz_reference"));
    p.setValue("ignore_charge", getFlag_("ignore_charge") ? "true" : "false");
    mapper.setParameters(p);

    //----------------------------------------------------------------
    // consensusXML
    //----------------------------------------------------------------
    if (in_type == FileTypes::CONSENSUSXML)
    {
      // LOG_DEBUG << "Processing consensus map..." << endl;
      ConsensusXMLFile file;
      ConsensusMap map;
      file.load(in, map);

      bool measure_from_subelements = getFlag_("consensus:use_subelements");

      mapper.annotate(map, peptide_ids, protein_ids, measure_from_subelements);

      //annotate output with data processing info
      addDataProcessing_(map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

      file.store(out, map);
    }

    //----------------------------------------------------------------
    // featureXML
    //----------------------------------------------------------------
    if (in_type == FileTypes::FEATUREXML)
    {
      // LOG_DEBUG << "Processing feature map..." << endl;
      FeatureMap<> map;
      FeatureXMLFile file;
      file.load(in, map);

      mapper.annotate(map, peptide_ids, protein_ids,
                      getFlag_("feature:use_centroid_rt"),
                      getFlag_("feature:use_centroid_mz"));

      //annotate output with data processing info
      addDataProcessing_(map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

      file.store(out, map);
    }

    //----------------------------------------------------------------
    // MzQuantML
    //----------------------------------------------------------------
    if (in_type == FileTypes::MZQUANTML)
    {
      // LOG_DEBUG << "Processing mzq ..." << endl;
      MSQuantifications msq;
      MzQuantMLFile file;
      file.load(in, msq);

      bool measure_from_subelements = getFlag_("consensus:use_subelements");
      for (std::vector<ConsensusMap>::iterator it = msq.getConsensusMaps().begin(); it != msq.getConsensusMaps().end(); ++it)
      {
        mapper.annotate(*it, peptide_ids, protein_ids, measure_from_subelements);
        //annotate output with data processing info
        addDataProcessing_(*it, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));
      }

      //~ writeDebug_(msq.getConsensusMaps().size(),3);
      //~ writeDebug_(msq.getConsensusMaps().back().size(),3);
      //~ writeDebug_(msq.getAnalysisSummary().quant_type_,3);
      file.store(out, msq);
    }

    // LOG_DEBUG << "Done." << endl;
    return EXECUTION_OK;
  }

};


int main(int argc, const char ** argv)
{
  TOPPIDMapper tool;
  return tool.main(argc, argv);
}

/// @endcond