File: TransitionPQPFile.h

package info (click to toggle)
openms 2.6.0%2Bcleaned1-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 783,124 kB
  • sloc: cpp: 526,887; xml: 142,287; ansic: 54,252; python: 11,640; php: 2,454; sh: 1,137; ruby: 529; yacc: 403; perl: 85; lex: 74; makefile: 60
file content (280 lines) | stat: -rw-r--r-- 15,627 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// --------------------------------------------------------------------------
//                   OpenMS -- Open-Source Mass Spectrometry
// --------------------------------------------------------------------------
// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
// ETH Zurich, and Freie Universitaet Berlin 2002-2020.
//
// This software is released under a three-clause BSD license:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of any author or any participating institution
//    may be used to endorse or promote products derived from this software
//    without specific prior written permission.
// For a full list of authors, refer to the file AUTHORS.
// --------------------------------------------------------------------------
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// --------------------------------------------------------------------------
// $Maintainer: George Rosenberger $
// $Authors: George Rosenberger, Hannes Roest $
// --------------------------------------------------------------------------

#pragma once

#include <OpenMS/ANALYSIS/OPENSWATH/TransitionTSVFile.h>

#include <boost/range/algorithm.hpp>
#include <boost/range/algorithm_ext/erase.hpp>
#include <iostream>

namespace OpenMS
{

  /**
      @brief This class supports reading and writing of PQP files. 

      The PQP files are SQLite databases consisting of several tables
      representing the data contained in TraML files. For another file format that stores transitions, see also
      TransitionTSVFile.

      This class can convert TraML and PQP files into each other.

      <h2> The file format has the following tables: </h2>

      Genes and proteins are described by a primary key as well as a
      human-readable gene name or protein accession key.
      <table border="0"><tr><td>
        <table>
          <tr> <th BGCOLOR="#EBEBEB" colspan=3>GENE</th> </tr>
          <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (gene id)</td> </tr>
          <tr> <td BGCOLOR="#EBEBEB">GENE_NAME</td> <td>TEXT</td> <td> Gene name </td> </tr>
          <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy gene (1: decoy, 0: target) </td> </tr>
        </table>
      </td><td valign="top">
        <table>
          <tr> <th BGCOLOR="#EBEBEB" colspan=3>PEPTIDE_GENE_MAPPING</th> </tr>
          <tr> <td BGCOLOR="#EBEBEB">PEPTIDE_ID</td> <td>INT</td> <td> Foreign Key (PEPTIDE.ID)</td> </tr>
          <tr> <td BGCOLOR="#EBEBEB">GENE_ID</td> <td>INT</td> <td> Foreign Key (GENE.ID)</td> </tr>
        </table>
      </td></table>

      <table border="0"><tr><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PROTEIN</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (protein id)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PROTEIN_ACCESSION</td> <td>TEXT</td> <td> Protein accession </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy protein (1: decoy, 0: target) </td> </tr>
      </table>
      </td><td valign="top">
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PEPTIDE_PROTEIN_MAPPING</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PEPTIDE_ID</td> <td>INT</td> <td> Foreign Key (PEPTIDE.ID)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PROTEIN_ID</td> <td>INT</td> <td> Foreign Key (PROTEIN.ID)</td> </tr>
      </table>
      </td></table>

      Peptides are physical analytes that are present in the sample and can carry post-translational modifications (PTMs). They are described by their amino-acid sequence and the modifications they carry:
      <table border="0"><tr><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PEPTIDE</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (peptide id)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">UNMODIFIED_SEQUENCE</td> <td>TEXT</td> <td> Peptide sequence (unmodified) </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">MODIFIED_SEQUENCE</td> <td>TEXT</td> <td> Peptide sequence (modified) <sup>1</sup></td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy peptide (1: decoy, 0: target) </td> </tr>
      </table>
      </td><td valign="top">
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PRECURSOR_PEPTIDE_MAPPING</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PRECURSOR_ID</td> <td>INT</td> <td> Foreign Key (PRECURSOR.ID)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PEPTIDE_ID</td> <td>INT</td> <td> Foreign Key (PEPTIDE.ID)</td> </tr>
      </table>
      </td></table>

      Compounds are generic analytes that are present in the sample (but are not peptides). This is used for small molecules which are described by their molecular formula and the SMILES representation (structural representation):
      <table border="0"><tr><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>COMPOUND</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (compound id)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">COMPOUND_NAME</td> <td>TEXT</td> <td> Compound name (common name of the analyte)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">SUM_FORMULA</td> <td>TEXT</td> <td> Molecular formula of the compound </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">SMILES</td> <td>TEXT</td> <td> SMILES representation</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ADDUCTS</td> <td>TEXT</td> <td> List of adducts for the compound</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy compound (1: decoy, 0: target) </td> </tr>
      </table>
      </td><td valign="top">
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PRECURSOR_COMPOUND_MAPPING</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PRECURSOR_ID</td> <td>INT</td> <td> Foreign Key (PRECURSOR.ID)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">COMPOUND_ID</td> <td>INT</td> <td> Foreign Key (COMPOUND.ID)</td> </tr>
      </table>
      </td></table>

      Precursors are generated upon ionization from peptides or small molecule analytes (compounds) and are described by their charge state, mass-to-charge ratio, retention time and ion mobility drift time:
      <table border="0"><tr><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>PRECURSOR</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (precursor id)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">TRAML_ID</td> <td>TEXT</td> <td> TraML identifiers (maps to the "id=" attribute in TraML)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">GROUP_LABEL</td> <td>TEXT</td> <td> Designates the peptide label group (as defined in MS:1000893) to which the peptide belongs<sup>2</sup></td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PRECURSOR_MZ</td> <td>REAL</td> <td> %Precursor m/z </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">CHARGE</td> <td>TEXT</td> <td> %Precursor charge state </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">LIBRARY_INTENSITY</td> <td>TEXT</td> <td> %Precursor library intensity </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">LIBRARY_RT</td> <td>TEXT</td> <td> Library retention time (RT) </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">LIBRARY_DRIFT_TIME</td> <td>TEXT</td> <td> Library drift time (ion mobility drift time or collisional cross-section) </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy precursor (1: decoy, 0: target) </td> </tr>
      </table>
      </td><td valign="top">
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>TRANSITION_PRECURSOR_MAPPING</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">TRANSITION_ID</td> <td>INT</td> <td> Foreign Key (TRANSITION.ID)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PRECURSOR_ID</td> <td>INT</td> <td> Foreign Key (PRECURSOR.ID)</td> </tr>
      </table>
      </td></table>


      Transitions are generated upon fragmentation from precursors and are described by their charge state and (fragment) mass-to-charge ratio. For peptide fragments, an ion type (e.g. y for y-ions) and an ordinal (e.g. 6 for a y6 ion) can be recorded. Note that detecting transitions are used for precursor detection and will indicate whether a given precursor is present or not in the sample. Use detecting transitions for the top N transitions of a precursor to detect a set of transitions in the sample. Identifying transitions will be used to discriminate different peptidoforms of the same precursor (see <a href="http://openswath.org/en/latest/docs/ipf.html">IPF Workflow</a> for PTM inference).
      <table border="0"><tr><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>TRANSITION</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> Primary Key (transition id)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">TRAML_ID</td> <td>TEXT</td> <td> TraML identifiers (maps to the "id=" attribute in TraML)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PRODUCT_MZ</td> <td>TEXT</td> <td> Fragment ion m/z </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">CHARGE</td> <td>TEXT</td> <td> Fragment ion charge </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">TYPE</td> <td>CHAR</td> <td> Fragment ion type (e.g. "b" or "y")</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ANNOTATION</td> <td>TEXT</td> <td> Fragment ion annotation </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ORDINAL</td> <td>INT</td> <td> Fragment ion ordinal </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DETECTING</td> <td>INT (0 or 1)</td> <td>1: use transition to detect peak group, 0: don't use transition for detection</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">IDENTIFYING</td> <td>INT (0 or 1)</td> <td> 1: use transition for peptidoform inference in the <a href="http://openswath.org/en/latest/docs/ipf.html">IPF Workflow</a>, 0: don't use transition for identification</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">QUANTIFYING</td> <td>INT (0 or 1)</td> <td> 1: use transition to quantify peak group, 0: don't use transition for quantification</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">LIBRARY_INTENSITY</td> <td>REAL</td> <td> Fragment ion library intensity </td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">DECOY</td> <td>INT (0 or 1)</td> <td> Whether this is a decoy transition (1: decoy, 0: target) </td> </tr>
      </table>
      </td></table>

      There is one extra table directly mapping TRANSITION to PEPTIDE which is mainly used for the 
      <a href="http://openswath.org/en/latest/docs/ipf.html">IPF Workflow</a> for PTM inference. It directly maps transitions to peptidoforms 
      (one identification transition can map to multiple peptidoforms):

      <table border="0"><tr><td>
      </td><td>
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>TRANSITION_PEPTIDE_MAPPING</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">TRANSITION_ID</td> <td>INT</td> <td> Foreign Key (TRANSITION.ID)</td> </tr>
        <tr> <td BGCOLOR="#EBEBEB">PEPTIDE_ID</td> <td>INT</td> <td> Foreign Key (PEPTIDE.ID)</td> </tr>
      </table>
      </td></table>

      Another extra table describes the file format version:
      <table>
        <tr> <th BGCOLOR="#EBEBEB" colspan=3>VERSION</th> </tr>
        <tr> <td BGCOLOR="#EBEBEB">ID</td> <td>INT</td> <td> %File Format version </td> </tr>
      </table>

      <p>
      Remarks:
      </p>
      <ul>
        <li>
          1. modifications should be supplied inside the sequence using UniMod
            identifiers or freetext identifiers that are understood by %OpenMS. See also @ref OpenMS::AASequence for more information. For example:
            <ul>
            <li> PEPT(Phosphorylation)IDE(UniMod:27)A </li>
            </ul>
        </li>
        <li>
          2. peptide label groups designate groups of peptides that are isotopically
          modified forms of the same peptide species. For example, the heavy and
          light forms of the same peptide will both be assigned the same peptide
          group label. For example:
            <ul>
            <li> PEPTIDEAK -> gets label "PEPTIDEAK_gr1"  </li>
            <li> PEPTIDEAK[+8] -> gets label "PEPTIDEAK_gr1"  </li>
            <li> PEPT(Phosphorylation)IDEAK -> gets label "PEPTIDEAK_gr2"  </li>
            <li> PEPT(Phosphorylation)IDEAK[+8] -> gets label "PEPTIDEAK_gr2"  </li>
            </ul>
        </li>
      </ul>


      @htmlinclude OpenMS_TransitionPQPFile.parameters

  */
  class OPENMS_DLLAPI TransitionPQPFile : public TransitionTSVFile
  {
  private:

    /**
      @brief Reads a PQP SQLite file into a list of transitions

      @param filename Name of the PQP file to read
      @param transition_list Receives the transitions read from the file
      @param legacy_traml_id Whether legacy TraML IDs should be used (default: false)
    */
    void readPQPInput_(const char* filename,
                       std::vector<TSVTransition>& transition_list,
                       bool legacy_traml_id = false);

    /**
      @brief Writes the content of a TargetedExperiment to a file

      @param filename Name of the file to write
      @param targeted_exp The targeted experiment that is written out
    */
    void writePQPOutput_(const char* filename,
                         OpenMS::TargetedExperiment& targeted_exp);

  public:

    //@{
    /// Default constructor
    TransitionPQPFile();

    /// Destructor (overrides the base class destructor)
    ~TransitionPQPFile() override;
    //@}

    /**
      @brief Stores a targeted experiment (TraML structure) as a PQP file

      @param filename Name of the PQP file to write
      @param targeted_exp The targeted experiment to store
    */
    void convertTargetedExperimentToPQP(const char* filename,
                                        OpenMS::TargetedExperiment& targeted_exp);

    /**
      @brief Loads a PQP file and builds a targeted experiment (TraML structure) from it

      @param filename Name of the PQP file to read
      @param targeted_exp Receives the resulting targeted experiment
      @param legacy_traml_id Whether legacy TraML IDs should be used (default: false)
    */
    void convertPQPToTargetedExperiment(const char* filename,
                                        OpenMS::TargetedExperiment& targeted_exp,
                                        bool legacy_traml_id = false);

    /**
      @brief Loads a PQP file and builds a targeted experiment (Light transition structure) from it

      @param filename Name of the PQP file to read
      @param targeted_exp Receives the resulting light targeted experiment
      @param legacy_traml_id Whether legacy TraML IDs should be used (default: false)
    */
    void convertPQPToTargetedExperiment(const char* filename,
                                        OpenSwath::LightTargetedExperiment& targeted_exp,
                                        bool legacy_traml_id = false);

  };
}