File: blast_io_context.h

package info (click to toggle)
seqan2 2.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 228,748 kB
  • sloc: cpp: 257,602; ansic: 91,967; python: 8,326; sh: 1,056; xml: 570; makefile: 229; awk: 51; javascript: 21
file content (284 lines) | stat: -rw-r--r-- 12,458 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
// ==========================================================================
//                 SeqAn - The Library for Sequence Analysis
// ==========================================================================
// Copyright (c) 2006-2026, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
//       its contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
// ==========================================================================
// This file contains the BlastIOContext's code
// ==========================================================================

#ifndef SEQAN_BLAST_BLAST_IO_CONTEXT_H__
#define SEQAN_BLAST_BLAST_IO_CONTEXT_H__

namespace seqan2
{

// ============================================================================
// Forwards
// ============================================================================

// Declaration only (not defined in this header); BlastIOContext names it as the type of its
// scoringScheme member.
template <typename TScore>
struct BlastScoringScheme;

// ============================================================================
// Metafunctions
// ============================================================================

// ----------------------------------------------------------------------------
// Mfn BlastIOContextStringType_
// ----------------------------------------------------------------------------

// Maps a context type to the string type that the context uses for its text members and buffers.
// This generic definition yields std::string for every TContext.
template <typename TContext>
struct BlastIOContextStringType_
{
    using Type = std::string;
};

// ============================================================================
// Tags, Classes, Enums
// ============================================================================

// ----------------------------------------------------------------------------
// Class BlastIOContext
// ----------------------------------------------------------------------------

/*!
 * @class BlastIOContext
 * @headerfile <seqan/blast.h>
 * @signature template <typename TScore_ = Blosum62,
 * BlastProgram p = BlastProgram::DYNAMIC, BlastTabularSpec h = BlastTabularSpec::DYNAMIC>
 * struct BlastIOContext { ... };
 *
 * @brief An object that holds file global information and buffers for BlastIO
 *
 * @tparam TScore   Type of the @link Score @endlink object used.
 * @tparam p        @link BlastProgram @endlink as compile-time parameter.
 * @tparam h        @link BlastTabularSpec @endlink as compile-time parameter.
 *
 * This is a part of the Blast formatted files. Before writing, some of the context's members should be set; after
 * reading it will contain
 * all information from the file that did not belong to a @link BlastRecord @endlink, e.g. the name of the database.
 * It also contains buffers for internal use.
 *
 * You should re-use this object (i.e. only create it once for
 * every file that you read/write). And you don't need to and should not clear()
 * this, except when restarting IO on a different file.
 *
 * To speed up file writing slightly you can set the value template parameters <tt>p</tt> and/or <tt>h</tt> to something
 * other than ::DYNAMIC at compile-time (e.g. if you know that you will be printing only BLASTX), but then you won't
 * be able to modify these values at run-time. For file reading this is also possible, but usually the
 * added flexibility of automatically detecting these values is preferable.
 *
 * If not explicitly stated otherwise, the member variables are <i>out-parameters</i> of <tt>readHeader()</tt>,
 * <tt>readRecord()</tt> and <tt>readFooter()</tt>, i.e. they are set by these functions; and they are
 * <i>in-parameters</i> to  <tt>writeHeader()</tt>, <tt>writeRecord()</tt> and <tt>writeFooter()</tt>, i.e. they
 * influence these functions' output.
 *
 * See @link BlastTabularFileOut @endlink and @link BlastReportFileOut @endlink for more complete examples of usage.
 */

template <typename TScore_ = Blosum62,
          BlastProgram p = BlastProgram::DYNAMIC,
          BlastTabularSpec h = BlastTabularSpec::DYNAMIC>
struct BlastIOContext
{
    // Re-export of the score type and the string type used by all text members and buffers below.
    using TScore  = TScore_;
    using TString = typename BlastIOContextStringType_<BlastIOContext>::Type;

    /*!
     * @var BlastProgramSelector BlastIOContext::blastProgram;
     * @brief The @link BlastProgram @endlink.
     *
     * @section Remarks
     *
     * Usable exactly like an enum of type @link BlastProgram @endlink. The exception is when the second template
     * parameter was specified to turn this into a compile-time constant; @link BlastProgramSelector @endlink has
     * more information.
     */
    BlastProgramSelector<p> blastProgram;

    /*!
     * @var BlastTabularSpecSelector BlastIOContext::tabularSpec;
     * @brief The @link BlastTabularSpec @endlink.
     *
     * @section Remarks
     *
     * Usable exactly like an enum of type @link BlastTabularSpec @endlink. The exception is when the third template
     * parameter was specified to turn this into a compile-time constant; @link BlastTabularSpecSelector @endlink has
     * more information.
     */
    BlastTabularSpecSelector<h> tabularSpec;

    /*!
     * @var BlastScoringScheme<TScore> BlastIOContext::scoringScheme;
     * @brief The @link BlastScoringScheme @endlink.
     */
    BlastScoringScheme<TScore> scoringScheme;

    /*!
     * @var TString BlastIOContext::versionString;
     * @brief The blast version string.
     *
     * @section Remarks
     *
     * Written to @link BlastReportFileOut @endlink and to @link BlastTabularFileOut @endlink if the context's
     * tabularSpec is set to BlastTabularSpec::COMMENTS. The default value is derived from the emulated blast version
     * and the current SeqAn version.
     * When reading from @link BlastTabularFileIn @endlink the corresponding line is taken from the comment lines
     * (if present).
     */
    TString versionString;

    /*
     * Rebuilds versionString from scratch as
     *   "<program tag> 2.2.26[+] [I/O Module of SeqAn-<major>.<minor>.<patch>, https://www.seqan.de]"
     * where the '+' is only emitted while legacyFormat is not set.
     */
    void _setDefaultVersionString()
    {
        TString & target = versionString;

        // Drop any previous content so that repeated calls do not accumulate text.
        clear(target);

        // Program name followed by the emulated blast version.
        append(target, _programTagToString(blastProgram));
        append(target, " 2.2.26");
        if (!legacyFormat)
        {
            append(target, "+");
        }

        // Identification of the SeqAn release that produced the output.
        append(target, " [I/O Module of SeqAn-");
        append(target, std::to_string(SEQAN_VERSION_MAJOR));
        append(target, '.');
        append(target, std::to_string(SEQAN_VERSION_MINOR));
        append(target, '.');
        append(target, std::to_string(SEQAN_VERSION_PATCH));
        append(target, ", https://www.seqan.de]");
    }

    /*!
     * @var bool BlastIOContext::legacyFormat;
     * @brief Whether to use the legacy format (only @link BlastTabular @endlink).
     *
     * @section Remarks
     *
     * If this flag is set while writing to a @link BlastTabularFileOut @endlink (that has BlastTabularSpec::COMMENTS
     * set), the legacy version of the comments is written. In the legacy format the mismatches column counts all
     * gaps in addition to the mismatches.
     * Note that many other features, like custom fields, are not supported in this format.
     *
     * When reading a @link BlastTabularFileIn @endlink this flag is set automatically based on the comments (if
     * they exist).
     */
    bool legacyFormat = false;

    /*!
     * @var TString BlastIOContext::dbName;
     * @brief Name of the database or path to the file.
     */
    TString dbName;

    /*!
     * @var uint64_t BlastIOContext::dbTotalLength;
     * @brief Summed up sequence length of the database.
     */
    uint64_t dbTotalLength = 0u;

    /*!
     * @var uint64_t BlastIOContext::dbNumberOfSeqs;
     * @brief Number of sequences in the database.
     */
    uint64_t dbNumberOfSeqs = 0u;

    /*!
     * @var StringSet<TString> BlastIOContext::otherLines;
     * @brief A StringSet holding every comment line that could not be
     * interpreted in another way (only @link BlastTabularFileIn @endlink).
     */
    StringSet<TString, Owner<ConcatDirect<>>> otherLines;

    /*!
     * @var std::vector<BlastMatchField::Enum> BlastIOContext::fields;
     * @brief The fields (types of columns) in @link BlastTabular @endlink-formats.
     *
     * @section Remarks
     *
     * This is an <i>out-parameter</i> for:
     * <li> @link BlastTabularFileIn#readRecord @endlink iff tabularSpec == COMMENTS (otherwise it can't be deduced)</li>
     *
     * This is an <i>in-parameter</i> for:
     * <li> @link BlastTabularFileIn#readRecord @endlink if tabularSpec != COMMENTS (specified fields will be expected)</li>
     * <li> @link BlastTabularFileOut#writeRecord @endlink (specified fields will be written)</li>
     *
     * If @link BlastIOContext::ignoreFieldsInComments @endlink is set, this variable is an <i>in-parameter</i> in
     * the first case, too. This variable is ignored in the legacy formats and for non-tabular formats.
     */
    std::vector<typename BlastMatchField<>::Enum> fields = { BlastMatchField<>::Enum::STD };

    /*!
     * @var StringSet<TString> BlastIOContext::fieldsAsStrings;
     * @brief The fields (types of columns) in @link BlastTabular @endlink-formats, but as uninterpreted strings.
     *
     * @section Remarks
     *
     * Helpful if the comment lines do not conform to standards and you need the verbatim column labels, or
     * if you wish to print non-standard column labels (which you shouldn't!).
     */
    StringSet<TString, Owner<ConcatDirect<>>> fieldsAsStrings;

    /*!
     * @var bool BlastIOContext::ignoreFieldsInComments;
     * @brief Use fields as in-parameter for readRecord as well (only @link BlastTabularFileIn @endlink).
     *
     * @section Remarks
     *
     * See @link BlastTabularFileIn#readRecord @endlink. Set this if the comment lines do not
     * conform to standards (so the fields can't be read), but you know that
     * the matches are in the given, e.g. default, format.
     */
    bool ignoreFieldsInComments = false;

    /*!
     * @var StringSet<TString> BlastIOContext::conformancyErrors;
     * @brief Holds non-fatal error messages when reading from @link BlastTabularFileIn @endlink.
     *
     * @section Remarks
     *
     * After a call to @link BlastTabularFileIn#readRecord @endlink this tells you whether the
     * comment lines contained non-fatal parse errors, which usually stem from
     * a file written by a sloppy blast implementation or possibly from a bug in SeqAn.
     * An empty StringSet means that everything is fine.
     */
    StringSet<TString, Owner<ConcatDirect<>>> conformancyErrors;

    // ------- CACHES, BUFFERS and INTERNALS --------- //

    // record counter, maintained internally for the TabularFooter
    uint64_t _numberOfRecords = 0;

    // cached length adjustments used by the blast statistics
    std::unordered_map<uint64_t, uint64_t> _cachedLengthAdjustments;

    // buffers for IO
    TString _lineBuffer; // the line currently being processed
    TString _stringBuffer;
    StringSet<TString, Owner<ConcatDirect<>>> _setBuffer1;
    StringSet<TString, Owner<ConcatDirect<>>> _setBuffer2;
};

}

#endif