File: blast_tabular_lowlevel.h

package info (click to toggle)
seqan2 2.5.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 228,748 kB
  • sloc: cpp: 257,602; ansic: 91,967; python: 8,326; sh: 1,056; xml: 570; makefile: 229; awk: 51; javascript: 21
file content (325 lines) | stat: -rw-r--r-- 12,528 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
// ==========================================================================
//                 SeqAn - The Library for Sequence Analysis
// ==========================================================================
// Copyright (c) 2006-2026, Knut Reinert, FU Berlin
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of Knut Reinert or the FU Berlin nor the names of
//       its contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
// OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
// DAMAGE.
//
// ==========================================================================
// Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
// ==========================================================================
// This file contains routines to read and write BLAST tab-separated output
// ==========================================================================

#ifndef SEQAN_BLAST_BLAST_TABULAR_LOWLEVEL_H_
#define SEQAN_BLAST_BLAST_TABULAR_LOWLEVEL_H_

namespace seqan2
{

// ============================================================================
// Forwards
// ============================================================================

// ============================================================================
// Tags, Classes, Enums
// ============================================================================

// ----------------------------------------------------------------------------
// Class BlastTabularLL
// ----------------------------------------------------------------------------

/*!
 * @class BlastTabularLL
 * @signature typedef Tag<BlastTabularLL_> BlastTabularLL;
 * @headerfile <seqan/blast.h>
 * @brief Low-Level support for Blast Tabular file formats
 *
 * There are three blast format related tags in SeqAn:
 *
 * <li> @link BlastReport @endlink with the FormattedFile output specialization @link BlastReportFileOut @endlink</li>
 * <li> @link BlastTabular @endlink with the FormattedFile output and input specializations
 * @link BlastTabularFileOut @endlink and @link BlastTabularFileIn @endlink</li>
 * <li> @link BlastTabularLL @endlink which provides light-weight, but very basic tabular IO </li>
 *
 * This is the third tag, it offers <b>low-level</b> support for reading and writing NCBI Blast compatible
 * <b>tabular</b> files, <b>without comment lines</b> -- although files with comment lines can be read if the comment
 * lines are skipped. These are the formats that are available in legacy Blast
 * (<tt>blastall</tt> executable) with the parameters <tt>-m 8</tt> and <tt>-m 9</tt> (with comment lines)
 * and in BLAST+ (<tt>blastx</tt>, <tt>blastn</tt>...) with
 * the parameters <tt>-outfmt 6</tt> and <tt>-outfmt 7</tt> respectively.
 *
 * For most situations @link BlastTabular @endlink is more adequate. Use this tag's interface only for quick parsing
 * of matches in a file, e.g. for counting and filtering purposes. This interface does not offer a FormattedFile
 * abstraction and no convenience data structures, it does no transformations on the data.
 *
 * The reference Blast implementation used for developing the SeqAn support is NCBI Blast+ 2.2.26 and
 * NCBI Blast 2.2.26 for the legacy support.
 *
 * @section Input example
 *
 * The following example program extracts the list of matching query-subject-pairs from a blast tabular file and prints
 * it to standard output:
 *
 * @include demos/blast/blast_in_lowlevel.cpp
 *
 * The output looks like this:
 *
 * @include demos/blast/blast_in_lowlevel.cpp.stdout_
 *
 */

// Tag used to select the low-level BLAST tabular overloads in this file.
// BlastTabularLL_ is only forward-declared here; it serves as the parameter of Tag<>.
struct BlastTabularLL_;
using BlastTabularLL = Tag<BlastTabularLL_>;

// ============================================================================
// Functions
// ============================================================================

// ----------------------------------------------------------------------------
// Function onMatch()
// ----------------------------------------------------------------------------

/*!
 * @fn BlastTabularLL#onMatch
 * @brief Returns whether the iterator is on the beginning of a match line.
 * @signature bool onMatch(stream, blastTabularLL)
 * @headerfile seqan/blast.h
 *
 * @param[in] stream            An input iterator over a stream or any fwd-iterator over a string.
 * @param[in] blastTabularLL    The @link BlastTabularLL @endlink tag.
 *
 * @throw IOError On low-level I/O errors.
 *
 * @return bool true or false
 */

// Reports whether `iter` is positioned on the first character of a match line,
// i.e. the input is not exhausted and the current character is not the
// comment marker '#'.
template <typename TFwdIterator>
inline bool
onMatch(TFwdIterator & iter,
        BlastTabularLL const & /*tag*/)
{
    // Guard the dereference below: at end of input there is no character to
    // inspect and therefore no match line either. skipUntilMatch() performs the
    // same atEnd() check before looking at the current character.
    if (atEnd(iter))
        return false;

    return value(iter) != '#';
}

// ----------------------------------------------------------------------------
// Function skipUntilMatch()
// ----------------------------------------------------------------------------

/*!
 * @fn BlastTabularLL#skipUntilMatch
 * @brief Skip arbitrary number of comment lines until the beginning of a match is reached.
 * @signature void skipUntilMatch(stream, blastTabularLL);
 * @headerfile seqan/blast.h
 *
 * @param[in,out] stream         An input iterator over a stream or any fwd-iterator over a string.
 * @param[in]     blastTabularLL The @link BlastTabularLL @endlink tag.
 *
 * @section Remarks
 *
 * This is also part of the low-level IO and not required if you use readRecord.
 * Call this function whenever you are not @link BlastTabularLL#onMatch @endlink, but want to be, e.g. to
 * @link BlastTabularLL#readMatch @endlink.
 *
 * Since it is legal for files to end with comment lines, this function does not throw if end-of-file is reached.
 * You need to check that after calling.
 *
 * @throw IOError On low-level I/O errors.
 * @throw ParseError On high-level file format errors.
 */

// Advances `iter` line by line for as long as the current line starts with the
// comment marker '#'. Returns without throwing once the input is exhausted or a
// line that does not start with '#' has been reached.
template <typename TFwdIterator>
inline void
skipUntilMatch(TFwdIterator & iter,
               BlastTabularLL const & /*tag*/)
{
    for (;;)
    {
        // nothing left to inspect
        if (atEnd(iter))
            return;

        // first character is not '#': this is where a match line begins
        if (value(iter) != '#')
            return;

        // comment line, drop it entirely
        skipLine(iter);
    }
}

// ----------------------------------------------------------------------------
// Function readMatch()
// ----------------------------------------------------------------------------

/*!
 * @fn BlastTabularLL#readMatch
 * @brief Low-level BlastTabular file reading.
 * @signature void readMatch(stream, blastTabularLL, args ...);
 * @headerfile seqan/blast.h
 *
 * @param[in,out] stream         An input iterator over a stream or any fwd-iterator over a string.
 * @param[in]     blastTabularLL The @link BlastTabularLL @endlink tag.
 * @param[out]    args           Arbitrary typed variables able to hold the fields.
 *
 * @section Remarks
 *
 * Use this signature only if you do not or cannot use @link BlastMatch
 * @endlinkes. You can specify any number of arguments that are expected
 * to be able to hold the values in the columns read, i.e. if you pass a
 * double as argument and the value in the column cannot be successfully cast
 * to double, an exception will be thrown. If you want to be on the safe side,
 * you can pass CharStrings and evaluate them in another way.
 *
 * You may specify fewer columns than are available in the file; only the first
 * n columns are read and the remaining ones are discarded.
 *
 * No transformations are made on the data, e.g. the positions are still
 * one-indexed and flipped for reverse strand matches.
 *
 * See @link BlastTabularLL @endlink for an example of low-level IO.
 *
 * @throw IOError On low-level I/O errors.
 * @throw ParseError On high-level file format errors.
 */

// arbitrary columns
template <typename TTarget>
inline SEQAN_FUNC_ENABLE_IF(IsSequence<TTarget>)
_assignOrCast(TTarget & target, std::string const & source)
{
    assign(target, source);
}

// Overload enabled for targets modelling NumberConcept: the column text is
// converted with lexicalCast<>() and the result is stored in the target.
template <typename TNumber>
inline SEQAN_FUNC_ENABLE_IF(Is<NumberConcept<TNumber>>)
_assignOrCast(TNumber & target, std::string const & source)
{
    TNumber const parsed = lexicalCast<TNumber>(source);
    target = parsed;
}

template <typename TFwdIterator,
          typename TArg>
inline void
_readMatchImplBlastTab(TFwdIterator & iter,
                       TArg & arg)
{
    static std::string buffer;
    clear(buffer);

    readUntil(buffer, iter, OrFunctor<IsTab,IsNewline>());
    _assignOrCast(arg, buffer);

    // as this is the last requested field, go to beginning of next line
    skipLine(iter);
}

template <typename TFwdIterator,
          typename TArg,
          typename... TArgs>
inline void
_readMatchImplBlastTab(TFwdIterator & iter,
                       TArg & arg,
                       TArgs & ... args)
{
    static std::string buffer;
    clear(buffer);

    readUntil(buffer, iter, IsTab());
    skipOne(iter, IsTab());
    _assignOrCast(arg, buffer);

    // recurse to next argument
    _readMatchImplBlastTab(iter, args...);
}

// custom arguments
template <typename TFwdIterator,
          typename... TArgs>
inline void
readMatch(TFwdIterator & iter,
           BlastTabularLL const &,
           TArgs & ... args)
{
    // comment lines should have been read or skipped
    if (SEQAN_UNLIKELY(!onMatch(iter, BlastTabularLL())))
        SEQAN_THROW(ParseError("ERROR: Not on beginning of Match (you should have skipped comments)."));

    _readMatchImplBlastTab(iter, args...);
}

// ----------------------------------------------------------------------------
// Function writeMatch()
// ----------------------------------------------------------------------------

/*!
 * @fn BlastTabularLL#writeMatch
 * @headerfile seqan/blast.h
 * @brief Low-level file-writing for blast tabular formats
 * @signature void writeMatch(stream, blastTabularLL, columns...)
 *
 * @section Remarks
 *
 * This is a very light-weight alternative to @link BlastTabularFileOut#writeRecord @endlink. It doesn't require
 * @link BlastMatch @endlinkes, @link BlastRecord @endlinks or the use of @link FormattedFile @endlink.
 * It supports an arbitrary number of arbitrarily typed columns to be printed.
 *
 * Use this only if you do not require comment lines and you are prepared to do all transformations on the data
 * yourself, i.e. this function does none of the match adjustments mentioned in
 * @link BlastTabularFileOut#writeRecord @endlink.
 *
 * @param[in,out] stream         The file to write to (FILE, fstream, @link OutputStreamConcept @endlink ...).
 * @param[in]     blastTabularLL The @link BlastTabularLL @endlink tag.
 * @param[in]     columns...     Any number of printable parameters.
 *
 * @throw IOError On low-level I/O errors.
 */

// Recursion anchor of _writeFields(): no columns are left, nothing is written.
template <typename TFwdIterator>
inline void
_writeFields(TFwdIterator & /*stream*/,
             BlastTabularLL const & /*tag*/)
{}

// Writes a tab followed by `field1` and then recurses for the remaining columns.
template <typename TFwdIterator, typename TField, typename... TFields>
inline void
_writeFields(TFwdIterator & stream,
             BlastTabularLL const & tag,
             TField const & field1,
             TFields const & ... fields)
{
    // every column handled here is preceded by its separator
    write(stream, '\t');
    write(stream, field1);

    // hand the remaining columns on; ends in the overload without field arguments
    _writeFields(stream, tag, fields...);
}

// Writes one match line from an arbitrary number of arbitrarily typed columns:
// the columns are separated by tabs and the line is terminated by '\n'.
template <typename TFwdIterator, typename TField, typename... TFields>
inline void
writeMatch(TFwdIterator & stream,
           BlastTabularLL const & tag,
           TField const & field1,
           TFields const & ... fields)
{
    // the first column carries no leading separator ...
    write(stream, field1);
    // ... every following column is written by _writeFields() with a tab in front
    _writeFields(stream, tag, fields...);

    write(stream, '\n');
}

}

#endif // SEQAN_BLAST_BLAST_TABULAR_LOWLEVEL_H_