File: fastq_defline_parser.hpp

package info (click to toggle)
sra-sdk 3.2.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 296,076 kB
  • sloc: ansic: 532,876; cpp: 243,000; perl: 9,649; python: 8,978; sh: 7,888; java: 6,253; makefile: 1,148; yacc: 703; xml: 310; lex: 236
file content (213 lines) | stat: -rw-r--r-- 6,810 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#ifndef __CFASTQDEFLINEPARSER_HPP__
#define __CFASTQDEFLINEPARSER_HPP__

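/**
 * @file fastq_defline_parser.hpp
 * @brief Defline parser: matches a FASTQ defline against the registered
 *        defline matchers and populates a CFastqRead from the match.
 *
 */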

#include <cstddef>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <string_view>
#include <vector>
#include "fastq_read.hpp"
#include "fastq_defline_matcher.hpp"
#include "fastq_error.hpp"
#include <spdlog/spdlog.h>

// NOTE(review): a using-directive at header scope leaks namespace std into
// every file that includes this header. It is kept because definitions in this
// header spell std names unqualified (string, string_view, set).
using namespace std;
class CDefLineParser
/// Parse defline into CFastQRead using list of registered defline matcher
{
public:

    typedef std::vector<std::shared_ptr<CDefLineMatcher>> deflinematchers_t;
    /**
     * @brief Construct a new CDefLineParser object
     *
     * Registers a list of supported matchers
     */
    CDefLineParser();

    /**
     * @brief Destroy the CDefLineParser object
     *
     */
    ~CDefLineParser();

    /**
     * @brief Reset internal state
     *
     */
    void Reset();

    /**
     * @brief Parse defline into CFastqRead object
     *
     * @param[in] defline defline string_view to parse
     * @param[in, out] read CFastqRead with populated defline related field
     */
    void Parse(const string_view& defline, CFastqRead& read);

    /**
     * @brief Check if parser can match a given defline
     *
     * @param[in] defline defline string_view to parse
     * @param[in] strict if true Matchall pattern cannot be used, otherwise it will match any defline
     * @return true if defline can be parsed
     * @return false if defline cannot be parsed
     */
    bool Match(const re2::StringPiece& defline, bool strict = false);

    /**
     * @brief Check if defline matches last matched pattern
     * 
     * @param defline 
     * @return true 
     * @return false 
     */
    bool MatchLast(const re2::StringPiece& defline);

    /**
     * @brief Enable MatchAll pattern
     *
     */
    void SetMatchAll();

    /**
     * @brief Get last matched defline type
     *
     * @return const string& defline type
     */
    const string& GetDeflineType() const;

    /**
     * @brief Get last matched platform code
     *
     * @return uint8_t platform code
     */
    uint8_t GetPlatform() const;

    /**
     * @brief Get all defline type there were matched
     *
     * @return const set<string>&
     */
    const set<string>& AllDeflineTypes() const { return mDeflineTypes;}
    const deflinematchers_t& GetDeflineMatchers() const { return mDefLineMatchers; }
private:
    deflinematchers_t mDefLineMatchers; ///< Vector of all registered Defline matchers
    size_t mIndexLastSuccessfulMatch = 0; ///< Index of the last sucessfull matcher
    size_t mAllMatchIndex = -1;           ///< Index of Match everything matcher
    std::set<string> mDeflineTypes;       ///< Set of deflines types processed by this reader
};


/**
 * @brief Construct the parser and register every supported defline matcher.
 *
 * Declared inline because the definition lives in a header: without it,
 * including this header from more than one translation unit produces
 * multiple-definition link errors.
 *
 * Matchers are created with std::make_shared so that no object is leaked if
 * growing the vector throws. Registration order is significant because
 * Match() tries matchers in this order.
 */
inline
CDefLineParser::CDefLineParser()
{
    Reset();
    // The NoMatch matcher must be the first one so that
    // mIndexLastSuccessfulMatch always points to a valid matcher

    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcher_NoMatch>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherBgiNew>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherBgiOld>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNew>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewNoPrefix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithSuffix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithPeriods>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithUnderscores>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldWithSuffix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldColon>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldUnderscore>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldWithSuffix2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldNoPrefix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherLS454>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio3>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio4>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIonTorrent2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIonTorrent>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldBcRn>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldBcOnly>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldRnOnly>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewDataGroup>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore1>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore3>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore3_1>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore5>()); // before Nanopore4, to match fastq-load.py
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore4>());
}

/**
 * @brief Destroy the parser.
 *
 * Nothing to release by hand: the matchers are held through shared_ptr.
 * Declared inline because the definition lives in a header.
 */
inline
CDefLineParser::~CDefLineParser() = default;

void CDefLineParser::SetMatchAll()
{
    mDefLineMatchers.emplace_back(new CDefLineMatcher_AllMatch);
    mAllMatchIndex = mDefLineMatchers.size() - 1;
}


/**
 * @brief Reset internal state.
 *
 * Points the cached "last successful matcher" index back at index 0. The
 * registered matchers and the set of seen defline types are left untouched.
 *
 * Declared inline because the definition lives in a header.
 */
inline
void CDefLineParser::Reset()
{
    mIndexLastSuccessfulMatch = 0;
}

/**
 * @brief Check whether any registered matcher recognizes the defline.
 *
 * The matcher that succeeded last time is tried first. If it fails, the
 * remaining matchers are tried in registration order. When a different
 * matcher succeeds it becomes the cached matcher and its defline type is
 * recorded in mDeflineTypes.
 *
 * In strict mode the match-all matcher is never consulted. This also applies
 * to the cached fast path: previously a strict call returned true whenever
 * the cached matcher happened to be the match-all one.
 *
 * Declared inline because the definition lives in a header.
 *
 * @param[in] defline defline to test
 * @param[in] strict  if true the match-all pattern cannot be used
 * @return true if a permitted matcher recognizes the defline
 * @return false otherwise
 */
inline
bool CDefLineParser::Match(const re2::StringPiece& defline, bool strict)
{
    // Fast path: retry the cached matcher, unless strict mode forbids it
    const bool cachedIsMatchAll = (mIndexLastSuccessfulMatch == mAllMatchIndex);
    if (!(strict && cachedIsMatchAll)
        && mDefLineMatchers[mIndexLastSuccessfulMatch]->Matches(defline)) {
        return true;
    }

    for (size_t i = 0; i < mDefLineMatchers.size(); ++i) {
        if (i == mIndexLastSuccessfulMatch) {
            continue; // already handled by the fast path above
        }
        if (strict && i == mAllMatchIndex) {
            continue; // match-all is not allowed in strict mode
        }
        if (!mDefLineMatchers[i]->Matches(defline)) {
            continue;
        }
        mIndexLastSuccessfulMatch = i;
        mDeflineTypes.insert(mDefLineMatchers[mIndexLastSuccessfulMatch]->Defline());
        //spdlog::info("Current pattern: {}", mDefLineMatchers[mIndexLastSuccessfulMatch]->Defline());
        return true;
    }
    return false;
}

/**
 * @brief Check whether the defline matches the last matched pattern.
 *
 * Only the cached matcher is consulted. The cached index and the set of seen
 * defline types are not modified by this function.
 *
 * Declared inline because the definition lives in a header.
 *
 * @param[in] defline defline to test
 * @return true if the cached matcher recognizes the defline
 * @return false otherwise
 */
inline
bool CDefLineParser::MatchLast(const re2::StringPiece& defline)
{
    return mDefLineMatchers[mIndexLastSuccessfulMatch]->Matches(defline);
}


/**
 * @brief Parse a defline into a CFastqRead object.
 *
 * Runs a non-strict Match() on the defline and, on success, lets the matcher
 * that succeeded fill in the read via GetMatch().
 *
 * Declared inline because the definition lives in a header.
 *
 * @param[in] defline defline to parse
 * @param[in, out] read read that receives the defline related fields
 * @throws fastq_error (code 100) if no matcher recognizes the defline
 */
inline
void CDefLineParser::Parse(const string_view& defline, CFastqRead& read)
{
    if (Match(re2::StringPiece(defline.data(), defline.size()))) {
        mDefLineMatchers[mIndexLastSuccessfulMatch]->GetMatch(read);
        return;
    }
    // VDB-4970: do not include the bad defline's text since it may contain XML-breaking characters
    throw fastq_error(100, "Defline not recognized");
}

/**
 * @brief Get the platform code reported by the last matched matcher.
 *
 * @return uint8_t platform code
 */
inline
uint8_t CDefLineParser::GetPlatform() const
{
    const auto& lastMatcher = mDefLineMatchers[mIndexLastSuccessfulMatch];
    return lastMatcher->GetPlatform();
}

/**
 * @brief Get the defline type reported by the last matched matcher.
 *
 * @return const string& defline type
 */
inline
const string& CDefLineParser::GetDeflineType() const
{
    const auto& lastMatcher = mDefLineMatchers[mIndexLastSuccessfulMatch];
    return lastMatcher->Defline();
}



#endif