1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
|
#ifndef __CFASTQDEFLINEPARSER_HPP__
#define __CFASTQDEFLINEPARSER_HPP__
/**
* @file fastq_defline_parser.hpp
* @brief Defline Parser
*
*/
#include <memory>
#include <set>
#include <string_view>
#include <vector>
#include "fastq_read.hpp"
#include "fastq_defline_matcher.hpp"
#include "fastq_error.hpp"
#include <spdlog/spdlog.h>
using namespace std;
class CDefLineParser
/// Parse defline into CFastQRead using list of registered defline matcher
{
public:
typedef std::vector<std::shared_ptr<CDefLineMatcher>> deflinematchers_t;
/**
* @brief Construct a new CDefLineParser object
*
* Registers a list of supported matchers
*/
CDefLineParser();
/**
* @brief Destroy the CDefLineParser object
*
*/
~CDefLineParser();
/**
* @brief Reset internal state
*
*/
void Reset();
/**
* @brief Parse defline into CFastqRead object
*
* @param[in] defline defline string_view to parse
* @param[in, out] read CFastqRead with populated defline related field
*/
void Parse(const string_view& defline, CFastqRead& read);
/**
* @brief Check if parser can match a given defline
*
* @param[in] defline defline string_view to parse
* @param[in] strict if true Matchall pattern cannot be used, otherwise it will match any defline
* @return true if defline can be parsed
* @return false if defline cannot be parsed
*/
bool Match(const re2::StringPiece& defline, bool strict = false);
/**
* @brief Check if defline matches last matched pattern
*
* @param defline
* @return true
* @return false
*/
bool MatchLast(const re2::StringPiece& defline);
/**
* @brief Enable MatchAll pattern
*
*/
void SetMatchAll();
/**
* @brief Get last matched defline type
*
* @return const string& defline type
*/
const string& GetDeflineType() const;
/**
* @brief Get last matched platform code
*
* @return uint8_t platform code
*/
uint8_t GetPlatform() const;
/**
* @brief Get all defline type there were matched
*
* @return const set<string>&
*/
const set<string>& AllDeflineTypes() const { return mDeflineTypes;}
const deflinematchers_t& GetDeflineMatchers() const { return mDefLineMatchers; }
private:
deflinematchers_t mDefLineMatchers; ///< Vector of all registered Defline matchers
size_t mIndexLastSuccessfulMatch = 0; ///< Index of the last sucessfull matcher
size_t mAllMatchIndex = -1; ///< Index of Match everything matcher
std::set<string> mDeflineTypes; ///< Set of deflines types processed by this reader
};
/**
 * @brief Construct the parser and register every supported defline matcher
 *
 * Matchers are tried by Match() in the order they are registered here.
 * Defined `inline` because this definition lives in a header.
 */
inline CDefLineParser::CDefLineParser()
{
    Reset();
    // The NoMatch matcher should be the first one so that
    // mIndexLastSuccessfulMatch always points to a valid matcher.
    // std::make_shared is used instead of a raw `new` so that nothing leaks
    // if growing the vector throws.
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcher_NoMatch>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherBgiNew>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherBgiOld>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNew>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewNoPrefix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithSuffix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithPeriods>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewWithUnderscores>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldWithSuffix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldColon>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldUnderscore>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldWithSuffix2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaOldNoPrefix>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherLS454>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio3>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherPacBio4>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIonTorrent2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIonTorrent>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldBcRn>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldBcOnly>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineIlluminaOldRnOnly>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherIlluminaNewDataGroup>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore1>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore2>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore3>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore3_1>());
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore5>()); // before Nanopore4, to match fastq-load.py
    mDefLineMatchers.push_back(std::make_shared<CDefLineMatcherNanopore4>());
}
/**
 * @brief Destroy the CDefLineParser object
 *
 * No custom cleanup is performed; the members release themselves.
 * Defined `inline` because this definition lives in a header.
 */
inline CDefLineParser::~CDefLineParser() = default;
void CDefLineParser::SetMatchAll()
{
mDefLineMatchers.emplace_back(new CDefLineMatcher_AllMatch);
mAllMatchIndex = mDefLineMatchers.size() - 1;
}
/**
 * @brief Reset internal state
 *
 * Sets the index of the last successful matcher back to 0.
 * Defined `inline` because this definition lives in a header.
 */
inline void CDefLineParser::Reset()
{
    mIndexLastSuccessfulMatch = 0;
}
/**
 * @brief Check whether any registered matcher accepts a defline
 *
 * The matcher that succeeded last is tried first; after that every other
 * matcher is tried in registration order. When a different matcher succeeds
 * it becomes the new "last successful" matcher and its defline type is added
 * to the set of seen defline types.
 *
 * @param[in] defline defline text to test
 * @param[in] strict  if true, a match produced by the match-all matcher is
 *                    not accepted
 * @return true if a matcher accepted the defline
 * @return false otherwise
 *
 * Defined `inline` because this definition lives in a header.
 */
inline bool CDefLineParser::Match(const re2::StringPiece& defline, bool strict)
{
    // In strict mode the match-all matcher must not produce a positive result,
    // not even through the "last successful matcher" shortcut.
    const bool skipShortcut = strict && mIndexLastSuccessfulMatch == mAllMatchIndex;
    if (!skipShortcut && mDefLineMatchers[mIndexLastSuccessfulMatch]->Matches(defline)) {
        return true;
    }
    for (size_t i = 0; i < mDefLineMatchers.size(); ++i) {
        if (i == mIndexLastSuccessfulMatch) {
            continue; // already handled above
        }
        if (!mDefLineMatchers[i]->Matches(defline)) {
            continue;
        }
        if (strict && i == mAllMatchIndex) {
            return false;
        }
        mIndexLastSuccessfulMatch = i;
        mDeflineTypes.insert(mDefLineMatchers[mIndexLastSuccessfulMatch]->Defline());
        //spdlog::info("Current pattern: {}", mDefLineMatchers[mIndexLastSuccessfulMatch]->Defline());
        return true;
    }
    return false;
}
/**
 * @brief Check whether a defline matches the last matched pattern
 *
 * Only the matcher at mIndexLastSuccessfulMatch is consulted; the index
 * itself is not changed.
 *
 * @param[in] defline defline text to test
 * @return true if the last successful matcher accepts the defline
 * @return false otherwise
 *
 * Defined `inline` because this definition lives in a header.
 */
inline bool CDefLineParser::MatchLast(const re2::StringPiece& defline)
{
    return mDefLineMatchers[mIndexLastSuccessfulMatch]->Matches(defline);
}
void CDefLineParser::Parse(const string_view& defline, CFastqRead& read)
{
if (Match(re2::StringPiece(defline.data(), defline.size()))) {
mDefLineMatchers[mIndexLastSuccessfulMatch]->GetMatch(read);
return;
}
// VDB-4970: do not include the bad defline's text since it may contain XML-breaking characters
throw fastq_error(100, "Defline not recognized");
}
/**
 * @brief Get the platform code of the last matched pattern
 *
 * @return uint8_t platform code reported by the last successful matcher
 */
inline uint8_t CDefLineParser::GetPlatform() const
{
    const auto& lastMatcher = mDefLineMatchers[mIndexLastSuccessfulMatch];
    return lastMatcher->GetPlatform();
}
inline
const string& CDefLineParser::GetDeflineType() const
{
return mDefLineMatchers[mIndexLastSuccessfulMatch]->Defline();
}
#endif
|