File: parser.hpp

package info (click to toggle)
libbioparser-dev 3.1.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 1,932 kB
  • sloc: cpp: 1,275; makefile: 13
file content (138 lines) | stat: -rw-r--r-- 3,428 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Copyright (c) 2020 Robert Vaser

#ifndef BIOPARSER_PARSER_HPP_
#define BIOPARSER_PARSER_HPP_

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include "zlib.h"  // NOLINT

namespace bioparser {

// Abstract base class and factory for parsers which read a (possibly gzip
// compressed) file through zlib and produce objects of type T.
//
// The base class owns the zlib file handle, a fixed 64 kB read buffer that is
// refilled with Read(), and a growable storage area into which derived
// parsers copy pieces of the buffer with Store(). Derived parsers implement
// Parse().
template<class T>
class Parser {  // Parser factory
 public:
  Parser(const Parser&) = delete;
  Parser& operator=(const Parser&) = delete;

  Parser(Parser&&) = delete;
  Parser& operator=(Parser&&) = delete;

  virtual ~Parser() = default;

  // Opens the file at path with gzopen and wraps it in a parser of type P<T>,
  // which has to be constructible from a gzFile.
  // Throws std::invalid_argument if the file cannot be opened.
  template<template<class> class P>
  static std::unique_ptr<Parser<T>> Create(const std::string& path) {
    auto file = gzopen(path.c_str(), "r");
    if (file == nullptr) {
      throw std::invalid_argument(
          "[bioparser::Parser::Create] error: unable to open file " + path);
    }
    // ownership of the handle is taken over by the Parser constructor
    return std::unique_ptr<Parser<T>>(new P<T>(file));
  }

  // by default, all parsers shrink sequence names to the first white space
  // NOTE(review): the exact meaning of bytes is defined by the implementing
  // parsers, presumably an upper bound on the amount parsed per call
  virtual std::vector<std::unique_ptr<T>> Parse(
      std::uint64_t bytes, bool shorten_names = true) = 0;

  // Rewinds the file to its beginning and discards the buffered bytes.
  // The storage position is left untouched (see Clear()).
  // NOTE(review): the return value of gzseek is not checked
  void Reset() {
    gzseek(file_.get(), 0, SEEK_SET);
    buffer_ptr_ = 0;
    buffer_bytes_ = 0;
  }

 protected:
  // Takes ownership of file (closed with gzclose on destruction) and
  // allocates the read buffer and storage_size bytes of zeroed storage.
  Parser(gzFile file, std::uint32_t storage_size)
      : file_(file, gzclose),
        buffer_(65536, 0),  // 64 kB
        buffer_ptr_(0),
        buffer_bytes_(0),
        storage_(storage_size, 0),
        storage_ptr_(0) {}

  // Read buffer, only the first buffer_bytes() bytes hold file data.
  const std::vector<char>& buffer() const {
    return buffer_;
  }

  // Position in the buffer from which the next Store() copies.
  std::uint32_t buffer_ptr() const {
    return buffer_ptr_;
  }

  // Number of bytes placed into the buffer by the last Read().
  std::uint32_t buffer_bytes() const {
    return buffer_bytes_;
  }

  // Storage area filled by Store().
  const std::vector<char>& storage() const {
    return storage_;
  }

  // Position in the storage at which the next Store() writes.
  std::uint32_t storage_ptr() const {
    return storage_ptr_;
  }

  // Refills the buffer from the file and resets the buffer position.
  // Returns true if fewer bytes than the buffer size were read, i.e. the end
  // of the file was reached.
  // Throws std::invalid_argument if zlib reports a read error.
  bool Read() {
    buffer_ptr_ = 0;
    const int bytes_read = gzread(
        file_.get(), buffer_.data(), static_cast<unsigned>(buffer_.size()));
    if (bytes_read < 0) {
      // gzread returns -1 on error, which must not end up in the unsigned
      // byte counter where it would look like a huge amount of valid data
      buffer_bytes_ = 0;
      throw std::invalid_argument(
          "[bioparser::Parser::Read] error: unable to read file");
    }
    buffer_bytes_ = static_cast<std::uint32_t>(bytes_read);
    return buffer_bytes_ < buffer_.size();
  }

  // Copies count bytes from the current buffer position to the current
  // storage position, growing the storage if needed. If strip is true, the
  // storage position advances only past the copied bytes that remain after
  // trailing white space is removed, otherwise past all of them. The buffer
  // position advances by count + 1.
  // Throws std::invalid_argument if the copy would read past the buffer or
  // if the storage position would exceed the 32-bit range.
  void Store(std::size_t count, bool strip = false) {
    // comparisons are written as subtractions so that they can not wrap
    if (buffer_ptr_ > buffer_.size() ||
        count > buffer_.size() - buffer_ptr_) {
      throw std::invalid_argument(
          "[bioparser::Parser::Store] error: buffer overflow");
    }
    if (count > std::numeric_limits<std::uint32_t>::max() - storage_ptr_) {
      throw std::invalid_argument(
          "[bioparser::Parser::Store] error: storage overflow");
    }

    const std::size_t required = storage_ptr_ + count;
    if (required > storage_.size()) {
      // doubling alone is not enough for an empty or very small storage
      storage_.resize(std::max(required, 2 * storage_.size()));
    }

    char* dst = storage_.data() + storage_ptr_;
    if (count > 0) {
      std::memcpy(dst, buffer_.data() + buffer_ptr_, count);
    }

    const std::uint32_t stored = static_cast<std::uint32_t>(count);
    storage_ptr_ += strip ? RightStrip(dst, stored) : stored;
    buffer_ptr_ += stored + 1;  // ignore sought character
  }

  // Writes a null character at storage index i, growing the storage first if
  // i lies beyond its end.
  void Terminate(std::uint32_t i) {
    if (i >= storage_.size()) {
      storage_.resize(static_cast<std::size_t>(i) + 1);
    }
    storage_[i] = '\0';
  }

  // Resets the storage position, the stored bytes are kept until overwritten.
  void Clear() {
    storage_ptr_ = 0;
  }

  // Returns the length of str without its trailing white space.
  static std::uint32_t RightStrip(const char* str, std::uint32_t str_len) {
    // std::isspace requires a value representable as unsigned char
    while (str_len > 0 &&
           std::isspace(static_cast<unsigned char>(str[str_len - 1]))) {
      --str_len;
    }
    return str_len;
  }

  // Returns the index of the first white space in str, or str_len if there
  // is none.
  static std::uint32_t Shorten(const char* str, std::uint32_t str_len) {
    for (std::uint32_t i = 0; i < str_len; ++i) {
      // std::isspace requires a value representable as unsigned char
      if (std::isspace(static_cast<unsigned char>(str[i]))) {
        return i;
      }
    }
    return str_len;
  }

 private:
  std::unique_ptr<gzFile_s, int(*)(gzFile)> file_;  // closed with gzclose
  std::vector<char> buffer_;    // raw bytes of the last Read()
  std::uint32_t buffer_ptr_;    // next unread position in buffer_
  std::uint32_t buffer_bytes_;  // number of valid bytes in buffer_
  std::vector<char> storage_;   // bytes gathered by Store()
  std::uint32_t storage_ptr_;   // next write position in storage_
};

}  // namespace bioparser

#endif  // BIOPARSER_PARSER_HPP_