File: Stream1dStringDataset.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (171 lines) | stat: -rw-r--r-- 5,144 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#ifndef RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP
#define RITSUKO_HDF5_STREAM_1D_STRING_DATASET_HPP

#include "H5Cpp.h"

#include <vector>
#include <string>
#include <stdexcept>

#include "pick_1d_block_size.hpp"
#include "get_1d_length.hpp"
#include "get_name.hpp"
#include "as_numeric_datatype.hpp"
#include "_strings.hpp"

/**
 * @file Stream1dStringDataset.hpp
 * @brief Stream a numeric 1-dimensional HDF5 dataset into memory.
 */

namespace ritsuko {

namespace hdf5 {

/**
 * @brief Stream a 1-dimensional HDF5 string dataset into memory.
 *
 * This streams in a 1-dimensional HDF5 string dataset in contiguous blocks, using block sizes defined by `pick_1d_block_size()`.
 * Callers can then iterate over the individual strings.
 */
class Stream1dStringDataset {
public:
    /**
     * @param ptr Pointer to a HDF5 dataset handle.
     * @param length Length of the dataset as a 1-dimensional vector.
     * @param buffer_size Size of the buffer for holding streamed blocks of values.
     * Larger buffers improve speed at the cost of some memory efficiency.
     */
    Stream1dStringDataset(const H5::DataSet* ptr, hsize_t length, hsize_t buffer_size) : 
        ptr(ptr), 
        full_length(length), 
        block_size(pick_1d_block_size(ptr->getCreatePlist(), full_length, buffer_size)),
        mspace(1, &block_size),
        dspace(1, &full_length),
        dtype(ptr->getDataType()),
        is_variable(dtype.isVariableStr())
    {
        if (is_variable) {
            var_buffer.resize(block_size);
        } else {
            fixed_length = dtype.getSize();
            fix_buffer.resize(fixed_length * block_size);
        }
        final_buffer.resize(block_size);
    }

    /**
     * Overloaded constructor where the length is automatically determined.
     *
     * @param ptr Pointer to a HDF5 dataset handle.
     * @param buffer_size Size of the buffer for holding streamed blocks of values.
     */
    Stream1dStringDataset(const H5::DataSet* ptr, hsize_t buffer_size) : 
        Stream1dStringDataset(ptr, get_1d_length(ptr->getSpace(), false), buffer_size) 
    {}

public:
    /**
     * @return String at the current position of the stream.
     */
    std::string get() {
        while (consumed >= available) {
            consumed -= available;
            load(); 
        }
        return final_buffer[consumed];
    }

    /**
     * @return String at the current position of the stream.
     * Unlike `get()`, this avoids a copy by directly acquiring the string,
     * but it invalidates all subsequent `get()` and `steal()` requests until `next()` is called.
     */
    std::string steal() {
        while (consumed >= available) {
            consumed -= available;
            load(); 
        }
        return std::move(final_buffer[consumed]);
    }

    /**
     * Advance to the next position of the stream.
     *
     * @param jump Number of positions by which to advance the stream.
     */
    void next(size_t jump = 1) {
        consumed += jump;
    }

    /**
     * @return Length of the dataset.
     */
    hsize_t length() const {
        return full_length;
    }

    /**
     * @return Current position on the stream.
     */
    hsize_t position() const {
        return consumed + last_loaded;
    }

private:
    const H5::DataSet* ptr;
    hsize_t full_length, block_size;
    H5::DataSpace mspace;
    H5::DataSpace dspace;

    H5::DataType dtype;
    bool is_variable;
    std::vector<char*> var_buffer;
    size_t fixed_length = 0;
    std::vector<char> fix_buffer;
    std::vector<std::string> final_buffer;

    hsize_t last_loaded = 0;
    hsize_t consumed = 0;
    hsize_t available = 0;

    void load() {
        if (last_loaded >= full_length) {
            throw std::runtime_error("requesting data beyond the end of the dataset at '" + get_name(*ptr) + "'");
        }
        available = std::min(full_length - last_loaded, block_size);
        constexpr hsize_t zero = 0;
        mspace.selectHyperslab(H5S_SELECT_SET, &available, &zero);
        dspace.selectHyperslab(H5S_SELECT_SET, &available, &last_loaded);

        if (is_variable) {
            ptr->read(var_buffer.data(), dtype, mspace, dspace);
            [[maybe_unused]] VariableStringCleaner deletor(dtype.getId(), mspace.getId(), var_buffer.data());
            for (hsize_t i = 0; i < available; ++i) {
                if (var_buffer[i] == NULL) {
                    throw std::runtime_error("detected a NULL pointer for a variable length string in '" + get_name(*ptr) + "'");
                }
                auto& curstr = final_buffer[i];
                curstr.clear();
                curstr.insert(0, var_buffer[i]);
            }

        } else {
            auto bptr = fix_buffer.data();
            ptr->read(bptr, dtype, mspace, dspace);
            for (size_t i = 0; i < available; ++i, bptr += fixed_length) {
                auto& curstr = final_buffer[i];
                curstr.clear();
                curstr.insert(curstr.end(), bptr, bptr + find_string_length(bptr, fixed_length));
            }
        }

        last_loaded += available;
    }
};

}

}

#endif