File: IterateNdDataset.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (164 lines) | stat: -rw-r--r-- 5,130 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#ifndef RITSUKO_ITERATE_ND_DATASET_HPP
#define RITSUKO_ITERATE_ND_DATASET_HPP

#include "H5Cpp.h"

#include <algorithm>
#include <cmath>
#include <stdexcept>
#include <utility>
#include <vector>

/**
 * @file IterateNdDataset.hpp
 * @brief Iterate through an N-dimensional dataset by block.
 */

namespace ritsuko {

namespace hdf5 {

/**
 * @brief Iterate through an N-dimensional dataset by block.
 *
 * This iterates through an N-dimensional dataset in a blockwise fashion, constructing `H5::DataSpace` objects to enable callers to easily read the dataset contents at each block.
 * Block sizes are typically determined from dataset chunking via `pick_nd_block_dimensions()`, which ensures efficient access of entire chunks at each step.
 */
struct IterateNdDataset {
    /**
     * @param d Dataset dimensions.
     * @param b Block dimensions, typically obtained from `pick_nd_block_dimensions()`.
     * This should be of the same length as `d`, where each value of `b` is no greater than its counterpart in `d`.
     */
    IterateNdDataset(std::vector<hsize_t> d, std::vector<hsize_t> b) : 
        data_extent(std::move(d)), 
        block_extent(std::move(b)), 
        ndims(data_extent.size()), 
        starts_internal(ndims), 
        counts_internal(block_extent), 
        dspace(ndims, data_extent.data()) 
    {
        for (auto b : block_extent) {
            total_size *= b;
        }

        if (total_size) {
            dspace.selectHyperslab(H5S_SELECT_SET, counts_internal.data(), starts_internal.data());
            mspace.setExtentSimple(ndims, counts_internal.data());
        } else {
            finished_internal = true;
        }
    }

    /**
     * Move to the next step in the iteration.
     * This will modify the state of all references returned by the getters.
     */
    void next() {
        // Attempting a shift from the last dimension as this is the fastest-changing.
        for (size_t i = ndims; i > 0; --i) {
            auto d = i - 1;
            starts_internal[d] += block_extent[d];

            // Shift was possible, breaking out.
            if (starts_internal[d] < data_extent[d]) {
                total_size /= counts_internal[d];
                counts_internal[d] = std::min(data_extent[d] - starts_internal[d], block_extent[d]);
                total_size *= counts_internal[d];
                break;
            }

            // Next step isn't possible as we've reached the end of the dataset.
            if (d == 0) {
                finished_internal = true;
                return;
            }

            // Reached the end of the current dimension extent; set it to zero, 
            // move to the next dimension and increment it.
            starts_internal[d] = 0;
            total_size /= counts_internal[d];
            counts_internal[d] = std::min(data_extent[d], block_extent[d]);
            total_size *= counts_internal[d];
        }

        dspace.selectHyperslab(H5S_SELECT_SET, counts_internal.data(), starts_internal.data());
        mspace.setExtentSimple(ndims, counts_internal.data());
    }

public:
    /**
     * @return Whether the iteration is finished.
     * All other getters should only be accessed if this is `true`.
     */
    bool finished() const {
        return finished_internal;
    }

    /**
     * @return Size of the current block, in terms of the number of elements.
     * This is usually equal to the product of the block dimensions used in the constructor,
     * except at the edges of the dataset where the current block may be truncated.
     */
    size_t current_block_size() const {
        return total_size;
    }

    /**
     * @return Starting coordinates of the current block. 
     */
    const std::vector<hsize_t>& starts () const {
        return starts_internal;
    }

    /**
     * @return Dimensions of the current block. 
     * This is usually equal to the block dimensions used in the constructor,
     * except at the edges of the dataset where the current block may be truncated.
     */
    const std::vector<hsize_t>& counts () const {
        return counts_internal;
    }

    /**
     * @return Dataspace for extracting block contents from file.
     */
    const H5::DataSpace& file_space() const {
        return dspace;
    }

    /**
     * @return Dataspace for storing the block contents in memory.
     * This assumes a contiguous memory allocation that has space for at least `total_size()` elements.
     */
    const H5::DataSpace& memory_space() const {
        return mspace;
    }

    /**
     * @return Dimensions of the dataset, as provided in the constructor.
     */
    const std::vector<hsize_t>& dimensions() const {
        return data_extent;
    }

    /**
     * @return Dimensions of the blocks, as provided in the constructor.
     */
    const std::vector<hsize_t>& block_dimensions() const {
        return block_extent;
    }

private:
    std::vector<hsize_t> data_extent, block_extent;
    size_t ndims;

    std::vector<hsize_t> starts_internal, counts_internal;
    H5::DataSpace mspace, dspace;
    bool finished_internal = false;
    size_t total_size = 1;
};

}

}

#endif