File: utils_bumpy_array.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (205 lines) | stat: -rw-r--r-- 8,186 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#ifndef TAKANE_UTILS_BUMPY_ARRAY_HPP
#define TAKANE_UTILS_BUMPY_ARRAY_HPP

#include "H5Cpp.h"
#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"

#include <cstdint>
#include <string>
#include <stdexcept>
#include <vector>
#include <array>
#include <filesystem>

#include "utils_public.hpp"
#include "utils_array.hpp"
#include "utils_other.hpp"
#include "utils_json.hpp"

namespace takane {

void validate(const std::filesystem::path&, const ObjectMetadata&, Options&);
size_t height(const std::filesystem::path&, const ObjectMetadata&, Options&);
bool satisfies_interface(const std::string&, const std::string&, const Options&);
bool derived_from(const std::string&, const std::string&, const Options&);

namespace internal_bumpy_array {

inline std::vector<uint64_t> validate_dimensions(const H5::Group& handle) {
    auto dhandle = ritsuko::hdf5::open_dataset(handle, "dimensions");
    if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
        throw std::runtime_error("expected 'dimensions' to have a datatype that fits in a 64-bit unsigned integer");
    }
    auto len = ritsuko::hdf5::get_1d_length(dhandle, false);
    std::vector<uint64_t> output(len);
    dhandle.read(output.data(), H5::PredType::NATIVE_UINT64);
    return output;
}

inline hsize_t validate_lengths(const H5::Group& handle, size_t concatenated_length, hsize_t buffer_size) {
    auto lhandle = ritsuko::hdf5::open_dataset(handle, "lengths");
    if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) {
        throw std::runtime_error("expected 'lengths' to have a datatype that fits in a 64-bit unsigned integer");
    }

    auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false);

    ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&lhandle, len, buffer_size);
    size_t total = 0;
    for (size_t i = 0; i < len; ++i, stream.next()) {
        total += stream.get();
    }
    if (total != concatenated_length) {
        throw std::runtime_error("sum of 'lengths' does not equal the height of the concatenated object (got " + std::to_string(total) + ", expected " + std::to_string(concatenated_length) + ")");
    }

    return len;
}

inline void validate_sparse_indices(const H5::Group& handle, const std::vector<uint64_t>& dimensions, hsize_t num_lengths, hsize_t buffer_size) {
    size_t ndims = dimensions.size();
    std::vector<H5::DataSet> handles;
    handles.reserve(ndims);

    for (size_t d = 0; d < ndims; ++d) {
        auto dname = std::to_string(d);
        auto lhandle = ritsuko::hdf5::open_dataset(handle, dname.c_str());
        if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) {
            throw std::runtime_error("expected '" + dname + "' to have a datatype that fits in a 64-bit unsigned integer");
        }
        auto len = ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false);
        if (len != num_lengths) {
            throw std::runtime_error("expected '" + dname + "' to have the same length as 'lengths'");
        }
        handles.push_back(lhandle);
    }

    if (!num_lengths) {
        return;
    }

    std::vector<ritsuko::hdf5::Stream1dNumericDataset<uint64_t> > streams;
    streams.reserve(ndims);
    std::vector<uint64_t> position;
    position.reserve(ndims); 

    for (size_t d = 0; d < ndims; ++d) {
        streams.emplace_back(&(handles[d]), num_lengths, buffer_size);
        position.push_back(streams.back().get());
        if (position.back() >= dimensions[d]) {
            throw std::runtime_error("values in 'indices/" + std::to_string(d) + "' should be less than the corresponding dimension extent");
        }
        streams.back().next();
    }

    for (size_t l = 1; l < num_lengths; ++l) {
        bool greater = false;

        // Going from back to front and checking for increasingness, based on the
        // expectation that the first dimension is the fastest-changing.
        for (size_t rd = ndims; rd > 0; --rd) {
            auto d = rd - 1;
            auto& dstream = streams[d];

            auto nextv = dstream.get();
            if (nextv >= dimensions[d]) {
                throw std::runtime_error("values in 'indices/" + std::to_string(d) + "' should be less than the corresponding dimension extent");
            }
            dstream.next();

            auto& oldv = position[d];
            if (!greater) {
                if (nextv > oldv) {
                    greater = true;
                } else if (nextv < oldv) {
                    throw std::runtime_error("coordinates in 'indices' should be strictly increasing");
                }
            }
            oldv = nextv;
        }

        if (!greater) {
            throw std::runtime_error("duplicate coordinates in 'indices'");
        }
    }
}

template<bool satisfies_interface_>
void validate_directory(const std::filesystem::path& path, const std::string& object_type, const std::string& concatenated_type, const ObjectMetadata& metadata, Options& options) try {
    auto vstring = internal_json::extract_version_for_type(metadata.other, object_type);
    auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
    if (version.major != 1) {
        throw std::runtime_error("unsupported version string '" + vstring + "'");
    }

    auto catdir = path / "concatenated";
    auto catmeta = read_object_metadata(catdir);
    if constexpr(satisfies_interface_) {
        if (!satisfies_interface(catmeta.type, concatenated_type, options)) {
            throw std::runtime_error("'concatenated' should satisfy the '" + concatenated_type + "' interface");
        }
    } else {
        if (!derived_from(catmeta.type, concatenated_type, options)) {
            throw std::runtime_error("'concatenated' should contain an '" + concatenated_type + "' object");
        }
    }

    try {
        ::takane::validate(catdir, catmeta, options);
    } catch (std::exception& e) {
        throw std::runtime_error("failed to validate the 'concatenated' object; " + std::string(e.what()));
    }
    size_t catheight = ::takane::height(catdir, catmeta, options);

    auto handle = ritsuko::hdf5::open_file(path / "partitions.h5");
    auto ghandle = ritsuko::hdf5::open_group(handle, object_type.c_str());

    auto dims = validate_dimensions(ghandle);
    auto len = validate_lengths(ghandle, catheight, options.hdf5_buffer_size);

    // Support both dense and sparse cases.
    if (ghandle.exists("indices")) {
        auto ihandle = ritsuko::hdf5::open_group(ghandle, "indices");
        validate_sparse_indices(ihandle, dims, len, options.hdf5_buffer_size);
    } else {
        size_t prod = !dims.empty(); // prod = 0 if dims is empty, otherwise it starts at 1.
        for (auto d : dims) {
            prod *= d;
        }
        if (prod != len) {
            throw std::runtime_error("length of 'lengths' should equal the product of 'dimensions'");
        }
    }

    if (ghandle.exists("names")) {
        internal_array::check_dimnames(ghandle, "names", dims, options);
    }

} catch (std::exception& e) {
    throw std::runtime_error("failed to validate a '" + object_type + "' object at '" + path.string() + "'; " + std::string(e.what()));
}

inline size_t height(const std::filesystem::path& path, const std::string& name, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
    auto handle = ritsuko::hdf5::open_file(path / "partitions.h5");
    auto ghandle = handle.openGroup(name);
    auto dhandle = ghandle.openDataSet("dimensions");
    std::array<hsize_t, 2> dims;
    dhandle.read(dims.data(), H5::PredType::NATIVE_HSIZE);
    return dims[0];
}

inline std::vector<size_t> dimensions(const std::filesystem::path& path, const std::string& name, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
    auto handle = ritsuko::hdf5::open_file(path / "partitions.h5");
    auto ghandle = handle.openGroup(name);
    auto dhandle = ghandle.openDataSet("dimensions");
    std::vector<hsize_t> dims(2);
    dhandle.read(dims.data(), H5::PredType::NATIVE_HSIZE);
    return std::vector<size_t>(dims.begin(), dims.end());
}

}

}

#endif