File: compressed_sparse_matrix.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (230 lines) | stat: -rw-r--r-- 9,035 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#ifndef TAKANE_HDF5_SPARSE_MATRIX_HPP
#define TAKANE_HDF5_SPARSE_MATRIX_HPP

#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"

#include "utils_public.hpp"
#include "utils_array.hpp"
#include "utils_json.hpp"

#include <filesystem>
#include <stdexcept>
#include <string>
#include <cstdint>
#include <array>
#include <vector>

/**
 * @file compressed_sparse_matrix.hpp
 * @brief Validation for compressed sparse matrices.
 */

namespace takane {

/**
 * @namespace takane::compressed_sparse_matrix
 * @brief Definitions for compressed sparse matrices.
 */
namespace compressed_sparse_matrix {

/**
 * @cond
 */
namespace internal {

/**
 * Check the `shape` dataset of a compressed sparse matrix group and load it.
 *
 * The dataset is opened by name from `handle`, must pass the 64-bit unsigned
 * integer limit check, and must be 1-dimensional with exactly two entries.
 *
 * @param handle Group expected to contain the `shape` dataset.
 * @return The two entries of `shape`, read as unsigned 64-bit integers.
 * @throws std::runtime_error if any check fails; the message is prefixed with
 * the location of the dataset inside the file.
 */
inline std::array<uint64_t, 2> validate_shape(const H5::Group& handle, const Options&) try {
    auto shape_dset = ritsuko::hdf5::open_dataset(handle, "shape");

    const bool too_wide = ritsuko::hdf5::exceeds_integer_limit(shape_dset, 64, false);
    if (too_wide) {
        throw std::runtime_error("expected the datatype to be a subset of a 64-bit unsigned integer");
    }

    const size_t num_entries = ritsuko::hdf5::get_1d_length(shape_dset, false);
    if (num_entries != 2) {
        throw std::runtime_error("expected the dataset to be of length 2");
    }

    // The length was confirmed above, so the fixed-size buffer is large enough.
    std::array<uint64_t, 2> extents;
    shape_dset.read(extents.data(), H5::PredType::NATIVE_UINT64);
    return extents;

} catch (const std::exception& err) {
    // Re-throw with the dataset location so that the caller can see which object failed.
    throw std::runtime_error("failed to validate sparse matrix shape at '" + ritsuko::hdf5::get_name(handle) + "/shape'; " + std::string(err.what()));
}

/**
 * Check the `data` dataset of a compressed sparse matrix group.
 *
 * The group's `type` attribute selects the datatype check applied to `data`:
 * `"integer"` and `"boolean"` use the 32-bit signed integer limit check,
 * `"number"` uses the 64-bit float limit check, and any other value is an error.
 * If `data` carries a `missing-value-placeholder` attribute, that attribute is
 * checked against the dataset as well.
 *
 * @param handle Group expected to contain the `data` dataset and the `type` attribute.
 * @return Length of the 1-dimensional `data` dataset.
 * @throws std::runtime_error if any check fails; the message is prefixed with
 * the location of the dataset inside the file.
 */
inline size_t validate_data(const H5::Group& handle, const Options&) try {
    auto data_dset = ritsuko::hdf5::open_dataset(handle, "data");

    const auto type = ritsuko::hdf5::open_and_load_scalar_string_attribute(handle, "type");
    const bool is_integer = (type == "integer");

    if (is_integer || type == "boolean") {
        // Integers and booleans share the same storage constraint; only the message differs.
        if (ritsuko::hdf5::exceeds_integer_limit(data_dset, 32, true)) {
            const char* message = is_integer
                ? "expected an integer 'data' to fit inside a 32-bit signed integer"
                : "expected a boolean 'data' to fit inside a 32-bit signed integer";
            throw std::runtime_error(message);
        }
    } else if (type == "number") {
        if (ritsuko::hdf5::exceeds_float_limit(data_dset, 64)) {
            throw std::runtime_error("expected a number 'data' to fit inside a 64-bit float");
        }
    } else {
        throw std::runtime_error("unknown matrix type '" + type + "'");
    }

    // The placeholder is optional, so it is only checked when present.
    const char* placeholder_name = "missing-value-placeholder";
    if (data_dset.attrExists(placeholder_name)) {
        auto placeholder = data_dset.openAttribute(placeholder_name);
        ritsuko::hdf5::check_missing_placeholder_attribute(data_dset, placeholder);
    }

    return ritsuko::hdf5::get_1d_length(data_dset, false);
} catch (const std::exception& err) {
    // Re-throw with the dataset location so that the caller can see which object failed.
    throw std::runtime_error("failed to validate sparse matrix data at '" + ritsuko::hdf5::get_name(handle) + "/data'; " + std::string(err.what()));
}

/**
 * Check the `indptr` dataset of a compressed sparse matrix group and load it.
 *
 * The dataset must pass the 64-bit unsigned integer limit check, be
 * 1-dimensional with `primary_dim + 1` entries, start at zero, end at
 * `num_nonzero`, and never decrease from one entry to the next.
 *
 * @param handle Group expected to contain the `indptr` dataset.
 * @param primary_dim Extent of the primary (compressed) dimension.
 * @param num_nonzero Number of structural non-zero elements, i.e., the length of `data`.
 * @return The validated pointers; this always holds at least one entry.
 * @throws std::runtime_error if any check fails; the message is prefixed with
 * the location of the dataset inside the file.
 */
inline std::vector<uint64_t> validate_indptrs(const H5::Group& handle, size_t primary_dim, size_t num_nonzero) try {
    auto dhandle = ritsuko::hdf5::open_dataset(handle, "indptr");
    if (ritsuko::hdf5::exceeds_integer_limit(dhandle, 64, false)) {
        throw std::runtime_error("expected datatype to be a subset of a 64-bit unsigned integer");
    }

    // Compare 'len - 1' against 'primary_dim' instead of 'len' against
    // 'primary_dim + 1'. The latter wraps around to zero when 'primary_dim' is
    // the largest representable value, which would let an empty dataset
    // through and make the first/last entry accesses below read past the end
    // of an empty vector.
    const size_t len = ritsuko::hdf5::get_1d_length(dhandle, false);
    if (len == 0 || len - 1 != primary_dim) {
        throw std::runtime_error("dataset should have length equal to the primary dimension extent plus 1");
    }

    std::vector<uint64_t> indptrs(len);
    dhandle.read(indptrs.data(), H5::PredType::NATIVE_UINT64);

    // 'len >= 1' is guaranteed at this point, so front and back accesses are safe.
    if (indptrs[0] != 0) {
        throw std::runtime_error("first entry should be zero");
    }
    if (indptrs.back() != num_nonzero) {
        throw std::runtime_error("last entry should equal the number of non-zero elements");
    }

    // Equal neighbours are allowed; they correspond to an empty slice of the primary dimension.
    for (size_t i = 1; i < len; ++i) {
        if (indptrs[i] < indptrs[i-1]) {
            throw std::runtime_error("pointers should be sorted in increasing order");
        }
    }

    return indptrs;
} catch (const std::exception& e) {
    // Re-throw with the dataset location so that the caller can see which object failed.
    throw std::runtime_error("failed to validate sparse matrix pointers at '" + ritsuko::hdf5::get_name(handle) + "/indptr'; " + std::string(e.what()));
}

/**
 * Check the `indices` dataset of a compressed sparse matrix group.
 *
 * The dataset must pass the 64-bit unsigned integer limit check and have as
 * many entries as the last value of `indptrs`. Every index must be less than
 * `secondary_dim`, and within each slice delimited by consecutive entries of
 * `indptrs` the indices must be strictly increasing.
 *
 * @param handle Group expected to contain the `indices` dataset.
 * @param indptrs Pointers delimiting the slices. This function indexes the
 * first and last entries without checking, so the vector must be non-empty;
 * the slice walk below additionally relies on it starting at zero and being
 * non-decreasing.
 * @param secondary_dim Extent of the secondary (uncompressed) dimension.
 * @param options Validation options; `hdf5_buffer_size` is passed to the dataset stream.
 * @throws std::runtime_error if any check fails; the message is prefixed with
 * the location of the dataset inside the file.
 */
inline void validate_indices(const H5::Group& handle, const std::vector<uint64_t>& indptrs, uint64_t secondary_dim, const Options& options) try {
    auto index_dset = ritsuko::hdf5::open_dataset(handle, "indices");
    if (ritsuko::hdf5::exceeds_integer_limit(index_dset, 64, false)) {
        throw std::runtime_error("expected datatype to be a subset of a 64-bit unsigned integer");
    }

    auto num_indices = ritsuko::hdf5::get_1d_length(index_dset.getSpace(), false);
    const auto expected = indptrs.back();
    if (expected != num_indices) {
        throw std::runtime_error("dataset length should be equal to the number of non-zero elements (expected " + std::to_string(expected) + ", got " + std::to_string(num_indices) + ")");
    }

    ritsuko::hdf5::Stream1dNumericDataset<uint64_t> stream(&index_dset, num_indices, options.hdf5_buffer_size);

    size_t ptr_pos = 0;             // position in 'indptrs' of the upcoming slice boundary.
    hsize_t boundary = indptrs[0];  // offset in 'indices' at which the next slice starts.
    uint64_t previous = 0;          // index seen at the preceding offset.

    for (hsize_t pos = 0; pos < num_indices; ++pos, stream.next()) {
        const auto current = stream.get();
        if (current >= secondary_dim) {
            throw std::runtime_error("out-of-range index (" + std::to_string(current) + ")");
        }

        if (pos != boundary) {
            // Still inside the same slice, so the index must exceed its predecessor.
            if (previous >= current) {
                throw std::runtime_error("indices should be strictly increasing");
            }
        } else {
            // A new slice starts here, so the ordering check is skipped for
            // its first element. Advance past any empty slices that share
            // this offset. There is no need to guard against running off the
            // end of 'indptrs', as its last entry equals the number of
            // indices and 'pos' is always smaller than that.
            do {
                ++ptr_pos;
                boundary = indptrs[ptr_pos];
            } while (pos == boundary);
        }

        previous = current;
    }

} catch (const std::exception& err) {
    // Re-throw with the dataset location so that the caller can see which object failed.
    throw std::runtime_error("failed to validate sparse matrix indices at '" + ritsuko::hdf5::get_name(handle) + "/indices'; " + std::string(err.what()));
}

}
/**
 * @endcond
 */

/**
 * @brief Validate an on-disk compressed sparse matrix.
 *
 * This checks the version string in the metadata, then opens `matrix.h5`
 * inside `path` and validates the `layout` attribute along with the `shape`,
 * `data`, `indptr` and `indices` datasets of the `compressed_sparse_matrix` group.
 * If a `names` entry is present in the group, the dimnames are checked against the shape.
 *
 * @param path Path to a directory containing a compressed sparse matrix.
 * @param metadata Metadata for the object, typically read from its `OBJECT` file.
 * @param options Validation options.
 */
inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
    const auto& version_string = internal_json::extract_version_for_type(metadata.other, "compressed_sparse_matrix");
    auto parsed = ritsuko::parse_version_string(version_string.c_str(), version_string.size(), /* skip_patch = */ true);
    if (parsed.major != 1) {
        throw std::runtime_error("unsupported version '" + version_string + "'");
    }

    auto file_handle = ritsuko::hdf5::open_file(path / "matrix.h5");
    auto matrix_group = ritsuko::hdf5::open_group(file_handle, "compressed_sparse_matrix");

    auto layout = ritsuko::hdf5::open_and_load_scalar_string_attribute(matrix_group, "layout");
    const bool is_csc = (layout == "CSC");
    if (!is_csc && layout != "CSR") {
        throw std::runtime_error("'layout' attribute must be one of 'CSC' or 'CSR'");
    }

    // The primary dimension is the one indexed by 'indptr': the second entry
    // of 'shape' for CSC and the first entry for CSR.
    const size_t primary = (is_csc ? 1 : 0);
    const size_t secondary = 1 - primary;

    auto shape = internal::validate_shape(matrix_group, options);
    const size_t num_nonzero = internal::validate_data(matrix_group, options);
    auto indptrs = internal::validate_indptrs(matrix_group, shape[primary], num_nonzero);
    internal::validate_indices(matrix_group, indptrs, shape[secondary], options);

    // Dimnames are optional.
    if (matrix_group.exists("names")) {
        std::vector<hsize_t> dims(shape.begin(), shape.end());
        internal_array::check_dimnames(matrix_group, "names", dims, options);
    }
}

/**
 * @param path Path to the directory containing a compressed sparse matrix.
 * @param metadata Metadata for the object, typically read from its `OBJECT` file.
 * @param options Validation options.
 * @return Number of rows in the matrix.
 */
inline size_t height(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
    auto handle = ritsuko::hdf5::open_file(path / "matrix.h5");
    auto ghandle = ritsuko::hdf5::open_group(handle, "compressed_sparse_matrix");
    auto shandle = ritsuko::hdf5::open_dataset(ghandle, "shape");
    std::array<uint64_t, 2> output;
    shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
    return output.front();
}

/**
 * @param path Path to the directory containing a compressed sparse matrix.
 * @param metadata Metadata for the object, typically read from its `OBJECT` file.
 * @param options Validation options.
 * @return Dimensions of the matrix.
 */
inline std::vector<size_t> dimensions(const std::filesystem::path& path, [[maybe_unused]] const ObjectMetadata& metadata, [[maybe_unused]] Options& options) {
    auto handle = ritsuko::hdf5::open_file(path / "matrix.h5");
    auto ghandle = ritsuko::hdf5::open_group(handle, "compressed_sparse_matrix");
    auto shandle = ritsuko::hdf5::open_dataset(ghandle, "shape");
    std::array<uint64_t, 2> output;
    shandle.read(output.data(), H5::PredType::NATIVE_UINT64);
    return std::vector<size_t>(output.begin(), output.end());
}

}

}

#endif