File: utils_factor.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (96 lines) | stat: -rw-r--r-- 3,533 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#ifndef TAKANE_UTILS_FACTOR_HPP
#define TAKANE_UTILS_FACTOR_HPP

#include <unordered_set>
#include <string>
#include <cstdint>
#include <vector>
#include <stdexcept>

#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"

namespace takane {

namespace internal_factor {

// Checks the optional 'ordered' attribute attached to an HDF5 object.
//
// If the attribute is absent, there is nothing to check and the function returns quietly.
// If it is present, it must be a scalar and its datatype must pass the 32-bit signed
// integer limit check; otherwise a std::runtime_error is thrown.
template<class H5Object_>
void check_ordered_attribute(const H5Object_& handle) {
    if (handle.attrExists("ordered")) {
        auto ordered_attr = handle.openAttribute("ordered");

        const bool scalar = ritsuko::hdf5::is_scalar(ordered_attr);
        if (!scalar) {
            throw std::runtime_error("expected 'ordered' attribute to be a scalar");
        }

        const bool too_wide = ritsuko::hdf5::exceeds_integer_limit(ordered_attr, 32, true);
        if (too_wide) {
            throw std::runtime_error("expected 'ordered' attribute to have a datatype that fits in a 32-bit signed integer");
        }
    }
}

// Default source of the nouns that the factor validators splice into their error messages.
// It serves as the default 'ErrorMessenger_' template argument of validate_factor_levels()
// and validate_factor_codes().
struct DefaultFactorMessenger {
    // Noun for a single level, used when reporting a duplicated level.
    static std::string level() {
        return std::string("factor level");
    }

    // Noun for the collection of levels, used when reporting an out-of-range code.
    static std::string levels() {
        return std::string("levels");
    }

    // Noun for the codes, used when reporting an out-of-range code.
    static std::string codes() {
        return std::string("factor codes");
    }
};

// These factor level/code checks are useful elsewhere but with different error messages;
// in such cases, we just do some compile-time switches that only affect the error message.
// Validates the dataset of levels called 'name' inside the group 'handle'.
//
// The dataset must pass the UTF-8 string datatype check, and no string may occur more than once.
// Strings are streamed from the dataset using 'buffer_size' as the stream's buffer size.
// 'ErrorMessenger_' only supplies the noun used in the duplicate-level error message.
//
// Returns the 1-dimensional length of the dataset, i.e., the number of levels.
// Throws std::runtime_error on a datatype mismatch or on the first duplicated level.
template<class ErrorMessenger_ = DefaultFactorMessenger>
hsize_t validate_factor_levels(const H5::Group& handle, const std::string& name, hsize_t buffer_size) {
    auto levels_dataset = ritsuko::hdf5::open_dataset(handle, name.c_str());
    const bool is_utf8 = ritsuko::hdf5::is_utf8_string(levels_dataset);
    if (!is_utf8) {
        throw std::runtime_error("expected '" + name + "' to have a datatype that can be represented by a UTF-8 encoded string");
    }

    auto num_levels = ritsuko::hdf5::get_1d_length(levels_dataset.getSpace(), false);
    std::unordered_set<std::string> seen;

    ritsuko::hdf5::Stream1dStringDataset reader(&levels_dataset, num_levels, buffer_size);
    hsize_t position = 0;
    while (position < num_levels) {
        // Take ownership of the current string so that it can be moved into the set below.
        auto current = reader.steal();
        if (seen.count(current) != 0) {
            throw std::runtime_error("'" + name + "' contains duplicated " + ErrorMessenger_::level() + " '" + current + "'");
        }
        seen.insert(std::move(current));

        ++position;
        reader.next();
    }

    return num_levels;
}

// Validates the dataset of codes called 'name' inside the group 'handle'.
//
// The dataset's datatype must pass the 64-bit unsigned integer limit check, and every code must be
// strictly less than 'num_levels'. When 'allow_missing' is true, the optional
// 'missing-value-placeholder' attribute is loaded; if it is present, codes equal to the
// placeholder are exempt from the range check.
// Codes are streamed from the dataset using 'buffer_size' as the stream's buffer size.
// 'ErrorMessenger_' only supplies the nouns used in the out-of-range error message.
//
// Returns the 1-dimensional length of the dataset, i.e., the number of codes.
// Throws std::runtime_error on a datatype mismatch or on the first out-of-range code.
template<class ErrorMessenger_ = DefaultFactorMessenger>
hsize_t validate_factor_codes(const H5::Group& handle, const std::string& name, hsize_t num_levels, hsize_t buffer_size, bool allow_missing = true) {
    auto codes_dataset = ritsuko::hdf5::open_dataset(handle, name.c_str());
    if (ritsuko::hdf5::exceeds_integer_limit(codes_dataset, 64, false)) {
        throw std::runtime_error("expected a datatype for '" + name + "' that fits in a 64-bit unsigned integer");
    }

    // The placeholder is only consulted if the caller permits missing values.
    bool placeholder_present = false;
    uint64_t placeholder_value = 0;
    if (allow_missing) {
        auto placeholder_info = ritsuko::hdf5::open_and_load_optional_numeric_missing_placeholder<uint64_t>(codes_dataset, "missing-value-placeholder");
        placeholder_present = placeholder_info.first;
        placeholder_value = placeholder_info.second;
    }

    auto num_codes = ritsuko::hdf5::get_1d_length(codes_dataset.getSpace(), false);
    ritsuko::hdf5::Stream1dNumericDataset<uint64_t> reader(&codes_dataset, num_codes, buffer_size);
    for (hsize_t position = 0; position < num_codes; ++position, reader.next()) {
        auto code = reader.get();
        const bool is_missing = placeholder_present && code == placeholder_value;
        if (!is_missing && static_cast<hsize_t>(code) >= num_levels) {
            throw std::runtime_error("expected " + ErrorMessenger_::codes() + " to be less than the number of " + ErrorMessenger_::levels() + " in '" + name + "'");
        }
    }

    return num_codes;
}

}

}

#endif