File: sequence_information.hpp

package info (click to toggle)
r-bioc-alabaster.base 1.6.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 1,652 kB
  • sloc: cpp: 11,377; sh: 29; makefile: 2
file content (111 lines) | stat: -rw-r--r-- 4,249 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#ifndef TAKANE_SEQUENCE_INFORMATION_HPP
#define TAKANE_SEQUENCE_INFORMATION_HPP

#include "ritsuko/ritsuko.hpp"
#include "ritsuko/hdf5/hdf5.hpp"

#include <filesystem>
#include <stdexcept>
#include <unordered_set>
#include <string>

#include "utils_public.hpp"
#include "utils_json.hpp"

/**
 * @file sequence_information.hpp
 * @brief Validation for sequence information.
 */

namespace takane {

/**
 * @namespace takane::sequence_information
 * @brief Definitions for sequence information objects.
 */
namespace sequence_information {

/**
 * @param path Path to the directory containing the data frame.
 * @param metadata Metadata for the object, typically read from its `OBJECT` file.
 * @param options Validation options.
 */
inline void validate(const std::filesystem::path& path, const ObjectMetadata& metadata, Options& options) {
    auto vstring = internal_json::extract_version_for_type(metadata.other, "sequence_information");
    auto version = ritsuko::parse_version_string(vstring.c_str(), vstring.size(), /* skip_patch = */ true);
    if (version.major != 1) {
        throw std::runtime_error("unsupported version string '" + vstring + "'");
    }

    auto handle = ritsuko::hdf5::open_file(path / "info.h5");
    auto ghandle = ritsuko::hdf5::open_group(handle, "sequence_information");

    size_t nseq = 0;
    {
        auto nhandle = ritsuko::hdf5::open_dataset(ghandle, "name");
        if (!ritsuko::hdf5::is_utf8_string(nhandle)) {
            throw std::runtime_error("expected 'name' to have a datatype that can be represented by a UTF-8 encoded string");
        }

        nseq = ritsuko::hdf5::get_1d_length(nhandle.getSpace(), false);
        std::unordered_set<std::string> collected;
        ritsuko::hdf5::Stream1dStringDataset stream(&nhandle, nseq, options.hdf5_buffer_size);
        for (size_t s = 0; s < nseq; ++s, stream.next()) {
            auto x = stream.steal();
            if (collected.find(x) != collected.end()) {
                throw std::runtime_error("detected duplicated sequence name '" + x + "'");
            }
            collected.insert(std::move(x));
        }
    }

    const char* missing_attr_name = "missing-value-placeholder";

    {
        auto lhandle = ritsuko::hdf5::open_dataset(ghandle, "length");
        if (ritsuko::hdf5::exceeds_integer_limit(lhandle, 64, false)) {
            throw std::runtime_error("expected a datatype for 'length' that fits in a 64-bit unsigned integer");
        }
        if (ritsuko::hdf5::get_1d_length(lhandle.getSpace(), false) != nseq) {
            throw std::runtime_error("expected lengths of 'length' and 'name' to be equal");
        }
        if (lhandle.attrExists(missing_attr_name)) {
            auto ahandle = lhandle.openAttribute(missing_attr_name);
            ritsuko::hdf5::check_missing_placeholder_attribute(lhandle, ahandle);
        }
    }

    {
        auto chandle = ritsuko::hdf5::open_dataset(ghandle, "circular");
        if (ritsuko::hdf5::exceeds_integer_limit(chandle, 32, true)) {
            throw std::runtime_error("expected a datatype for 'circular' that fits in a 32-bit signed integer");
        }
        if (ritsuko::hdf5::get_1d_length(chandle.getSpace(), false) != nseq) {
            throw std::runtime_error("expected lengths of 'length' and 'circular' to be equal");
        }
        if (chandle.attrExists(missing_attr_name)) {
            auto ahandle = chandle.openAttribute(missing_attr_name);
            ritsuko::hdf5::check_missing_placeholder_attribute(chandle, ahandle);
        }
    }

    {
        auto gnhandle = ritsuko::hdf5::open_dataset(ghandle, "genome");
        if (!ritsuko::hdf5::is_utf8_string(gnhandle)) {
            throw std::runtime_error("expected 'genome' to have a datatype that can be represented by a UTF-8 encoded string");
        }
        if (ritsuko::hdf5::get_1d_length(gnhandle.getSpace(), false) != nseq) {
            throw std::runtime_error("expected lengths of 'length' and 'genome' to be equal");
        }
        if (gnhandle.attrExists(missing_attr_name)) {
            auto ahandle = gnhandle.openAttribute(missing_attr_name);
            ritsuko::hdf5::check_missing_placeholder_attribute(gnhandle, ahandle);
        }
    }
}

}

}

#endif