File: RunMetadataParser.cpp

package info (click to toggle)
pbbam 2.4.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 14,144 kB
  • sloc: cpp: 60,214; xml: 2,908; ansic: 660; sh: 275; python: 203; makefile: 187
file content (196 lines) | stat: -rw-r--r-- 6,273 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#include "PbbamInternalConfig.h"

#include "RunMetadataParser.h"

#include <pbcopper/utility/StringUtils.h>

#include <boost/algorithm/string.hpp>

#include <fstream>
#include <istream>
#include <stdexcept>

namespace PacBio {
namespace BAM {
namespace {

// Creates the element object that corresponds to an XML node's tag name.
//
// Everything up to and including the first ':' in the node name (e.g. an XML
// namespace prefix) is discarded before the name is compared against the
// known run metadata element names. A name that matches none of them yields a
// plain internal::DataSetElement constructed with the stripped name.
std::shared_ptr<internal::DataSetElement> MakeRunMetadataElement(const pugi::xml_node& xmlNode)
{
    // strip prefix, if present
    std::string name = xmlNode.name();
    const auto colonPos = name.find(':');
    if (colonPos != std::string::npos) {
        name.erase(0, colonPos + 1);
    }

    const internal::FromInputXml fromInputXml;
    std::shared_ptr<internal::DataSetElement> element;
    if (name == Element::AUTOMATION) {
        element = std::make_shared<Automation>(fromInputXml);
    } else if (name == Element::AUTOMATION_PARAMETER) {
        element = std::make_shared<AutomationParameter>(fromInputXml);
    } else if (name == Element::AUTOMATION_PARAMETERS) {
        element = std::make_shared<AutomationParameters>(fromInputXml);
    } else if (name == Element::BINDING_KIT) {
        element = std::make_shared<BindingKit>(fromInputXml);
    } else if (name == Element::COLLECTIONS) {
        element = std::make_shared<Collections>(fromInputXml);
    } else if (name == Element::CONTROL_KIT) {
        element = std::make_shared<ControlKit>(fromInputXml);
    } else if (name == Element::SEQUENCING_KIT_PLATE) {
        element = std::make_shared<SequencingKitPlate>(fromInputXml);
    } else if (name == Element::TEMPLATE_PREP_KIT) {
        element = std::make_shared<TemplatePrepKit>(fromInputXml);
    } else {
        // no dedicated type: fall back to the generic element
        element = std::make_shared<internal::DataSetElement>(name, internal::FromInputXml{});
    }
    return element;
}

// Converts an XML node and, recursively, all of its children into an element
// tree and adds the resulting element to 'parent'.
//
// A node whose name is empty is skipped entirely (nothing is added).
void FromRunMetadataXml(const pugi::xml_node& xmlNode, internal::DataSetElement& parent)
{
    const std::string label = xmlNode.name();
    if (label.empty()) {
        return;
    }

    // the label keeps the full node name, including any prefix
    auto element = MakeRunMetadataElement(xmlNode);
    element->Label(label);
    element->Text(xmlNode.text().get());

    // copy attributes
    for (const auto& attr : xmlNode.attributes()) {
        element->Attribute(attr.name(), attr.value());
    }

    // convert children first; the finished element is then added to parent
    for (const auto& childNode : xmlNode) {
        FromRunMetadataXml(childNode, *element);
    }

    parent.AddChild(element);
}

// Builds the CollectionMetadata for one SubreadSet node.
//
// The collection metadata node is looked up beneath 'subreadSetNode' via the
// DATASET_METADATA -> COLLECTIONS -> COLLECTION_METADATA child elements. Its
// attributes are copied and its children are converted recursively.
//
// Throws std::runtime_error if that node cannot be found.
CollectionMetadata SubreadSetCollection(const std::string& subreadSetName,
                                        const pugi::xml_node& subreadSetNode)
{
    // walk down to the collection metadata node; a missing step yields a null node
    const auto dataSetMetadataNode = subreadSetNode.child(Element::DATASET_METADATA);
    const auto collectionsNode = dataSetMetadataNode.child(Element::COLLECTIONS);
    const auto cmNode = collectionsNode.child(Element::COLLECTION_METADATA);
    if (cmNode.empty()) {
        throw std::runtime_error{"[pbbam] run metadata ERROR: XML is missing expected elements"};
    }

    CollectionMetadata cm{subreadSetName};
    cm.Label(cmNode.name());

    // copy attributes
    for (const auto& attr : cmNode.attributes()) {
        const std::string attrName = attr.name();
        const std::string attrValue = attr.value();
        cm.Attribute(attrName, attrValue);
    }

    // convert children, recursively
    for (const auto& childNode : cmNode) {
        FromRunMetadataXml(childNode, cm);
    }

    return cm;
}

// Locates the node that holds the SubreadSet entries of a run metadata
// document.
//
// The document element must be named Element::PACBIO_DATA_MODEL. From there
// the path EXPERIMENT_CONTAINER -> RUNS -> RUN -> OUTPUTS -> SUBREADSETS is
// followed.
//
// Throws std::runtime_error if there is no document element, if it has an
// unexpected name, or if any element along the path is missing.
pugi::xml_node FetchSubreadSetsNode(const pugi::xml_document& doc)
{
    const auto rootNode = doc.document_element();
    if (rootNode.empty()) {
        throw std::runtime_error{"[pbbam] run metadata ERROR: could not fetch XML root node"};
    }

    const std::string rootName = rootNode.name();
    if (rootName != Element::PACBIO_DATA_MODEL) {
        throw std::runtime_error{
            "[pbbam] run metadata ERROR: expected 'PacBioDataModel' as root node, instead found: " +
            rootName};
    }

    // a missing step yields a null node, which propagates to the final check
    const auto experimentNode = rootNode.child(Element::EXPERIMENT_CONTAINER);
    const auto runsNode = experimentNode.child(Element::RUNS);
    const auto runNode = runsNode.child(Element::RUN);
    const auto outputsNode = runNode.child(Element::OUTPUTS);
    const auto subreadSetsNode = outputsNode.child(Element::SUBREADSETS);
    if (subreadSetsNode.empty()) {
        throw std::runtime_error{"[pbbam] run metadata ERROR: XML is missing expected elements"};
    }
    return subreadSetsNode;
}

std::map<std::string, CollectionMetadata> CollectionsFromXml(std::istream& in)
{
    std::map<std::string, CollectionMetadata> collections;

    pugi::xml_document doc;
    const pugi::xml_parse_result loadResult = doc.load(in);
    if (!loadResult) {
        throw std::runtime_error{
            "[pbbam] run metadata ERROR: could not read XML document\n"
            "  reason: " +
            std::string{loadResult.description()}};
    }

    const auto subreadSetsNode = FetchSubreadSetsNode(doc);
    for (const auto& subreadSetNode : subreadSetsNode) {
        const auto& subreadSetName = subreadSetNode.attribute("Name").value();
        collections.emplace(subreadSetName, SubreadSetCollection(subreadSetName, subreadSetNode));
    }

    return collections;
}

}  // namespace

// Loads the single collection described by the run metadata XML file at
// 'metadataXmlFn'.
//
// Throws std::runtime_error if the file cannot be opened. Errors raised while
// parsing (see the std::istream overload) propagate unchanged.
CollectionMetadata RunMetadataParser::LoadCollection(const std::string& metadataXmlFn)
{
    std::ifstream in{metadataXmlFn};
    if (!in) {
        // fail early with the filename, rather than a generic XML read error
        throw std::runtime_error{"[pbbam] run metadata ERROR: could not open file: " +
                                 metadataXmlFn};
    }
    return LoadCollection(in);
}

// Loads the single collection described by the run metadata XML read from
// 'in'.
//
// Throws std::runtime_error if the input does not contain exactly one
// SubreadSet; errors raised while parsing propagate unchanged.
CollectionMetadata RunMetadataParser::LoadCollection(std::istream& in)
{
    const auto collections = CollectionsFromXml(in);
    const auto numCollections = collections.size();
    if (numCollections == 1) {
        return collections.begin()->second;
    }

    // anything other than exactly one collection is an error here
    throw std::runtime_error{"[pbbam] run metadata ERROR: expected 1 SubreadSet, instead found: " +
                             std::to_string(numCollections)};
}

// Loads all collections described by the run metadata XML file at
// 'runMetadataXmlFn', keyed by SubreadSet name.
//
// Throws std::runtime_error if the file cannot be opened. Errors raised while
// parsing (see the std::istream overload) propagate unchanged.
std::map<std::string, CollectionMetadata> RunMetadataParser::LoadCollections(
    const std::string& runMetadataXmlFn)
{
    std::ifstream in{runMetadataXmlFn};
    if (!in) {
        // fail early with the filename, rather than a generic XML read error
        throw std::runtime_error{"[pbbam] run metadata ERROR: could not open file: " +
                                 runMetadataXmlFn};
    }
    return LoadCollections(in);
}

// Loads all collections described by the run metadata XML read from 'in',
// keyed by SubreadSet name. Errors raised while parsing propagate unchanged.
std::map<std::string, CollectionMetadata> RunMetadataParser::LoadCollections(std::istream& in)
{
    auto collections = CollectionsFromXml(in);
    return collections;
}

}  // namespace BAM
}  // namespace PacBio