File: region_expander.cpp

package info (click to toggle)
vg 1.30.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (162 lines) | stat: -rw-r--r-- 7,481 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#include "region_expander.hpp"

namespace vg {

    RegionExpander::RegionExpander(const PathPositionHandleGraph* graph, const SnarlManager* snarl_manager) :
        graph(graph), snarl_manager(snarl_manager)
    {
        // Nothing to do
    }

    map<pair<id_t, bool>, pair<uint64_t,uint64_t >> RegionExpander::expanded_subgraph(const GFFRecord& gff_record) {
        
        map<pair<id_t, bool>, pair<uint64_t, uint64_t>> return_val;

        vector<pair<id_t, bool>> interval_subpath;
        
        assert(gff_record.start != -1 && gff_record.end != -1 && gff_record.start <= gff_record.end);
        
        if (!graph->has_path(gff_record.sequence_id)) {
            cerr << "error [RegionExpander] cannot expand genomic interval, graph does not contain path with name: " << gff_record.sequence_id << endl;
            exit(1);
        }

        path_handle_t path_handle = graph->get_path_handle(gff_record.sequence_id);
        
        // walk along the path for the interval and add the corresponding nodes to the subgraph
        
        step_handle_t step = graph->get_step_at_position(path_handle, gff_record.start);
        handle_t handle = graph->get_handle_of_step(step);
        id_t node_id = graph->get_id(handle);
        bool is_rev = graph->get_is_reverse(handle);
        size_t node_length = graph->get_length(handle);
        
        size_t at_pos = graph->get_position_of_step(step);
        
        interval_subpath.emplace_back(node_id, is_rev);
        return_val[make_pair(node_id, is_rev)] = pair<uint64_t, uint64_t>(gff_record.start - at_pos,
                                                                          node_length);
        at_pos += node_length;

        while (at_pos <= gff_record.end) {
            step = graph->get_next_step(step);
            handle = graph->get_handle_of_step(step);
            node_id = graph->get_id(handle);
            is_rev = graph->get_is_reverse(handle);
            node_length = graph->get_length(handle);
            
            interval_subpath.emplace_back(node_id, is_rev);
            return_val[make_pair(node_id, is_rev)] = pair<uint64_t, uint64_t>(0, node_length);
            at_pos += node_length;
        }
        
        return_val[make_pair(node_id, is_rev)].second = gff_record.end - (at_pos - node_length) + 1;
        
        // walk along the path and identify snarls that have both ends on the path
        
        unordered_set<const Snarl*> entered_snarls;
        unordered_set<const Snarl*> completed_snarls;
        
        for (size_t i = 0; i + 1 < interval_subpath.size(); i++) {
            const Snarl* snarl_out = snarl_manager->into_which_snarl(interval_subpath[i].first,
                                                                     !interval_subpath[i].second);
            
            if (snarl_out) {
                if (entered_snarls.count(snarl_out)) {
                    completed_snarls.insert(snarl_out);
                }
            }
            
            const Snarl* snarl_in = snarl_manager->into_which_snarl(interval_subpath[i].first,
                                                                    interval_subpath[i].second);
            
            if (snarl_in) {
                entered_snarls.insert(snarl_in);
            }
        }
        
        // traverse the subgraph in each snarl and add it to the annotation
        for (const Snarl* snarl : completed_snarls) {
            // orient the snarl to match the orientation of the annotation
            handle_t oriented_start, oriented_end;
            if (return_val.count(pair<id_t, bool>(snarl->start().node_id(),
                                                  snarl->start().backward()))) {
                
                oriented_start = graph->get_handle(snarl->start().node_id(),
                                                   snarl->start().backward());
                
                oriented_end = graph->get_handle(snarl->end().node_id(),
                                                 snarl->end().backward());
                
            }
            else {
                
                oriented_start = graph->get_handle(snarl->end().node_id(),
                                                   !snarl->end().backward());
                
                oriented_end = graph->get_handle(snarl->start().node_id(),
                                                 !snarl->start().backward());
            }
            
            // mark all the exits and the entry point as untraversable
            unordered_set<handle_t> stacked{oriented_start, oriented_end, graph->flip(oriented_start)};
            vector<handle_t> stack{oriented_start};
            
            // make an index for jumping over the inside of child snarls
            unordered_map<handle_t, handle_t> child_snarl_skips;
            for (const Snarl* child : snarl_manager->children_of(snarl)) {
                handle_t start = graph->get_handle(child->start().node_id(),
                                                   child->start().backward());
                handle_t end = graph->get_handle(child->end().node_id(),
                                                 child->end().backward());
                
                child_snarl_skips[start] = end;
                child_snarl_skips[graph->flip(end)] = graph->flip(start);
            }
            
            // traverse the subgraph and add it to the return value
            while (!stack.empty()) {
                handle_t handle = stack.back();
                stack.pop_back();
                
                pair<id_t, bool> trav = make_pair(graph->get_id(handle),
                                                  graph->get_is_reverse(handle));
                
                if (!return_val.count(trav)) {
                    return_val[trav] = pair<uint64_t, uint64_t>(0, graph->get_length(handle));
                }
                
                if (child_snarl_skips.count(handle)) {
                    // skip over the internals of the child snarl we're pointing into
                    handle_t next = child_snarl_skips[handle];
                    if (!stacked.count(next)) {
                        stack.push_back(next);
                        stacked.insert(next);
                    }
                }
                else {
                    // traverse edges
                    graph->follow_edges(handle, false, [&](const handle_t& next) {
                        if (!stacked.count(next)) {
                            stack.push_back(next);
                            stacked.insert(next);
                        }
                    });
                }
            }
        }
        
        if (gff_record.strand_is_rev) {
            // we did all our queries with respect to the forward strand, flip it back to the reverse
            map<pair<id_t, bool>, pair<uint64_t, uint64_t>> reversed_map;
            for (const auto& record : return_val) {
                uint64_t node_length = graph->get_length(graph->get_handle(record.first.first));
                reversed_map[make_pair(record.first.first, !record.first.second)] = make_pair(node_length - record.second.second,
                                                                                              node_length - record.second.first);
            }
            return_val = move(reversed_map);
        }
        
        return return_val;
    }
}