File: dozeu_interface.hpp

package info (click to toggle)
vg 1.30.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (342 lines) | stat: -rw-r--r-- 15,494 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/**
 * @file dozeu_interface.hpp
 * @author Hajime Suzuki
 * @date 2018/03/23
 */
#ifndef VG_DOZEU_INTERFACE_HPP_INCLUDED
#define VG_DOZEU_INTERFACE_HPP_INCLUDED

#include <algorithm>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

#include <vg/vg.pb.h>
#include "types.hpp"
#include "handle.hpp"
#include "mem.hpp"

// #define BENCH
// #include "bench.h"

// forward declarations of dozeu structs
struct dz_s;
struct dz_forefront_s;
struct dz_query_s;
struct dz_alignment_s;

namespace vg {

static constexpr uint16_t default_xdrop_max_gap_length = 40;

/**
 * Align to a graph using the xdrop algorithm, as implemented in dozeu.
 *
 * The underlying Dozeu library is fundamentally based around semi-global
 * alignment: extending an alignment from a known matching position (what
 * in other parts of vg we call "pinned" alignment).
 *
 * To simulate non-pinned alignment, we align in two passes in different
 * directions. One from a guess of a pinning position, to get a more
 * accurate "head" pinning position for the other end, and once back from
 * where the previous pass ended up, to get an overall hopefully-optimal
 * alignment.
 *
 * If the input graph is not reverse-complemented, direction = false
 * (reverse, right to left) on the first pass, and direction = true
 * (forward, left to right) on the second. If it is reverse complemented,
 * we flip them.
 *
 * This won't actually work in theory to get the optimal local alignment in
 * all cases, but it works well in practice.
 *
 * This class maintains an internal dz_s, which is *NOT THREADSAFE*,
 * and non-const during alignments. However, it may be reused for
 * subsequent alignments.
 */
class DozeuInterface {
    
public:
    
    virtual ~DozeuInterface() = default;
    /**
     * align query: forward-backward banded alignment
     *
     * Compute an alignment of the given Alignment's sequence against the
     * given DAG, using (one of) the given MEMs to seed the alignment.
     *
     * reverse_complemented is true if the topologically sorted graph we
     * have was reverse-complemented when extracted from a larger
     * containing graph, and false if it is in the same orientation as it
     * exists in the larger containing graph. The MEMs and the Alignment
     * are interpreted as being against the forward strand of the passed
     * subgraph no matter the value of this setting.
     *
     * reverse_complemented true means we will compute the alignment
     * forward in the topologically-sorted order of the given graph
     * (anchoring to the first node if no MEMs are provided) and false if
     * we want to compute the alignment backward in the topological order
     * (anchoring to the last node).
     *
     * First the head (the most upstream) seed in MEMs is selected and
     * extended downward to detect the downstream breakpoint. Next the
     * alignment path is generated by second upward extension from the
     * downstream breakpoint.
     *
     * The MEM list may be empty. If MEMs are provided, uses only the
     * begin, end, and nodes fields of the MaximalExactMatch objects. It
     * uses the first occurrence of the last MEM if reverse_complemented is
     * true, and the last occurrence of the first MEM otherwise.
     */
    void align(Alignment& alignment, const HandleGraph& graph, const vector<MaximalExactMatch>& mems,
               bool reverse_complemented, int8_t full_length_bonus,
               uint16_t max_gap_length = default_xdrop_max_gap_length);
    
    /**
     * Same as above except using a precomputed topological order, which
     * need not include all handles in the graph, and which may contain both
     * orientations of a handle.
     */
    void align(Alignment& alignment, const HandleGraph& graph, const vector<handle_t>& order,
               const vector<MaximalExactMatch>& mems, bool reverse_complemented,
               int8_t full_length_bonus, uint16_t max_gap_length = default_xdrop_max_gap_length);
    
    /**
     * Compute a pinned alignment, where the start (pin_left=true) or end
     * (pin_left=false) end of the Alignment sequence is pinned to the
     * start of the first (pin_left=true) or end of the last
     * (pin_left=false) node in the graph's topological order.
     *
     * Does not account for multiple sources/sinks in the topological
     * order; whichever comes first/last ends up being used for the pin.
     */
    void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left,
                      int8_t full_length_bonus, uint16_t max_gap_length = default_xdrop_max_gap_length);
    
protected:
    /**
     * Represents a correspondance between a position in the subgraph we are
     * mapping to and a position in the read we are mapping.
     */
    struct graph_pos_s {
        /// What index in the node list of our extracted subgraph is our node at?
        size_t node_index;
        /// What is the offset in the node? Note that we only think about the forward strand.
        uint32_t ref_offset;
        /// What is the correspondign offset in the query sequence?
        uint32_t query_offset;
    };
    
    /**
     * Represents a HandleGraph with a defined (topological) order calculated for it.
     */
    struct OrderedGraph {
        OrderedGraph(const HandleGraph& graph, const vector<handle_t>& order);
        void for_each_neighbor(const size_t i, bool go_left, const function<void(size_t)>& lambda) const;
        size_t size() const;
        
        const HandleGraph& graph;
        const vector<handle_t>& order;
        unordered_map<handle_t, size_t> index_of;
    };
    
    // wrappers for dozeu functions that can be used to toggle between between quality
    // adjusted and standard alignments
    virtual dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual,
                                           int8_t full_length_bonus, size_t len) = 0;
    virtual dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual,
                                           int8_t full_length_bonus, size_t len) = 0;
    virtual const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts,
                                       size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid,
                                       uint16_t xt) = 0;
    virtual const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts,
                                         size_t n_forefronts, const char* ref, int32_t rlen,
                                         uint32_t rid, uint16_t xt) = 0;
    virtual dz_alignment_s* trace(const dz_forefront_s* forefront) = 0;
    virtual void flush() = 0;
    
    /// Given the subgraph we are aligning to, the MEM hist against it, the
    /// length of the query, and the direction we are aligning the query in
    /// (true = forward), select a single anchoring match between the graph
    /// and the query to align out from.
    ///
    /// This replaces scan_seed_position for the case where we have MEMs.
    graph_pos_s calculate_seed_position(const OrderedGraph& graph, const vector<MaximalExactMatch>& mems,
                                        size_t query_length, bool direction) const;
    /// Given the index of the node at which the winning score occurs, find
    /// the position in the node and read sequence at which the winning
    /// match is found.
    graph_pos_s calculate_max_position(const OrderedGraph& graph, const graph_pos_s& seed_pos,
                                       size_t max_node_index, bool direction,
                                       const vector<const dz_forefront_s*>& forefronts);
    
    /// If no seeds are provided as alignment input, we need to compute our own starting anchor position. This function does that.
    /// Takes the topologically-sorted graph, the query sequence, and the direction.
    /// If direction is false, finds a seed hit on the first node of the graph. If it is true, finds a hit on the last node.
    ///
    /// This replaces calculate_seed_position for the case where we have no MEMs.
    ///
    /// The bool return with the position indicates whether the scan succeeded or failed.
    /// If the scan failed, then the alignment should not be attempted.
    pair<graph_pos_s, bool> scan_seed_position(const OrderedGraph& graph, const Alignment& alignment,
                                               bool direction, vector<const dz_forefront_s*>& forefronts,
                                               int8_t full_length_bonus, uint16_t max_gap_length);
    
    /// Append an edit at the end of the current mapping array.
    /// Returns the length passed in.
    size_t push_edit(Mapping *mapping, uint8_t op, const char* alt, size_t len) const;
    
    /// Do alignment. Takes the graph, the sorted packed edges in
    /// ascending order for a forward pass or descending order for a
    /// reverse pass, the packed query sequence, the index of the seed node
    /// in the graph, the offset (TODO: in the read?) of the seed position,
    /// and the direction to traverse the graph topological order.
    ///
    /// Note that we take our direction as right_to_left, whole many other
    /// functions take it as left_to_right.
    ///
    /// If a MEM seed is provided, this is run in two passes. The first is
    /// left to right (right_to_left = false) if align did not have
    /// reverse_complement set and the second is right to left (right_to_left =
    /// true).
    ///
    /// If we have no MEM seed, we only run one pass (the second one).
    ///
    /// Returns the index in the topological order of the node with the
    /// highest scoring alignment.
    ///
    /// Note that if no non-empty local alignment is found, it may not be
    /// safe to call dz_calc_max_qpos on the associated forefront!
    size_t do_poa(const OrderedGraph& graph, const dz_query_s* packed_query,
                  const vector<graph_pos_s>& seed_positions, bool right_to_left,
                  vector<const dz_forefront_s*>& forefronts, uint16_t);
    
    /**
     * After all the alignment work has been done, do the traceback and
     * save into the given Alignment object.
     *
     * If left_to_right is true, the nodes were filled left to right, and
     * the internal traceback will come out in left to right order, so we
     * can emit it as is. If it is false, the nodes were filled right to
     * left, and the internal traceback comes out in right to left order,
     * so we need to flip it.
     */
    void calculate_and_save_alignment(Alignment& alignment, const OrderedGraph& graph,
                                      const vector<graph_pos_s>& head_positions,
                                      size_t tail_node_index, bool left_to_right,
                                      const vector<const dz_forefront_s*>& forefronts);
    
    // void debug_print(Alignment const &alignment, OrderedGraph const &graph, MaximalExactMatch const &seed, bool reverse_complemented);
    // bench_t bench;
    
    /// After doing the upward pass and finding head_pos to anchor from, do
    /// the downward alignment pass and traceback. If left_to_right is
    /// set, goes left to right and traces back the other way. If it is
    /// unset, goes right to left and traces back the other way.
    void align_downward(Alignment &alignment, const OrderedGraph& graph,
                        const vector<graph_pos_s>& head_positions,
                        bool left_to_right, vector<const dz_forefront_s*>& forefronts,
                        int8_t full_length_bonus, uint16_t max_gap_length);
    
    
    /// The core dozeu class, which does the alignments
    dz_s* dz = nullptr;
};

/*
 * A dozeu-backed X-drop aligner that does not use base qualities
 * to adjust alignment scores.
 */
class XdropAligner : public DozeuInterface {
public:
    
    /// Main constructor. Expects a 4 x 4 score matrix.
    XdropAligner(const int8_t* _score_matrix,
                 int8_t _gap_open,
                 int8_t _gap_extension);
    
    // see DozeuInterface::align and DozeuInterface::align_pinned below for alignment
    // interface
    
private:
    
    // implementations of virtual functions from DozeuInterface
    dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len);
    dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len);
    const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts,
                               size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid,
                               uint16_t xt);
    const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts,
                                 size_t n_forefronts, const char* ref, int32_t rlen,
                                 uint32_t rid, uint16_t xt);
    dz_alignment_s* trace(const dz_forefront_s* forefront);
    void flush();
    
public:
    
    XdropAligner() = default;
    ~XdropAligner(void);
    
    /// Copy constructor
    XdropAligner(const XdropAligner& other);
    /// Copy assignment
    XdropAligner& operator=(const XdropAligner& other);
    /// Move constructor
    XdropAligner(XdropAligner&& other);
    /// Move assignment
    XdropAligner& operator=(XdropAligner&& other);
};

/*
 * A dozeu-backed X-drop aligner that uses base qualities to adjust
 * alignment scores.
 */
class QualAdjXdropAligner : public DozeuInterface {
public:
    
    /// Main constructor. Expects a 4 x 4 score matrix and a 4 x 4 x 64 quality adjusted matrix
    QualAdjXdropAligner(const int8_t* _score_matrix,
                        const int8_t* _qual_adj_score_matrix,
                        int8_t _gap_open,
                        int8_t _gap_extension);
    
    
    // see DozeuInterface::align and DozeuInterface::align_pinned below for alignment
    // interface
    
private:
    
    // implementations of virtual functions from DozeuInterface
    dz_query_s* pack_query_forward(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len);
    dz_query_s* pack_query_reverse(const char* seq, const uint8_t* qual, int8_t full_length_bonus, size_t len);
    const dz_forefront_s* scan(const dz_query_s* query, const dz_forefront_s** forefronts,
                                       size_t n_forefronts, const char* ref, int32_t rlen, uint32_t rid,
                                       uint16_t xt);
    const dz_forefront_s* extend(const dz_query_s* query, const dz_forefront_s** forefronts,
                                         size_t n_forefronts, const char* ref, int32_t rlen,
                                         uint32_t rid, uint16_t xt);
    dz_alignment_s* trace(const dz_forefront_s* forefront);
    void flush();
    
public:
    
    QualAdjXdropAligner() = default;
    ~QualAdjXdropAligner(void);
    
    /// Copy constructor
    QualAdjXdropAligner(const QualAdjXdropAligner& other);
    /// Copy assignment
    QualAdjXdropAligner& operator=(const QualAdjXdropAligner& other);
    /// Move constructor
    QualAdjXdropAligner(QualAdjXdropAligner&& other);
    /// Move assignment
    QualAdjXdropAligner& operator=(QualAdjXdropAligner&& other);
};

} // end of namespace vg

#endif // VG_DOZEU_INTERFACE_HPP_INCLUDED
/**
 * end of dozeu_interface.hpp
 */