File: variant_adder.hpp

package info (click to toggle)
vg 1.30.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 267,848 kB
  • sloc: cpp: 446,974; ansic: 116,148; python: 22,805; cs: 17,888; javascript: 11,031; sh: 5,866; makefile: 4,039; java: 1,415; perl: 1,303; xml: 442; lisp: 242
file content (213 lines) | stat: -rw-r--r-- 8,057 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#ifndef VG_VARIANT_ADDER_HPP_INCLUDED
#define VG_VARIANT_ADDER_HPP_INCLUDED

/** \file
 * variant_adder.hpp: defines a tool class used for adding variants from VCF
 * files into existing graphs.
 */

#include "vcf_buffer.hpp"
#include "path_index.hpp"
#include "vg.hpp"
#include "xg.hpp"
#include "progressive.hpp"
#include "name_mapper.hpp"
#include "graph_synchronizer.hpp"
#include "build_index.hpp"

namespace vg {

using namespace std;

/**
 * A tool class for adding variants to a VG graph. Integrated NameMapper
 * provides name translation for the VCF contigs.
 */
class VariantAdder : public NameMapper, public Progressive {

public:
    
    /**
     * Make a new VariantAdder to add variants to the given graph. Modifies the
     * graph in place.
     */
    VariantAdder(VG& graph);
    
    /**
     * Add in the variants from the given non-null VCF file. The file must be
     * freshly opened. The variants in the file must be sorted.
     *
     * May be called from multiple threads. Synchronizes internally on the
     * graph.
     */
    void add_variants(vcflib::VariantCallFile* vcf);
    
    /**
     * Align the given string to the given graph, wetween the given endpoints,
     * using the most appropriate alignment method, depending on the relative
     * sizes involved and whether a good alignment exists. max_span gives the
     * maximum length in the graph that we expect our string to possibly align
     * over (for cases of large deletions, where we might want to follow a long
     * path in the graph).
     *
     * The endpoints have to be heads/tails of the graph.
     *
     * Treats N/N substitutions as matches.
     *
     * TODO: now that we have a smart aligner that can synthesize deletions
     * without finding them with the banded global aligner, do we need max_span
     * anymore?
     *
     * Mostly exposed for testability.
     */
    Alignment smart_align(vg::VG& graph, pair<NodeSide, NodeSide> endpoints, const string& to_align, size_t max_span);
    
    /**
     * Turn any N/N substitutions in the given alignment against the given graph into matches.
     * Modifies the alignment in place.
     */
    void align_ns(vg::VG& graph, Alignment& aln);
    
    /// What's the maximum node size we should produce, and the size we should
    /// chop the input graph to? Since alt sequences are forced out to node
    /// boundaries, it makes sense for this to be small relative to
    /// whole_alignment_cutoff.
    size_t max_node_size = 32;
    
    /// How wide of a range in bases should we look for nearby variants in?
    size_t variant_range = 50;
    
    /// How much additional context should we try and add outside the radius of
    /// our group of variants we actually find?
    size_t flank_range = 100;
    
    /// Should we accept and ignore VCF contigs that we can't find in the graph?
    bool ignore_missing_contigs = false;
    
    /// What's the max radius on a variant we can have in order to use that
    /// variant as context for another main variant?
    size_t max_context_radius = 50;
    
    /// What's the cut-off for the graph's size or the alt's size in bp under
    /// which we can just use permissive banding and large band padding? If
    /// either is larger than this, we use the pinned-alignment-based do-each-
    /// end-and-splice mode.
    size_t whole_alignment_cutoff = 4096;
    
    /// When we're above that cutoff, what amount of band padding can we use
    /// looking for an existing version of our sequence?
    size_t large_alignment_band_padding = 30;
    
    /// When we're doing a restricted band padding alignment, how good does it
    /// have to be, as a fraction of the perfect match score for the whole
    /// context, in order to use it?
    double min_score_factor = 0.95;
    
    /// If the restricted band alignment doesn't find anything, we resort to
    /// pinned alignments from the ends and cutting and pasting together. How
    /// big should each pinned tail be?
    size_t pinned_tail_size = 200;
    
    /// We use this Aligner to hold the scoring parameters. It may be accessed
    /// by multiple threads at once.
    Aligner aligner;
    
    /// Sometimes, we have to make Mappers, for graphs too big to safely use our
    /// global banded aligner on. If we do that, what max edge crossing limit
    /// should we use for simplification?
    size_t edge_max = 0;
    /// What base kmer size should we use?
    size_t kmer_size = 16;
    /// What number of doubling steps should we use?
    size_t doubling_steps = 3;
    /// If nonzero, prune short subgraphs smaller than this before GCSA2-indexing
    size_t subgraph_prune = 0;
    
    // What's the cut-off above which we don't try thin banded aligners and do a
    // Mapper instead?
    size_t thin_alignment_cutoff = 10000;
    
    // What's the cutoff at which we decide to not even try the mapper-based
    // aligner (in query bases), and just resort to inserting duplicate
    // insertions?
    // TODO: This is disabled for now
    size_t mapper_alignment_cutoff = 0;
    
    /// We have code to skip large structural duplications, because aligners
    /// won't be able to distinguish the copies. TODO: we want to actually make
    /// them into cycles.
    bool skip_structural_duplications = true;
    
    /// Should we print out periodic updates about the variants we have
    /// processed?
    bool print_updates = false;
    
protected:
    /// The graph we are modifying
    VG& graph;
    
    /// We keep a GraphSynchronizer so we can have multiple threads working on
    /// different parts of the same graph.
    GraphSynchronizer sync;
    
    /// We cache the set of valid path names, so we can detect/skip missing ones
    /// without locking the graph.
    set<string> path_names;
    
    /**
     * Get all the unique combinations of variant alts represented by actual
     * haplotypes. Arbitrarily phases unphased variants.
     *
     * Can (and should) take a WindowedVcfBuffer that owns the variants, and
     * from which cached pre-parsed genotypes can be extracted.
     *
     * Returns a set of vectors or one number per variant, giving the alt number
     * (starting with 0 for reference) that appears on the haplotype.
     *
     * TODO: ought to just take a collection of pre-barsed genotypes, but in an
     * efficient way (a vector of pointers to vectors of sample genotypes?)
     */
    set<vector<int>> get_unique_haplotypes(const vector<vcflib::Variant*>& variants, WindowedVcfBuffer* cache = nullptr) const;
    
    /**
     * Convert a haplotype on a list of variants into a string. The string will
     * run from the start of the first variant through the end of the last
     * variant.
     *
     * Can't be const because it relies on non-const operations on the
     * synchronizer.
     */
    string haplotype_to_string(const vector<int>& haplotype, const vector<vcflib::Variant*>& variants);
    
    /**
     * Get the radius of the variant around its center: the amount of sequence
     * that needs to be pulled out to make sure you have the ref and all the
     * alts, if they exist. This is just going to be twice the longest of the
     * ref and the alts.
     */
    static size_t get_radius(const vcflib::Variant& variant);
    
    /**
     * Get the center position of the given variant.
     */
    static size_t get_center(const vcflib::Variant& variant);
    
    /**
     * Get the center and radius around that center needed to extract everything
     * that might be involved in a group of variants.
     */
    static pair<size_t, size_t> get_center_and_radius(const vector<vcflib::Variant*>& variants);
    
    /**
     * Glom all the given variants into one vector, throwing out variants from
     * the before and after vectors that are too big to be in a context.
     */
    vector<vcflib::Variant*> filter_local_variants(const vector<vcflib::Variant*>& before,
        vcflib::Variant* variant, const vector<vcflib::Variant*>& after) const;
     
};

}

#endif