File: vcfflatten.cpp

package info (click to toggle)
libvcflib 1.0.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 49,732 kB
  • sloc: cpp: 39,194; perl: 474; python: 321; ruby: 285; sh: 247; ansic: 198; makefile: 125; javascript: 94; lisp: 55
file content (187 lines) | stat: -rw-r--r-- 7,941 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/*
    vcflib C++ library for parsing and manipulating VCF files

    Copyright © 2010-2020 Erik Garrison
    Copyright © 2020      Pjotr Prins

    This software is published under the MIT License. See the LICENSE file.
*/

#include "Variant.h"
#include "convert.h"

using namespace std;
using namespace vcflib;


double convertStrDbl(const string& s) {
    double r;
    convert(s, r);
    return r;
}

int main(int argc, char** argv) {

    int maxAlleles = 2;

    VariantCallFile variantFile;

    if (argc > 1) {
        string filename = argv[1];
        if (filename == "--help" || filename == "-h") {
            cerr << "usage: vcfflatten [file]" << endl
                 << endl
                 << "Removes multi-allelic sites by picking the most common alternate.  Requires" << endl
                 << "allele frequency specification 'AF' and use of 'G' and 'A' to specify the" << endl
                 << "fields which vary according to the Allele or Genotype. VCF file may be" << endl
                 << "specified on the command line or piped as stdin." << endl;
            cerr << endl << "Type: transformation" << endl << endl;
            exit(1);
        }
        variantFile.open(filename);
    } else {
        variantFile.open(std::cin);
    }

    if (!variantFile.is_open()) {
        return 1;
    }

    cout << variantFile.header << endl;

    Variant var(variantFile);
    while (variantFile.getNextVariant(var)) {
        // count the number of alternates
        // if we have more than N, strip the lowest-frequency ones
        if (var.alleles.size() > maxAlleles) {

            multimap<double, string> alleleFrequencies;

            vector<string>& freqsstr = var.info["AF"];
            vector<double> freqs;
            freqs.resize(freqsstr.size());
            transform(freqsstr.begin(), freqsstr.end(), freqs.begin(), convertStrDbl);

            vector<double>::iterator f = freqs.begin();
            for (vector<string>::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++f) {
                alleleFrequencies.insert(pair<double, string>(*f, *a));
            }

            // pick the highest frequency alternate
            string bestalt = alleleFrequencies.rbegin()->second;
            // and get its index
            int bestaltIndex = var.getAltAlleleIndex(bestalt);
            int bestaltGenotypeIndex = bestaltIndex + 1; // per VCF spec

            // keep the RR, RA, and AA alleles for this alternate
            // generate the genotype index table for this variant
            map<pair<int, int>, int> genotypeIndexes = var.getGenotypeIndexesDiploid();

            // now get the genotype indexes we want to keep
            vector<int> alleleIndexes;
            alleleIndexes.push_back(0);
            alleleIndexes.push_back(bestaltGenotypeIndex);

            // add the reference allele index for generating genotype indexes
            int ploidy = 2;
            vector<vector<int> > genotypesToKeep = multichoose(ploidy, alleleIndexes);
            map<int, bool> genotypeIndexesToKeep;
            for (vector<vector<int> >::iterator k = genotypesToKeep.begin(); k != genotypesToKeep.end(); ++k) {
                pair<int, int> genotype = make_pair(k->front(), k->back()); // vectors are guaranteed to be diploid per multichoose
                genotypeIndexesToKeep[genotypeIndexes[genotype]] = true;
            }
            // we are diploid, so there should be exactly 3 genotypes
            assert(genotypeIndexesToKeep.size() == 3);

            // get the fields which have genotype order "G"
            // for all the infocounts
            // find the ones which are == GENOTYPE_NUMBER or ALLELE_NUMBER
            //     and fix em up
            for (map<string, int>::iterator c = variantFile.infoCounts.begin(); c != variantFile.infoCounts.end(); ++c) {
                int count = c->second;
                if (count == GENOTYPE_NUMBER) {
                    string key = c->first;
                    map<string, vector<string> >::iterator v = var.info.find(key);
                    if (v != var.info.end()) {
                        vector<string>& vals = v->second;
                        vector<string> tokeep;
                        int i = 0;
                        for (vector<string>::iterator g = vals.begin(); g != vals.end(); ++g, ++i) {
                            if (genotypeIndexesToKeep.find(i) != genotypeIndexesToKeep.end()) {
                                tokeep.push_back(*g);
                            }
                        }
                        vals = tokeep;
                    }
                } else if (count == ALLELE_NUMBER) {
                    string key = c->first;
                    map<string, vector<string> >::iterator v = var.info.find(key);
                    if (v != var.info.end()) {
                        vector<string>& vals = v->second;
                        vector<string> tokeep;
                        int i = 0;
                        for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
                            if (i == bestaltIndex) {
                                tokeep.push_back(*a);
                            }
                        }
                        vals = tokeep;
                    }
                }
            }
            //
            // for all the formatcounts
            // find the ones which are == GENOTYPE_NUMBER or ALLELE_NUMBER
            //     for each sample, remove the new irrelevant values

            // for each sample
            //   remove info fields which now refer to nothing
            for (map<string, int>::iterator c = variantFile.formatCounts.begin(); c != variantFile.formatCounts.end(); ++c) {
                int count = c->second;
                if (count == GENOTYPE_NUMBER) {
                    string key = c->first;
                    for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
                        map<string, vector<string> >& sample = s->second;
                        map<string, vector<string> >::iterator v = sample.find(key);
                        if (v != sample.end()) {
                            vector<string>& vals = v->second;
                            vector<string> tokeep;
                            int i = 0;
                            for (vector<string>::iterator g = vals.begin(); g != vals.end(); ++g, ++i) {
                                if (genotypeIndexesToKeep.find(i) != genotypeIndexesToKeep.end()) {
                                    tokeep.push_back(*g);
                                }
                            }
                            vals = tokeep;
                        }
                    }
                } else if (count == ALLELE_NUMBER) {
                    string key = c->first;
                    for (map<string, map<string, vector<string> > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) {
                        map<string, vector<string> >& sample = s->second;
                        map<string, vector<string> >::iterator v = sample.find(key);
                        if (v != sample.end()) {
                            vector<string>& vals = v->second;
                            vector<string> tokeep;
                            int i = 0;
                            for (vector<string>::iterator a = vals.begin(); a != vals.end(); ++a, ++i) {
                                if (i == bestaltIndex) {
                                    tokeep.push_back(*a);
                                }
                            }
                            vals = tokeep;
                        }
                    }
                }
            }

            var.alt.clear();
            var.alt.push_back(bestalt);

        }
        cout << var << endl;
    }

    return 0;

}