File: kmer_multiplicity_counter.cpp

package info (click to toggle)
spades 3.13.1%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 22,172 kB
  • sloc: cpp: 136,213; ansic: 48,218; python: 16,809; perl: 4,252; sh: 2,115; java: 890; makefile: 507; pascal: 348; xml: 303
file content (231 lines) | stat: -rw-r--r-- 8,300 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#include <string>
#include <vector>
#include <set>
#include <fstream>
#include <sstream>
#include <iostream>
#include <memory>
#include <algorithm>
#include <libcxx/sort.hpp>
#include <boost/optional/optional.hpp>
#include "getopt_pp/getopt_pp.h"
#include "kmc_api/kmc_file.h"
//#include "omp.h"
#include "io/kmers/mmapped_reader.hpp"
#include "utils/filesystem/path_helper.hpp"
#include "utils/stl_utils.hpp"
#include "utils/ph_map/perfect_hash_map_builder.hpp"
#include "utils/kmer_mph/kmer_splitters.hpp"
#include "logger.hpp"

using std::string;
using std::vector;

const string KMER_PARSED_EXTENSION = ".bin";
const string KMER_SORTED_EXTENSION = ".sorted";

class KmerMultiplicityCounter {

    size_t k_, sample_cnt_;
    std::string file_prefix_;

    //TODO: get rid of intermediate .bin file
    string ParseKmc(const string& filename) {
        CKMCFile kmcFile;
        kmcFile.OpenForListing(filename);
        CKmerAPI kmer((unsigned int) k_);
        uint32 count;
        std::string parsed_filename = filename + KMER_PARSED_EXTENSION;
        std::ofstream output(parsed_filename, std::ios::binary);
        while (kmcFile.ReadNextKmer(kmer, count)) {
            RtSeq seq(k_, kmer.to_string());
            seq.BinWrite(output);
            seq_element_type tmp = count;
            output.write((char*) &(tmp), sizeof(seq_element_type));
        }
        output.close();
        return parsed_filename;
    }

    string SortKmersCountFile(const string& filename) {
        MMappedRecordArrayReader<seq_element_type> ins(filename, RtSeq::GetDataSize(k_) + 1, false);
        libcxx::sort(ins.begin(), ins.end(), adt::array_less<seq_element_type>());
        std::string sorted_filename = filename + KMER_SORTED_EXTENSION;
        std::ofstream out(sorted_filename);
        out.write((char*) ins.data(), ins.data_size());
        out.close();
        remove(filename.c_str());
        return sorted_filename;
    }

    bool ReadKmerWithCount(std::ifstream& infile, std::pair<RtSeq, uint32>& res) {
        RtSeq seq(res.first.size());
        if (!seq.BinRead(infile)) {
            return false;
        }
        seq_element_type tmp;
        infile.read((char*) &tmp, sizeof(seq_element_type));
        res = {seq, (uint32) tmp};
        return true;
    }

    fs::TmpFile FilterCombinedKmers(fs::TmpDir workdir, const std::vector<string>& files, size_t all_min) {
        size_t n = files.size();
        vector<std::unique_ptr<ifstream>> infiles;
        infiles.reserve(n);
        for (auto fn : files) {
            INFO("Processing " << fn);
            auto parsed = ParseKmc(fn);
            auto sorted = SortKmersCountFile(parsed);
            infiles.emplace_back(new std::ifstream(sorted));
        }
        vector<std::pair<RtSeq, uint32>> top_kmer(n, {RtSeq(k_), 0});
        vector<bool> alive(n, false);

        for (size_t i = 0; i < n; i++) {
            alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
        }

        auto kmer_file = fs::tmp::make_temp_file("kmer", workdir);

        typedef uint16_t Mpl;
        std::ofstream output_kmer(*kmer_file, std::ios::binary);
        std::ofstream mpl_file(file_prefix_ + ".bpr", std::ios_base::binary);

        RtSeq::less3 kmer_less;
        while (true) {
            boost::optional<RtSeq> min_kmer;
            size_t cnt_min = 0;
            for (size_t i = 0; i < n; ++i) {
                if (alive[i]) {
                    RtSeq& cur_kmer = top_kmer[i].first;
                    if (!min_kmer || kmer_less(cur_kmer, *min_kmer)) {
                        min_kmer = cur_kmer;
                        cnt_min = 0;
                    }
                    if (cur_kmer == *min_kmer) {
                        cnt_min++;
                    }
                }
            }
            if (!min_kmer) {
                break;
            }
            if (cnt_min >= all_min) {
                std::vector<uint32> cnt_vector(n, 0);
                min_kmer.get().BinWrite(output_kmer);
                for (size_t i = 0; i < n; ++i) {
                    if (alive[i] && top_kmer[i].first == *min_kmer) {
                        cnt_vector[i] += top_kmer[i].second;
                    }
                }

                for (size_t mpl : cnt_vector) {
                    mpl_file.write(reinterpret_cast<char *>(&mpl), sizeof(Mpl));
                }
            }
            for (size_t i = 0; i < n; ++i) {
                if (alive[i] && top_kmer[i].first == *min_kmer) {
                    alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
                }
            }
        }
        return kmer_file;
    }

    void BuildKmerIndex(fs::TmpDir workdir, fs::TmpFile kmer_file, size_t sample_cnt, size_t nthreads) {
        INFO("Initializing kmer profile index");

        typedef size_t Offset;
        using namespace utils;

        static const size_t read_buffer_size = 0; //FIXME some buffer size
        DeBruijnKMerKMerSplitter<StoringTypeFilter<InvertableStoring>>
            splitter(workdir, k_, k_, true, read_buffer_size);
        splitter.AddKMers(*kmer_file);

        KMerDiskCounter<RtSeq> counter(workdir, splitter);
        KeyStoringMap<RtSeq, Offset, kmer_index_traits<RtSeq>, InvertableStoring> kmer_mpl(k_);
        BuildIndex(kmer_mpl, counter, 16, nthreads);
        INFO("Built index with " << kmer_mpl.size() << " kmers");

        //Building kmer->profile offset index
        std::ifstream kmers_in(*kmer_file, std::ios::binary);
        InvertableStoring::trivial_inverter<Offset> inverter;
        RtSeq kmer(k_);
        for (Offset offset = 0; ; offset += sample_cnt) {
            kmer.BinRead(kmers_in);
            if (kmers_in.fail()) {
                break;
            }

//            VERIFY(kmer_str.length() == k_);
//            conj_graph_pack::seq_t kmer(k_, kmer_str.c_str());
//            kmer = gp_.kmer_mapper.Substitute(kmer);

            auto kwh = kmer_mpl.ConstructKWH(kmer);
            VERIFY(kmer_mpl.valid(kwh));
            kmer_mpl.put_value(kwh, offset, inverter);
        }

        std::ofstream map_file(file_prefix_ + ".kmm", std::ios_base::binary | std::ios_base::out);
        kmer_mpl.BinWrite(map_file);
        INFO("Saved kmer profile map");
    }

public:
    KmerMultiplicityCounter(size_t k, std::string file_prefix):
        k_(k), file_prefix_(std::move(file_prefix)) {
    }

    void CombineMultiplicities(const vector<string>& input_files, size_t min_samples, const string& tmpdir, size_t nthreads = 1) {
        auto workdir = fs::tmp::make_temp_dir(tmpdir, "kmidx");
        auto kmer_file = FilterCombinedKmers(workdir, input_files, min_samples);
        BuildKmerIndex(workdir, kmer_file, input_files.size(), nthreads);
    }
private:
    DECL_LOGGER("KmerMultiplicityCounter");
};

void PrintUsageInfo() {
    std::cout << "Usage: kmer_multiplicity_counter [options] -f files_dir" << std::endl;
    std::cout << "Options:" << std::endl;
    std::cout << "-k - kmer length" << std::endl;
    std::cout << "-n - sample count" << std::endl;
    std::cout << "-o - output file prefix" << std::endl;
    std::cout << "-t - number of threads (default: 1)" << std::endl;
    std::cout << "-s - minimal number of samples to contain kmer" << std::endl;
    std::cout << "files_dir must contain two files (.kmc_pre and .kmc_suf) with kmer multiplicities for each sample from 1 to n" << std::endl;
}

int main(int argc, char *argv[]) {
    using namespace GetOpt;
    create_console_logger();

    size_t k, sample_cnt, min_samples, nthreads;
    string output, work_dir;

    try {
        GetOpt_pp ops(argc, argv);
        ops.exceptions_all();
        ops >> Option('k', k)
            >> Option('n', sample_cnt)
            >> Option('s', min_samples)
            >> Option('o', output)
            >> Option('t', "threads", nthreads, size_t(1))
            >> Option('f', work_dir)
        ;
    } catch(GetOptEx &ex) {
        PrintUsageInfo();
        exit(1);
    }

    std::vector<string> input_files;
    for (size_t i = 1; i <= sample_cnt; ++i) {
        input_files.push_back(work_dir + "/sample" + std::to_string(i));
    }

    KmerMultiplicityCounter kmcounter(k, output);
    kmcounter.CombineMultiplicities(input_files, min_samples, work_dir, nthreads);
    return 0;
}