1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
#include <string>
#include <vector>
#include <set>
#include <fstream>
#include <sstream>
#include <iostream>
#include <memory>
#include <algorithm>
#include <libcxx/sort.hpp>
#include <boost/optional/optional.hpp>
#include "getopt_pp/getopt_pp.h"
#include "kmc_api/kmc_file.h"
//#include "omp.h"
#include "io/kmers/mmapped_reader.hpp"
#include "utils/filesystem/path_helper.hpp"
#include "utils/stl_utils.hpp"
#include "utils/ph_map/perfect_hash_map_builder.hpp"
#include "utils/kmer_mph/kmer_splitters.hpp"
#include "logger.hpp"
using std::string;
using std::vector;
const string KMER_PARSED_EXTENSION = ".bin";
const string KMER_SORTED_EXTENSION = ".sorted";
class KmerMultiplicityCounter {
size_t k_, sample_cnt_;
std::string file_prefix_;
//TODO: get rid of intermediate .bin file
string ParseKmc(const string& filename) {
CKMCFile kmcFile;
kmcFile.OpenForListing(filename);
CKmerAPI kmer((unsigned int) k_);
uint32 count;
std::string parsed_filename = filename + KMER_PARSED_EXTENSION;
std::ofstream output(parsed_filename, std::ios::binary);
while (kmcFile.ReadNextKmer(kmer, count)) {
RtSeq seq(k_, kmer.to_string());
seq.BinWrite(output);
seq_element_type tmp = count;
output.write((char*) &(tmp), sizeof(seq_element_type));
}
output.close();
return parsed_filename;
}
string SortKmersCountFile(const string& filename) {
MMappedRecordArrayReader<seq_element_type> ins(filename, RtSeq::GetDataSize(k_) + 1, false);
libcxx::sort(ins.begin(), ins.end(), adt::array_less<seq_element_type>());
std::string sorted_filename = filename + KMER_SORTED_EXTENSION;
std::ofstream out(sorted_filename);
out.write((char*) ins.data(), ins.data_size());
out.close();
remove(filename.c_str());
return sorted_filename;
}
bool ReadKmerWithCount(std::ifstream& infile, std::pair<RtSeq, uint32>& res) {
RtSeq seq(res.first.size());
if (!seq.BinRead(infile)) {
return false;
}
seq_element_type tmp;
infile.read((char*) &tmp, sizeof(seq_element_type));
res = {seq, (uint32) tmp};
return true;
}
fs::TmpFile FilterCombinedKmers(fs::TmpDir workdir, const std::vector<string>& files, size_t all_min) {
size_t n = files.size();
vector<std::unique_ptr<ifstream>> infiles;
infiles.reserve(n);
for (auto fn : files) {
INFO("Processing " << fn);
auto parsed = ParseKmc(fn);
auto sorted = SortKmersCountFile(parsed);
infiles.emplace_back(new std::ifstream(sorted));
}
vector<std::pair<RtSeq, uint32>> top_kmer(n, {RtSeq(k_), 0});
vector<bool> alive(n, false);
for (size_t i = 0; i < n; i++) {
alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
}
auto kmer_file = fs::tmp::make_temp_file("kmer", workdir);
typedef uint16_t Mpl;
std::ofstream output_kmer(*kmer_file, std::ios::binary);
std::ofstream mpl_file(file_prefix_ + ".bpr", std::ios_base::binary);
RtSeq::less3 kmer_less;
while (true) {
boost::optional<RtSeq> min_kmer;
size_t cnt_min = 0;
for (size_t i = 0; i < n; ++i) {
if (alive[i]) {
RtSeq& cur_kmer = top_kmer[i].first;
if (!min_kmer || kmer_less(cur_kmer, *min_kmer)) {
min_kmer = cur_kmer;
cnt_min = 0;
}
if (cur_kmer == *min_kmer) {
cnt_min++;
}
}
}
if (!min_kmer) {
break;
}
if (cnt_min >= all_min) {
std::vector<uint32> cnt_vector(n, 0);
min_kmer.get().BinWrite(output_kmer);
for (size_t i = 0; i < n; ++i) {
if (alive[i] && top_kmer[i].first == *min_kmer) {
cnt_vector[i] += top_kmer[i].second;
}
}
for (size_t mpl : cnt_vector) {
mpl_file.write(reinterpret_cast<char *>(&mpl), sizeof(Mpl));
}
}
for (size_t i = 0; i < n; ++i) {
if (alive[i] && top_kmer[i].first == *min_kmer) {
alive[i] = ReadKmerWithCount(*infiles[i], top_kmer[i]);
}
}
}
return kmer_file;
}
void BuildKmerIndex(fs::TmpDir workdir, fs::TmpFile kmer_file, size_t sample_cnt, size_t nthreads) {
INFO("Initializing kmer profile index");
typedef size_t Offset;
using namespace utils;
static const size_t read_buffer_size = 0; //FIXME some buffer size
DeBruijnKMerKMerSplitter<StoringTypeFilter<InvertableStoring>>
splitter(workdir, k_, k_, true, read_buffer_size);
splitter.AddKMers(*kmer_file);
KMerDiskCounter<RtSeq> counter(workdir, splitter);
KeyStoringMap<RtSeq, Offset, kmer_index_traits<RtSeq>, InvertableStoring> kmer_mpl(k_);
BuildIndex(kmer_mpl, counter, 16, nthreads);
INFO("Built index with " << kmer_mpl.size() << " kmers");
//Building kmer->profile offset index
std::ifstream kmers_in(*kmer_file, std::ios::binary);
InvertableStoring::trivial_inverter<Offset> inverter;
RtSeq kmer(k_);
for (Offset offset = 0; ; offset += sample_cnt) {
kmer.BinRead(kmers_in);
if (kmers_in.fail()) {
break;
}
// VERIFY(kmer_str.length() == k_);
// conj_graph_pack::seq_t kmer(k_, kmer_str.c_str());
// kmer = gp_.kmer_mapper.Substitute(kmer);
auto kwh = kmer_mpl.ConstructKWH(kmer);
VERIFY(kmer_mpl.valid(kwh));
kmer_mpl.put_value(kwh, offset, inverter);
}
std::ofstream map_file(file_prefix_ + ".kmm", std::ios_base::binary | std::ios_base::out);
kmer_mpl.BinWrite(map_file);
INFO("Saved kmer profile map");
}
public:
KmerMultiplicityCounter(size_t k, std::string file_prefix):
k_(k), file_prefix_(std::move(file_prefix)) {
}
void CombineMultiplicities(const vector<string>& input_files, size_t min_samples, const string& tmpdir, size_t nthreads = 1) {
auto workdir = fs::tmp::make_temp_dir(tmpdir, "kmidx");
auto kmer_file = FilterCombinedKmers(workdir, input_files, min_samples);
BuildKmerIndex(workdir, kmer_file, input_files.size(), nthreads);
}
private:
DECL_LOGGER("KmerMultiplicityCounter");
};
void PrintUsageInfo() {
std::cout << "Usage: kmer_multiplicity_counter [options] -f files_dir" << std::endl;
std::cout << "Options:" << std::endl;
std::cout << "-k - kmer length" << std::endl;
std::cout << "-n - sample count" << std::endl;
std::cout << "-o - output file prefix" << std::endl;
std::cout << "-t - number of threads (default: 1)" << std::endl;
std::cout << "-s - minimal number of samples to contain kmer" << std::endl;
std::cout << "files_dir must contain two files (.kmc_pre and .kmc_suf) with kmer multiplicities for each sample from 1 to n" << std::endl;
}
int main(int argc, char *argv[]) {
using namespace GetOpt;
create_console_logger();
size_t k, sample_cnt, min_samples, nthreads;
string output, work_dir;
try {
GetOpt_pp ops(argc, argv);
ops.exceptions_all();
ops >> Option('k', k)
>> Option('n', sample_cnt)
>> Option('s', min_samples)
>> Option('o', output)
>> Option('t', "threads", nthreads, size_t(1))
>> Option('f', work_dir)
;
} catch(GetOptEx &ex) {
PrintUsageInfo();
exit(1);
}
std::vector<string> input_files;
for (size_t i = 1; i <= sample_cnt; ++i) {
input_files.push_back(work_dir + "/sample" + std::to_string(i));
}
KmerMultiplicityCounter kmcounter(k, output);
kmcounter.CombineMultiplicities(input_files, min_samples, work_dir, nthreads);
return 0;
}
|