File: BloomIO.h

package info (click to toggle)
abyss 2.3.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 8,284 kB
  • sloc: cpp: 78,182; ansic: 6,512; makefile: 2,252; perl: 672; sh: 509; haskell: 412; python: 4
file content (122 lines) | stat: -rw-r--r-- 3,071 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#ifndef BLOOM_IO_H
#define BLOOM_IO_H 1

#include "BloomDBG/RollingHash.h"
#include "BloomDBG/RollingHashIterator.h"
#include "DataLayer/FastaReader.h"
#include "vendor/btl_bloomfilter/BloomFilter.hpp"

namespace BloomDBG {

/**
 * Round up `num` to the nearest multiple of `base`.
 */
template<typename T>
inline static T
roundUpToMultiple(T num, T base)
{
	if (base == 0)
		return num;
	T remainder = num % base;
	if (remainder == 0)
		return num;
	return num + base - remainder;
}

/**
 * Load DNA sequence into Bloom filter using rolling hash.
 *
 * @param bloom target Bloom filter
 * @param seq DNA sequence
 */
template<typename BF>
inline static void
loadSeq(BF& bloom, const std::string& seq)
{
	const unsigned k = bloom.getKmerSize();
	const unsigned numHashes = bloom.getHashNum();
	for (RollingHashIterator it(seq, numHashes, k); it != RollingHashIterator::end(); ++it) {
		bloom.insert(*it);
	}
}

/**
 * Load sequences contents of FASTA file into Bloom filter using
 * rolling hash.
 * @param bloom target Bloom filter
 * @param path path to FASTA file
 * @param verbose if true, print progress messages to STDERR
 */
template<typename BF>
inline static void
loadFile(BF& bloom, const std::string& path, bool verbose = false)
{
	const size_t BUFFER_SIZE = 1000000;
	const size_t LOAD_PROGRESS_STEP = 10000;

	assert(!path.empty());
	if (verbose)
		std::cerr << "Reading `" << path << "'..." << std::endl;

	FastaReader in(path.c_str(), FastaReader::FOLD_CASE);
	uint64_t readCount = 0;
#pragma omp parallel
	for (std::vector<std::string> buffer(BUFFER_SIZE);;) {
		buffer.clear();
		size_t bufferSize = 0;
		bool good = true;
#pragma omp critical(in)
		for (; good && bufferSize < BUFFER_SIZE;) {
			std::string seq;
			good = in >> seq;
			if (good) {
				buffer.push_back(seq);
				bufferSize += seq.length();
			}
		}
		if (buffer.size() == 0)
			break;
		for (size_t j = 0; j < buffer.size(); j++) {
			loadSeq(bloom, buffer.at(j));
			if (verbose)
#pragma omp critical(cerr)
			{
				readCount++;
				if (readCount % LOAD_PROGRESS_STEP == 0)
					std::cerr << "Loaded " << readCount << " reads into Bloom filter\n";
			}
		}
	}
	assert(in.eof());
	if (verbose) {
		std::cerr << "Loaded " << readCount << " reads from `" << path << "` into Bloom filter\n";
	}
}

/** Load FASTQ/FASTA/SAM/BAM files from command line into a Bloom filter */
template<typename BloomFilterT>
static inline void
loadBloomFilter(int argc, char** argv, BloomFilterT& bloom, bool verbose = false)
{
	/* load reads into Bloom filter */
	for (int i = optind; i < argc; ++i) {
		/*
		 * Debugging feature: If there is a ':'
		 * separating the list of input read files into
		 * two parts, use the first set of files
		 * to load the Bloom filter and the second
		 * set of files for the assembly (read extension).
		 */
		if (strcmp(argv[i], ":") == 0) {
			optind = i + 1;
			break;
		}
		BloomDBG::loadFile(bloom, argv[i], verbose);
	}
	if (verbose)
		cerr << "Bloom filter FPR: " << setprecision(3) << bloom.FPR() * 100 << "%" << endl;
}

} // end namespace 'BloomDBG'

#endif