1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
|
/* Using Boost.Bloom to check occurrence of DNA sequences in a genome.
*
* Copyright 2025 Joaquin M Lopez Munoz.
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*
* See https://www.boost.org/libs/bloom for library home page.
*/
#include <array>
#include <boost/bloom/filter.hpp>
#include <boost/bloom/fast_multiblock32.hpp>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string_view>
/* A k-mer is a sliding segment of size k over a sequence of DNA nucleotides
* (A, C, G, T). k_mer encodes the segment in a 64-bit word using 2 bits per
* nucleotide.
*/
template<std::size_t K>
struct k_mer
{
static_assert(
K >= 0 &&
2 * K <= sizeof(std::uint64_t) * CHAR_BIT);
static constexpr std::size_t size()
{
return K;
}
void reset()
{
data = 0;
}
/* shift the k-mer and append a new nucleotide */
k_mer& operator+=(char n)
{
static constexpr std::uint64_t mask=
(((std::uint64_t)1) << (2 * size())) - 1;
data <<= 2;
data &= mask;
data |= table[(unsigned char)n];
return *this;
}
std::uint64_t data = 0;
using table_type=std::array<unsigned char, UCHAR_MAX>;
static constexpr table_type table = [] {
table_type table{};
table['A'] = table['a'] = 0;
table['C'] = table['c'] = 1;
table['G'] = table['g'] = 2;
table['T'] = table['t'] = 3;
return table;
}();
};
template<std::size_t N>
std::size_t hash_value(const k_mer<N>& km)
{
/* k:mer::data is 8 bytes wide. We use it directly as the associated
* hash value in 64-bit mode, as std::size_t is the same size; in 32-bit
* mode, we XOR the high and low portions of data to make it fit into
* a std::size_t.
*/
if constexpr (sizeof(std::size_t) >= sizeof(std::uint64_t)) {
return (std::size_t)km.data;
}
else{ /* 32-bit mode */
return (std::size_t)(km.data ^ (km.data >> 32));
}
}
/* Insert all the k-mers of a given genome in a boost::bloom::filter.
* Assumed format is FASTA with A, C, G, T.
* https://en.wikipedia.org/wiki/FASTA_format
*/
using genome_filter = boost::bloom::filter<
k_mer<20>, /* using k-mers of length 20 */
1, boost::bloom::fast_multiblock32<8> >;
genome_filter make_genome_filter(const char* filename)
{
using k_mer = genome_filter::value_type;
std::ifstream in(filename, std::ios::ate); /* open at end to tell size */
if(!in) throw std::runtime_error("can't open file");
/* As a rough estimation, we assume that the number of k-mers
* is approximately equal to the length of the genome --this is
* overpessimistic due to the likely presence of duplicate k-mers.
* We set FPR = 1%.
*/
genome_filter f((std::size_t)in.tellg(), 0.01);
in.seekg(0);
std::string line;
std::size_t width = 0;
k_mer km;
while(std::getline(in, line)) {
if(line.empty()) continue;
if(line[0] == '>') { /* annotation lines in the FASTA format */
width = 0;
km.reset();
continue;
}
std::size_t i = 0;
/* don't insert km till it has km.size() nucleotides */
for(; width< km.size() - 1 && i < line.size(); ++i) {
km += line[i];
++width;
}
for(; i < line.size(); ++i) {
km += line[i];
f.insert(km);
}
}
return f;
}
/* We estimate a DNA sequence seq to be contained in a genome if all the k-mers
* of seq are contained. The calculation of the resulting false positive rate
* is left as an exercise for the reader.
*/
bool may_contain(const genome_filter& f, std::string_view seq)
{
using k_mer = genome_filter::value_type;
assert(seq.size() >= k_mer::size());
k_mer km;
auto first = seq.begin(), last = seq.end();
/* feed first km.size() -1 nucleotides */
for(std::size_t i = 0; i < km.size() - 1; ++i) km += *first++;
do{
km += *first++;
if(!f.may_contain(km)) return false;
}while(first != last);
return true;
}
int main()
{
try{
/* Fruit fly genome (Drosophila melanogaster), available at
* https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001215.4/
*/
auto f=make_genome_filter(
"GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna");
/* Some DNA sequences */
const char* seqs[] = {
"ataaataagattgCGACTCAAAATTAAgcaataacac", /* chr. X */
"attatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctc", /* chr. 3L */
"agaATTTACTAAGTACTTCTATGAATGGAATTATTATTGGAAACTCTACAA", /* chr. 4 */
"ATTTACTAAGTACTTCTATCTGCAAATTAACAATTTATCAAACAACTG", /* not present */
"ataaataagattgCGACTCAAAAGTAAgcaat" /* mutation in chr. X, not present */
};
int i = 0;
for(auto seq: seqs){
std::cout << "check sequence " << i++ << ": "
<< may_contain(f, seq) << "\n";
}
}
catch(const std::exception& e) {
std::cerr << e.what() << "\n";
return EXIT_FAILURE;
}
}
|