1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
|
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* A counting Bloom filter implementation. This allows consumers to
* do fast probabilistic "is item X in set Y?" testing which will
* never answer "no" when the correct answer is "yes" (but might
* incorrectly answer "yes" when the correct answer is "no").
*/
#ifndef mozilla_BloomFilter_h
#define mozilla_BloomFilter_h
#include "mozilla/Attributes.h"
#include "mozilla/Likely.h"
#include <climits>
#include <cstdint>
#include <cstring>
namespace mozilla {
/*
* This class implements a classic Bloom filter as described at
* <http://en.wikipedia.org/wiki/Bloom_filter>. This allows quick
* probabilistic answers to the question "is object X in set Y?" where the
* contents of Y might not be time-invariant. The probabilistic nature of the
* test means that sometimes the answer will be "yes" when it should be "no".
* If the answer is "no", then X is guaranteed not to be in Y.
*
* The filter is parametrized on KeySize, which is the size of the key
* generated by each of hash functions used by the filter, in bits,
* and the type of object T being added and removed. T must implement
* a |uint32_t hash() const| method which returns a uint32_t hash key
* that will be used to generate the two separate hash functions for
* the Bloom filter. This hash key MUST be well-distributed for good
* results! KeySize is not allowed to be larger than 16.
*
* The filter uses exactly 2**KeySize bit (2**(KeySize-3) bytes) of memory.
* From now on we will refer to the memory used by the filter as M.
*
* The expected rate of incorrect "yes" answers depends on M and on
* the number N of objects in set Y. As long as N is small compared
* to M, the rate of such answers is expected to be approximately
* 4*(N/M)**2 for this filter. In practice, if Y has a few hundred
* elements then using a KeySize of 12 gives a reasonably low
* incorrect answer rate. A KeySize of 12 has the additional benefit
* of using exactly one page for the filter in typical hardware
* configurations.
*/
template <unsigned KeySize, class T>
class BitBloomFilter {
/*
* A counting Bloom filter with 8-bit counters. For now we assume
* that having two hash functions is enough, but we may revisit that
* decision later.
*
* The filter uses an array with 2**KeySize entries.
*
* Assuming a well-distributed hash function, a Bloom filter with
* array size M containing N elements and
* using k hash function has expected false positive rate exactly
*
* $ (1 - (1 - 1/M)^{kN})^k $
*
* because each array slot has a
*
* $ (1 - 1/M)^{kN} $
*
* chance of being 0, and the expected false positive rate is the
* probability that all of the k hash functions will hit a nonzero
* slot.
*
* For reasonable assumptions (M large, kN large, which should both
* hold if we're worried about false positives) about M and kN this
* becomes approximately
*
* $$ (1 - \exp(-kN/M))^k $$
*
* For our special case of k == 2, that's $(1 - \exp(-2N/M))^2$,
* or in other words
*
* $$ N/M = -0.5 * \ln(1 - \sqrt(r)) $$
*
* where r is the false positive rate. This can be used to compute
* the desired KeySize for a given load N and false positive rate r.
*
* If N/M is assumed small, then the false positive rate can
* further be approximated as 4*N^2/M^2. So increasing KeySize by
* 1, which doubles M, reduces the false positive rate by about a
* factor of 4, and a false positive rate of 1% corresponds to
* about M/N == 20.
*
* What this means in practice is that for a few hundred keys using a
* KeySize of 12 gives false positive rates on the order of 0.25-4%.
*
* Similarly, using a KeySize of 10 would lead to a 4% false
* positive rate for N == 100 and to quite bad false positive
* rates for larger N.
*/
public:
BitBloomFilter() {
static_assert(KeySize >= 3, "KeySize too small");
static_assert(KeySize <= kKeyShift, "KeySize too big");
// XXX: Should we have a custom operator new using calloc instead and
// require that we're allocated via the operator?
clear();
}
/*
* Clear the filter. This should be done before reusing it.
*/
void clear();
/*
* Add an item to the filter.
*/
void add(const T* aValue);
/*
* Check whether the filter might contain an item. This can
* sometimes return true even if the item is not in the filter,
* but will never return false for items that are actually in the
* filter.
*/
bool mightContain(const T* aValue) const;
/*
* Methods for add/contain when we already have a hash computed
*/
void add(uint32_t aHash);
bool mightContain(uint32_t aHash) const;
private:
static const size_t kArraySize = (1 << (KeySize - 3));
static const uint32_t kKeyMask = (1 << KeySize) - 1;
static const uint32_t kKeyShift = 16;
static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
static uint32_t hash2(uint32_t aHash) {
return (aHash >> kKeyShift) & kKeyMask;
}
bool getSlot(uint32_t aHash) const {
uint32_t index = aHash / 8;
uint8_t shift = aHash % 8;
uint8_t mask = 1 << shift;
return !!(mBits[index] & mask);
}
void setSlot(uint32_t aHash) {
uint32_t index = aHash / 8;
uint8_t shift = aHash % 8;
uint8_t bit = 1 << shift;
mBits[index] |= bit;
}
bool getFirstSlot(uint32_t aHash) const { return getSlot(hash1(aHash)); }
bool getSecondSlot(uint32_t aHash) const { return getSlot(hash2(aHash)); }
void setFirstSlot(uint32_t aHash) { setSlot(hash1(aHash)); }
void setSecondSlot(uint32_t aHash) { setSlot(hash2(aHash)); }
uint8_t mBits[kArraySize];
};
template <unsigned KeySize, class T>
inline void BitBloomFilter<KeySize, T>::clear() {
memset(mBits, 0, kArraySize);
}
template <unsigned KeySize, class T>
inline void BitBloomFilter<KeySize, T>::add(uint32_t aHash) {
setFirstSlot(aHash);
setSecondSlot(aHash);
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void BitBloomFilter<KeySize, T>::add(const T* aValue) {
uint32_t hash = aValue->hash();
return add(hash);
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
uint32_t aHash) const {
// Check that all the slots for this hash contain something
return getFirstSlot(aHash) && getSecondSlot(aHash);
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool BitBloomFilter<KeySize, T>::mightContain(
const T* aValue) const {
uint32_t hash = aValue->hash();
return mightContain(hash);
}
/*
* This class implements a counting Bloom filter as described at
* <http://en.wikipedia.org/wiki/Bloom_filter#Counting_filters>, with
* 8-bit counters.
*
* Compared to `BitBloomFilter`, this class supports 'remove' operation.
*
* The filter uses exactly 2**KeySize bytes of memory.
*
* Other characteristics are the same as BitBloomFilter.
*/
template <unsigned KeySize, class T>
class CountingBloomFilter {
public:
CountingBloomFilter() {
static_assert(KeySize <= kKeyShift, "KeySize too big");
clear();
}
/*
* Clear the filter. This should be done before reusing it, because
* just removing all items doesn't clear counters that hit the upper
* bound.
*/
void clear();
/*
* Add an item to the filter.
*/
void add(const T* aValue);
/*
* Remove an item from the filter.
*/
void remove(const T* aValue);
/*
* Check whether the filter might contain an item. This can
* sometimes return true even if the item is not in the filter,
* but will never return false for items that are actually in the
* filter.
*/
bool mightContain(const T* aValue) const;
/*
* Methods for add/remove/contain when we already have a hash computed
*/
void add(uint32_t aHash);
void remove(uint32_t aHash);
bool mightContain(uint32_t aHash) const;
private:
static const size_t kArraySize = (1 << KeySize);
static const uint32_t kKeyMask = (1 << KeySize) - 1;
static const uint32_t kKeyShift = 16;
static uint32_t hash1(uint32_t aHash) { return aHash & kKeyMask; }
static uint32_t hash2(uint32_t aHash) {
return (aHash >> kKeyShift) & kKeyMask;
}
uint8_t& firstSlot(uint32_t aHash) { return mCounters[hash1(aHash)]; }
uint8_t& secondSlot(uint32_t aHash) { return mCounters[hash2(aHash)]; }
const uint8_t& firstSlot(uint32_t aHash) const {
return mCounters[hash1(aHash)];
}
const uint8_t& secondSlot(uint32_t aHash) const {
return mCounters[hash2(aHash)];
}
static bool full(const uint8_t& aSlot) { return aSlot == UINT8_MAX; }
uint8_t mCounters[kArraySize];
};
template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::clear() {
memset(mCounters, 0, kArraySize);
}
template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::add(uint32_t aHash) {
uint8_t& slot1 = firstSlot(aHash);
if (MOZ_LIKELY(!full(slot1))) {
++slot1;
}
uint8_t& slot2 = secondSlot(aHash);
if (MOZ_LIKELY(!full(slot2))) {
++slot2;
}
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::add(const T* aValue) {
uint32_t hash = aValue->hash();
return add(hash);
}
template <unsigned KeySize, class T>
inline void CountingBloomFilter<KeySize, T>::remove(uint32_t aHash) {
// If the slots are full, we don't know whether we bumped them to be
// there when we added or not, so just leave them full.
uint8_t& slot1 = firstSlot(aHash);
if (MOZ_LIKELY(!full(slot1))) {
--slot1;
}
uint8_t& slot2 = secondSlot(aHash);
if (MOZ_LIKELY(!full(slot2))) {
--slot2;
}
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE void CountingBloomFilter<KeySize, T>::remove(
const T* aValue) {
uint32_t hash = aValue->hash();
remove(hash);
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
uint32_t aHash) const {
// Check that all the slots for this hash contain something
return firstSlot(aHash) && secondSlot(aHash);
}
template <unsigned KeySize, class T>
MOZ_ALWAYS_INLINE bool CountingBloomFilter<KeySize, T>::mightContain(
const T* aValue) const {
uint32_t hash = aValue->hash();
return mightContain(hash);
}
} // namespace mozilla
#endif /* mozilla_BloomFilter_h */
|