// Copyright 2020 the Blobloom authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package blobloom

import "math"

// A Config holds parameters for Optimize or NewOptimized.
type Config struct {
// Capacity is the expected number of distinct keys to be added.
// More keys can always be added, but the false positive rate can be
// expected to rise above FPRate if their number exceeds the Capacity.
Capacity uint64
// FPRate is the desired false positive rate once the Bloom filter
// has been filled to its capacity. FPRate must be between zero
// (exclusive) and one (inclusive).
FPRate float64
// Maximum size of the Bloom filter in bits. Zero means the global
// MaxBits constant. A value less than BlockBits means BlockBits.
MaxBits uint64
// Trigger the "contains filtered or unexported fields" message for
// forward compatibility and force the caller to use named fields.
_ struct{}
}
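// A typical call site looks roughly like the following sketch (the hash values
// passed to Add and Has are 64-bit hashes computed by the caller; hashOfKey is
// a hypothetical variable standing in for such a hash):
//
//	f := NewOptimized(Config{
//		Capacity: 1e6,  // expect about a million distinct keys
//		FPRate:   1e-3, // accept roughly one false positive per thousand lookups
//	})
//	f.Add(hashOfKey)
//	_ = f.Has(hashOfKey)
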
// NewOptimized is shorthand for New(Optimize(config)).
func NewOptimized(config Config) *Filter {
return New(Optimize(config))
}
// NewSyncOptimized is shorthand for NewSync(Optimize(config)).
func NewSyncOptimized(config Config) *SyncFilter {
return NewSync(Optimize(config))
}
// Optimize returns the number of bits and the number of hash functions that
// achieve the desired false positive rate described by config.
//
// Optimize panics when config.FPRate is invalid.
//
// The estimated number of bits is imprecise for false positive rates below
// ca. 1e-15.
func Optimize(config Config) (nbits uint64, nhashes int) {
n := float64(config.Capacity)
p := config.FPRate
if p <= 0 || p > 1 {
panic("false positive rate for a Bloom filter must be > 0, <= 1")
}
if n == 0 {
// Assume the client wants to add at least one key; n == 0 would produce a
// zero-bit filter and a division by zero when computing bits per key below.
n = 1
}
// The optimal nbits/n is c = -log2(p) / ln(2) for a vanilla Bloom filter.
c := math.Ceil(-math.Log2(p) / math.Ln2)
if c < float64(len(correctC)) {
c = float64(correctC[int(c)])
} else {
// We can't achieve the desired FPR. Just triple the number of bits.
c *= 3
}
nbits = uint64(c * n)
// Round up to a multiple of BlockBits.
if nbits%BlockBits != 0 {
nbits += BlockBits - nbits%BlockBits
}
var maxbits uint64 = MaxBits
if config.MaxBits != 0 && config.MaxBits < maxbits {
maxbits = config.MaxBits
if maxbits < BlockBits {
maxbits = BlockBits
}
}
if nbits > maxbits {
nbits = maxbits
// Round down to a multiple of BlockBits.
nbits -= nbits % BlockBits
}
// The corresponding optimal number of hash functions is k = c * ln(2).
// Try rounding up and down to see which rounding is better.
c = float64(nbits) / n
k := c * math.Ln2
if k < 1 {
nhashes = 1
return nbits, nhashes
}
ceilK, floorK := math.Ceil(k), math.Floor(k)
if ceilK == floorK {
return nbits, int(ceilK)
}
fprCeil, _ := fpRate(c, ceilK)
fprFloor, _ := fpRate(c, floorK)
if fprFloor < fprCeil {
k = floorK
} else {
k = ceilK
}
return nbits, int(k)
}
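// As a rough worked example (assuming BlockBits = 512): for FPRate = 1e-3,
// c = ceil(-log2(1e-3) / ln 2) = ceil(14.38) = 15, which correctC turns into
// 17 bits per key. With Capacity = 1e6 that gives 17e6 bits, rounded up to
// 17,000,448 bits (the next multiple of 512). Then k = 17.0 * ln 2 ≈ 11.8,
// so 11 and 12 hashes are compared and the one with the lower estimated FPR
// is returned.
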
// correctC maps c = m/n for a vanilla Bloom filter to the c' for a
// blocked Bloom filter.
//
// This is Putze et al.'s Table I, extended down to zero.
// For c > 34, the values become huge and are hard to compute.
var correctC = []byte{
1, 1, 2, 4, 5,
6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 23,
25, 26, 28, 30, 32, 35, 38, 40, 44, 48, 51, 58, 64, 74, 90,
}
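// For example, a vanilla filter that needs c = 10 bits per key corresponds to
// correctC[10] = 11 bits per key in the blocked layout; the extra bits
// compensate for blocks that fill unevenly.
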
// FPRate computes an estimate of the false positive rate of a Bloom filter
// after nkeys distinct keys have been added.
func FPRate(nkeys, nbits uint64, nhashes int) float64 {
if nkeys == 0 {
return 0
}
p, _ := fpRate(float64(nbits)/float64(nkeys), float64(nhashes))
return p
}
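// For example, FPRate(1e6, 17_000_448, 12) estimates the rate of a filter with
// roughly 17 bits per key and 12 hashes once a million keys have been added;
// for that sizing the estimate is on the order of 1e-3.
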
func fpRate(c, k float64) (p float64, iter int) {
switch {
case c == 0:
panic("0 bits per key is too few")
case k == 0:
panic("0 hashes is too few")
}
// Putze et al.'s Equation (3).
//
// The Poisson distribution has a single spike around its mean
// BlockBits/c that gets slimmer and further away from zero as c tends
// to zero (the Bloom filter gets more filled). We start at the mean,
// then add terms left and right of it until their relative contribution
// drops below ε.
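// In symbols, with B = BlockBits, the estimate being summed is
//
//	FPR ≈ Σ_i Pois(i; B/c) · (1 − e^(−k·i/B))^k,
//
// where each term is evaluated in log space by logPoisson and logFprBlock
// to avoid underflow.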
const ε = 1e-9
mean := BlockBits / c
// Ceil to make sure we start at one, not zero.
i := math.Ceil(mean)
p = math.Exp(logPoisson(mean, i) + logFprBlock(BlockBits/i, k))
for j := i - 1; j > 0; j-- {
add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
p += add
iter++
if add/p < ε {
break
}
}
for j := i + 1; ; j++ {
add := math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
p += add
iter++
if add/p < ε {
break
}
}
return p, iter
}
// FPRate computes an estimate of f's false positive rate after nkeys distinct
// keys have been added.
func (f *Filter) FPRate(nkeys uint64) float64 {
return FPRate(nkeys, f.NumBits(), f.k)
}
// Log of the FPR of a single block, FPR = (1 - exp(-k/c))^k.
func logFprBlock(c, k float64) float64 {
return k * math.Log1p(-math.Exp(-k/c))
}
// Log of the Poisson distribution's pmf.
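// The pmf is Pois(k; λ) = λ^k·e^(−λ)/k!, so its log is k·log(λ) − λ − lgamma(k+1).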
func logPoisson(λ, k float64) float64 {
lg, _ := math.Lgamma(k + 1)
return k*math.Log(λ) - λ - lg
}