1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
|
{-# LANGUAGE BangPatterns, CPP, Rank2Types,
TypeOperators,FlexibleContexts #-}
-- |
-- Module: Data.BloomFilter.Mutable
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- A fast, space efficient Bloom filter implementation. A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives. When an element is added to
-- a filter, a subsequent membership test will definitely return
-- 'True'.
--
-- * False positives /are/ possible. If an element has not been added
-- to a filter, a membership test /may/ nevertheless indicate that
-- the element is present.
--
-- This module provides low-level control. For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.
module Data.BloomFilter.Mutable
(
-- * Overview
-- $overview
-- ** Ease of use
-- $ease
-- ** Performance
-- $performance
-- * Types
Hash
, MBloom
-- * Mutable Bloom filters
-- ** Creation
, new
-- ** Accessors
, length
, elem
-- ** Mutation
, insert
-- * The underlying representation
-- | If you serialize the raw bit arrays below to disk, do not
-- expect them to be portable to systems with different
-- conventions for endianness or word size.
-- | The raw bit array used by the mutable 'MBloom' type.
, bitArray
) where
#include "MachDeps.h"
import Control.Monad (liftM, forM_)
import Control.Monad.ST (ST)
import Data.Array.Base (unsafeRead, unsafeWrite)
import Data.Bits ((.&.), (.|.), unsafeShiftL, unsafeShiftR)
import Data.BloomFilter.Array (newArray)
import Data.BloomFilter.Util ((:*)(..), nextPowerOfTwo)
import Data.Word (Word32)
import Data.BloomFilter.Mutable.Internal
import Prelude hiding (elem, length, notElem,
(/), (*), div, divMod, mod, rem)
-- | Create a new mutable Bloom filter. For efficiency, the number of
-- bits used may be larger than the number requested. It is always
-- rounded up to the nearest higher power of two, but will be clamped
-- at a maximum of 4 gigabits, since hashes are 32 bits in size.
new :: (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> ST s (MBloom s a)
new hash numBits = MB hash shft msk `liftM` newArray numElems numBytes
where twoBits | numBits < 1 = 1
| numBits > maxHash = maxHash
| isPowerOfTwo numBits = numBits
| otherwise = nextPowerOfTwo numBits
numElems = max 2 (twoBits `unsafeShiftR` logBitsInHash)
numBytes = numElems `unsafeShiftL` logBytesInHash
trueBits = numElems `unsafeShiftL` logBitsInHash
shft = logPower2 trueBits
msk = trueBits - 1
isPowerOfTwo n = n .&. (n - 1) == 0
maxHash :: Int
#if WORD_SIZE_IN_BITS == 64
maxHash = 4294967296
#else
maxHash = 1073741824
#endif
logBitsInHash :: Int
logBitsInHash = 5 -- logPower2 bitsInHash
logBytesInHash :: Int
logBytesInHash = 2 -- logPower2 (sizeOf (undefined :: Hash))
-- | Given a filter's mask and a hash value, compute an offset into
-- a word array and a bit offset within that word.
hashIdx :: Int -> Word32 -> (Int :* Int)
hashIdx msk x = (y `unsafeShiftR` logBitsInHash) :* (y .&. hashMask)
where hashMask = 31 -- bitsInHash - 1
y = fromIntegral x .&. msk
-- | Hash the given value, returning a list of (word offset, bit
-- offset) pairs, one per hash value.
hashesM :: MBloom s a -> a -> [Int :* Int]
hashesM mb elt = hashIdx (mask mb) `map` hashes mb elt
-- | Insert a value into a mutable Bloom filter. Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: MBloom s a -> a -> ST s ()
insert mb elt = do
let mu = bitArray mb
forM_ (hashesM mb elt) $ \(word :* bit) -> do
old <- unsafeRead mu word
unsafeWrite mu word (old .|. (1 `unsafeShiftL` bit))
-- | Query a mutable Bloom filter for membership. If the value is
-- present, return @True@. If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: a -> MBloom s a -> ST s Bool
elem elt mb = loop (hashesM mb elt)
where mu = bitArray mb
loop ((word :* bit):wbs) = do
i <- unsafeRead mu word
if i .&. (1 `unsafeShiftL` bit) == 0
then return False
else loop wbs
loop _ = return True
-- bitsInHash :: Int
-- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3
-- | Return the size of a mutable Bloom filter, in bits.
length :: MBloom s a -> Int
length = unsafeShiftL 1 . shift
-- | Slow, crummy way of computing the integer log of an integer known
-- to be a power of two.
logPower2 :: Int -> Int
logPower2 k = go 0 k
where go j 1 = j
go j n = go (j+1) (n `unsafeShiftR` 1)
-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter. Note that
-- a filter is fixed in size; it cannot be resized after creation.
--
-- * A function that accepts a value, and should return a fixed-size
-- list of hashes of that value. To keep the false positive rate
-- low, the hashes computes should, as far as possible, be
-- independent.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate. The @suggestSizing@ function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.
-- $ease
--
-- This module provides both mutable interfaces for creating and
-- querying a Bloom filter. It is most useful as a low-level way to
-- manage a Bloom filter with a custom set of characteristics.
-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.
--
-- For efficiency, the number of bits requested when creating a Bloom
-- filter is rounded up to the nearest power of two. This lets the
-- implementation use bitwise operations internally, instead of much
-- more expensive multiplication, division, and modulus operations.
|