1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
|
{-# LANGUAGE BangPatterns, Rank2Types, ScopedTypeVariables, TypeOperators #-}
-- |
-- Module: Data.BloomFilter
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- A fast, space efficient Bloom filter implementation. A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives. When an element is added to
-- a filter, a subsequent membership test will definitely return
-- 'True'.
--
-- * False positives /are/ possible. If an element has not been added
-- to a filter, a membership test /may/ nevertheless indicate that
-- the element is present.
--
-- This module provides low-level control. For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.
module Data.BloomFilter
(
-- * Overview
-- $overview
-- ** Ease of use
-- $ease
-- ** Performance
-- $performance
-- * Types
Hash
, Bloom
, MBloom
-- * Immutable Bloom filters
-- ** Conversion
, freeze
, thaw
, unsafeFreeze
-- ** Creation
, unfold
, fromList
, empty
, singleton
-- ** Accessors
, length
, elem
, notElem
-- ** Modification
, insert
, insertList
-- * The underlying representation
-- | If you serialize the raw bit arrays below to disk, do not
-- expect them to be portable to systems with different
-- conventions for endianness or word size.
-- | The raw bit array used by the immutable 'Bloom' type.
, bitArray
) where
import Control.Monad (liftM, forM_)
import Control.Monad.ST (ST, runST)
import Control.DeepSeq (NFData(..))
import Data.Array.Base (unsafeAt)
import qualified Data.Array.Base as ST
import Data.Array.Unboxed (UArray)
import Data.Bits ((.&.), unsafeShiftL, unsafeShiftR)
import Data.BloomFilter.Util ((:*)(..))
import qualified Data.BloomFilter.Mutable as MB
import qualified Data.BloomFilter.Mutable.Internal as MB
import Data.BloomFilter.Mutable.Internal (Hash, MBloom)
import Data.Word (Word32)
import Prelude hiding (elem, length, notElem,
(/), (*), div, divMod, mod, rem)
-- | An immutable Bloom filter, suitable for querying from pure code.
data Bloom a = B {
hashes :: !(a -> [Hash])
, shift :: {-# UNPACK #-} !Int
, mask :: {-# UNPACK #-} !Int
, bitArray :: {-# UNPACK #-} !(UArray Int Hash)
}
instance Show (Bloom a) where
show ub = "Bloom { " ++ show ((1::Int) `unsafeShiftL` shift ub) ++ " bits } "
instance NFData (Bloom a) where
rnf !_ = ()
logBitsInHash :: Int
logBitsInHash = 5 -- Data.BloomFilter.Mutable.logPower2 bitsInHash
-- | Create an immutable Bloom filter, using the given setup function
-- which executes in the 'ST' monad.
--
-- Example:
--
-- @
--import "Data.BloomFilter.Hash" (cheapHashes)
--
--filter = create (cheapHashes 3) 1024 $ \mf -> do
-- insertMB mf \"foo\"
-- insertMB mf \"bar\"
-- @
--
-- Note that the result of the setup function is not used.
create :: (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> (forall s. (MBloom s a -> ST s ())) -- ^ setup function
-> Bloom a
{-# INLINE create #-}
create hash numBits body = runST $ do
mb <- MB.new hash numBits
body mb
unsafeFreeze mb
-- | Create an immutable Bloom filter from a mutable one. The mutable
-- filter may be modified afterwards.
freeze :: MBloom s a -> ST s (Bloom a)
freeze mb = B (MB.hashes mb) (MB.shift mb) (MB.mask mb) `liftM`
ST.freeze (MB.bitArray mb)
-- | Create an immutable Bloom filter from a mutable one. The mutable
-- filter /must not/ be modified afterwards, or a runtime crash may
-- occur. For a safer creation interface, use 'freeze' or 'create'.
unsafeFreeze :: MBloom s a -> ST s (Bloom a)
unsafeFreeze mb = B (MB.hashes mb) (MB.shift mb) (MB.mask mb) `liftM`
ST.unsafeFreeze (MB.bitArray mb)
-- | Copy an immutable Bloom filter to create a mutable one. There is
-- no non-copying equivalent.
thaw :: Bloom a -> ST s (MBloom s a)
thaw ub = MB.MB (hashes ub) (shift ub) (mask ub) `liftM` ST.thaw (bitArray ub)
-- | Create an empty Bloom filter.
--
-- This function is subject to fusion with 'insert'
-- and 'insertList'.
empty :: (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> Bloom a
{-# INLINE [1] empty #-}
empty hash numBits = create hash numBits (\_ -> return ())
-- | Create a Bloom filter with a single element.
--
-- This function is subject to fusion with 'insert'
-- and 'insertList'.
singleton :: (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> a -- ^ element to insert
-> Bloom a
{-# INLINE [1] singleton #-}
singleton hash numBits elt = create hash numBits (\mb -> MB.insert mb elt)
-- | Given a filter's mask and a hash value, compute an offset into
-- a word array and a bit offset within that word.
hashIdx :: Int -> Word32 -> (Int :* Int)
hashIdx msk x = (y `unsafeShiftR` logBitsInHash) :* (y .&. hashMask)
where hashMask = 31 -- bitsInHash - 1
y = fromIntegral x .&. msk
-- | Hash the given value, returning a list of (word offset, bit
-- offset) pairs, one per hash value.
hashesU :: Bloom a -> a -> [Int :* Int]
hashesU ub elt = hashIdx (mask ub) `map` hashes ub elt
-- | Query an immutable Bloom filter for membership. If the value is
-- present, return @True@. If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: a -> Bloom a -> Bool
elem elt ub = all test (hashesU ub elt)
where test (off :* bit) = (bitArray ub `unsafeAt` off) .&. (1 `unsafeShiftL` bit) /= 0
modify :: (forall s. (MBloom s a -> ST s z)) -- ^ mutation function (result is discarded)
-> Bloom a
-> Bloom a
{-# INLINE modify #-}
modify body ub = runST $ do
mb <- thaw ub
_ <- body mb
unsafeFreeze mb
-- | Create a new Bloom filter from an existing one, with the given
-- member added.
--
-- This function may be expensive, as it is likely to cause the
-- underlying bit array to be copied.
--
-- Repeated applications of this function with itself are subject to
-- fusion.
insert :: a -> Bloom a -> Bloom a
{-# NOINLINE insert #-}
insert elt = modify (flip MB.insert elt)
-- | Create a new Bloom filter from an existing one, with the given
-- members added.
--
-- This function may be expensive, as it is likely to cause the
-- underlying bit array to be copied.
--
-- Repeated applications of this function with itself are subject to
-- fusion.
insertList :: [a] -> Bloom a -> Bloom a
{-# NOINLINE insertList #-}
insertList elts = modify $ \mb -> mapM_ (MB.insert mb) elts
{-# RULES "Bloom insert . insert" forall a b u.
insert b (insert a u) = insertList [a,b] u
#-}
{-# RULES "Bloom insertList . insert" forall x xs u.
insertList xs (insert x u) = insertList (x:xs) u
#-}
{-# RULES "Bloom insert . insertList" forall x xs u.
insert x (insertList xs u) = insertList (x:xs) u
#-}
{-# RULES "Bloom insertList . insertList" forall xs ys u.
insertList xs (insertList ys u) = insertList (xs++ys) u
#-}
{-# RULES "Bloom insertList . empty" forall h n xs.
insertList xs (empty h n) = fromList h n xs
#-}
{-# RULES "Bloom insertList . singleton" forall h n x xs.
insertList xs (singleton h n x) = fromList h n (x:xs)
#-}
-- | Query an immutable Bloom filter for non-membership. If the value
-- /is/ present, return @False@. If the value is not present, there
-- is /still/ some possibility that @False@ will be returned.
notElem :: a -> Bloom a -> Bool
notElem elt ub = any test (hashesU ub elt)
where test (off :* bit) = (bitArray ub `unsafeAt` off) .&. (1 `unsafeShiftL` bit) == 0
-- | Return the size of an immutable Bloom filter, in bits.
length :: Bloom a -> Int
length = unsafeShiftL 1 . shift
-- | Build an immutable Bloom filter from a seed value. The seeding
-- function populates the filter as follows.
--
-- * If it returns 'Nothing', it is finished producing values to
-- insert into the filter.
--
-- * If it returns @'Just' (a,b)@, @a@ is added to the filter and
-- @b@ is used as a new seed.
unfold :: forall a b. (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> (b -> Maybe (a, b)) -- ^ seeding function
-> b -- ^ initial seed
-> Bloom a
{-# INLINE unfold #-}
unfold hs numBits f k = create hs numBits (loop k)
where loop :: forall s. b -> MBloom s a -> ST s ()
loop j mb = case f j of
Just (a, j') -> MB.insert mb a >> loop j' mb
_ -> return ()
-- | Create an immutable Bloom filter, populating it from a list of
-- values.
--
-- Here is an example that uses the @cheapHashes@ function from the
-- "Data.BloomFilter.Hash" module to create a hash function that
-- returns three hashes.
--
-- @
--import "Data.BloomFilter.Hash" (cheapHashes)
--
--filt = fromList (cheapHashes 3) 1024 [\"foo\", \"bar\", \"quux\"]
-- @
fromList :: (a -> [Hash]) -- ^ family of hash functions to use
-> Int -- ^ number of bits in filter
-> [a] -- ^ values to populate with
-> Bloom a
{-# INLINE [1] fromList #-}
fromList hs numBits list = create hs numBits $ forM_ list . MB.insert
{-# RULES "Bloom insertList . fromList" forall h n xs ys.
insertList xs (fromList h n ys) = fromList h n (xs ++ ys)
#-}
{-
-- This is a simpler definition, but GHC doesn't inline the unfold
-- sensibly.
fromList hashes numBits = unfold hashes numBits convert
where convert (x:xs) = Just (x, xs)
convert _ = Nothing
-}
-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter. Note that
-- a filter is fixed in size; it cannot be resized after creation.
--
-- * A function that accepts a value, and should return a fixed-size
-- list of hashes of that value. To keep the false positive rate
-- low, the hashes computes should, as far as possible, be
-- independent.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate. The @suggestSizing@ function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.
-- $ease
--
-- This module provides immutable interfaces for working with a
-- query-only Bloom filter, and for converting to and from mutable
-- Bloom filters.
--
-- For a higher-level interface that is easy to use, see the
-- 'Data.BloomFilter.Easy' module.
-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.
--
-- For efficiency, the number of bits requested when creating a Bloom
-- filter is rounded up to the nearest power of two. This lets the
-- implementation use bitwise operations internally, instead of much
-- more expensive multiplication, division, and modulus operations.
|