File: Mutable.hs

package info (click to toggle)
haskell-bloomfilter 2.0.1.3-1
links: PTS, VCS
area: main
in suites: sid
size: 196 kB
sloc: ansic: 852; haskell: 709; makefile: 13
file content (189 lines) | stat: -rw-r--r-- 6,261 bytes
parent folder | download | duplicates (2)
{-# LANGUAGE BangPatterns, CPP, Rank2Types,
    TypeOperators,FlexibleContexts #-}

-- |
-- Module: Data.BloomFilter.Mutable
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- A fast, space efficient Bloom filter implementation.  A Bloom
-- filter is a set-like data structure that provides a probabilistic
-- membership test.
--
-- * Queries do not give false negatives.  When an element is added to
--   a filter, a subsequent membership test will definitely return
--   'True'.
--
-- * False positives /are/ possible.  If an element has not been added
--   to a filter, a membership test /may/ nevertheless indicate that
--   the element is present.
--
-- This module provides low-level control.  For an easier to use
-- interface, see the "Data.BloomFilter.Easy" module.

module Data.BloomFilter.Mutable
    (
    -- * Overview
    -- $overview

    -- ** Ease of use
    -- $ease

    -- ** Performance
    -- $performance

    -- * Types
      Hash
    , MBloom
    -- * Mutable Bloom filters

    -- ** Creation
    , new

    -- ** Accessors
    , length
    , elem

    -- ** Mutation
    , insert

    -- * The underlying representation
    -- | If you serialize the raw bit arrays below to disk, do not
    -- expect them to be portable to systems with different
    -- conventions for endianness or word size.

    -- | The raw bit array used by the mutable 'MBloom' type.
    , bitArray
    ) where

#include "MachDeps.h"

import Control.Monad (liftM, forM_)
import Control.Monad.ST (ST)
import Data.Array.Base (unsafeRead, unsafeWrite)
import Data.Bits ((.&.), (.|.), unsafeShiftL, unsafeShiftR)
import Data.BloomFilter.Array (newArray)
import Data.BloomFilter.Util ((:*)(..), nextPowerOfTwo)
import Data.Word (Word32)
import Data.BloomFilter.Mutable.Internal

import Prelude hiding (elem, length, notElem,
                       (/), (*), div, divMod, mod, rem)

-- | Create a new mutable Bloom filter.  For efficiency, the number of
-- bits used may be larger than the number requested.  It is always
-- rounded up to the nearest higher power of two, but will be clamped
-- at a maximum of 4 gigabits, since hashes are 32 bits in size.
new :: (a -> [Hash])          -- ^ family of hash functions to use
    -> Int                    -- ^ number of bits in filter
    -> ST s (MBloom s a)
new hash numBits = MB hash shft msk `liftM` newArray numElems numBytes
  where twoBits | numBits < 1 = 1
                | numBits > maxHash = maxHash
                | isPowerOfTwo numBits = numBits
                | otherwise = nextPowerOfTwo numBits
        numElems = max 2 (twoBits `unsafeShiftR` logBitsInHash)
        numBytes = numElems `unsafeShiftL` logBytesInHash
        trueBits = numElems `unsafeShiftL` logBitsInHash
        shft     = logPower2 trueBits
        msk      = trueBits - 1
        isPowerOfTwo n = n .&. (n - 1) == 0

maxHash :: Int
#if WORD_SIZE_IN_BITS == 64
maxHash = 4294967296
#else
maxHash = 1073741824
#endif

logBitsInHash :: Int
logBitsInHash = 5 -- logPower2 bitsInHash

logBytesInHash :: Int
logBytesInHash = 2 -- logPower2 (sizeOf (undefined :: Hash))

-- | Given a filter's mask and a hash value, compute an offset into
-- a word array and a bit offset within that word.
hashIdx :: Int -> Word32 -> (Int :* Int)
hashIdx msk x = (y `unsafeShiftR` logBitsInHash) :* (y .&. hashMask)
  where hashMask = 31 -- bitsInHash - 1
        y = fromIntegral x .&. msk

-- | Hash the given value, returning a list of (word offset, bit
-- offset) pairs, one per hash value.
hashesM :: MBloom s a -> a -> [Int :* Int]
hashesM mb elt = hashIdx (mask mb) `map` hashes mb elt

-- | Insert a value into a mutable Bloom filter.  Afterwards, a
-- membership query for the same value is guaranteed to return @True@.
insert :: MBloom s a -> a -> ST s ()
insert mb elt = do
  let mu = bitArray mb
  forM_ (hashesM mb elt) $ \(word :* bit) -> do
      old <- unsafeRead mu word
      unsafeWrite mu word (old .|. (1 `unsafeShiftL` bit))

-- | Query a mutable Bloom filter for membership.  If the value is
-- present, return @True@.  If the value is not present, there is
-- /still/ some possibility that @True@ will be returned.
elem :: a -> MBloom s a -> ST s Bool
elem elt mb = loop (hashesM mb elt)
  where mu = bitArray mb
        loop ((word :* bit):wbs) = do
          i <- unsafeRead mu word
          if i .&. (1 `unsafeShiftL` bit) == 0
            then return False
            else loop wbs
        loop _ = return True

-- bitsInHash :: Int
-- bitsInHash = sizeOf (undefined :: Hash) `shiftL` 3

-- | Return the size of a mutable Bloom filter, in bits.
length :: MBloom s a -> Int
length = unsafeShiftL 1 . shift


-- | Slow, crummy way of computing the integer log of an integer known
-- to be a power of two.
logPower2 :: Int -> Int
logPower2 k = go 0 k
    where go j 1 = j
          go j n = go (j+1) (n `unsafeShiftR` 1)

-- $overview
--
-- Each of the functions for creating Bloom filters accepts two parameters:
--
-- * The number of bits that should be used for the filter.  Note that
--   a filter is fixed in size; it cannot be resized after creation.
--
-- * A function that accepts a value, and should return a fixed-size
--   list of hashes of that value.  To keep the false positive rate
--   low, the hashes computes should, as far as possible, be
--   independent.
--
-- By choosing these parameters with care, it is possible to tune for
-- a particular false positive rate.  The @suggestSizing@ function in
-- the "Data.BloomFilter.Easy" module calculates useful estimates for
-- these parameters.

-- $ease
--
-- This module provides both mutable interfaces for creating and
-- querying a Bloom filter.  It is most useful as a low-level way to
-- manage a Bloom filter with a custom set of characteristics.

-- $performance
--
-- The implementation has been carefully tuned for high performance
-- and low space consumption.
--
-- For efficiency, the number of bits requested when creating a Bloom
-- filter is rounded up to the nearest power of two.  This lets the
-- implementation use bitwise operations internally, instead of much
-- more expensive multiplication, division, and modulus operations.