File: Easy.hs

package info (click to toggle)
haskell-bloomfilter 2.0.1.0-6
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 192 kB
  • sloc: ansic: 852; haskell: 740; makefile: 13
file content (102 lines) | stat: -rw-r--r-- 3,772 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
{-# LANGUAGE ScopedTypeVariables #-}

-- |
-- Module: Data.BloomFilter.Easy
-- Copyright: Bryan O'Sullivan
-- License: BSD3
--
-- Maintainer: Bryan O'Sullivan <bos@serpentine.com>
-- Stability: unstable
-- Portability: portable
--
-- An easy-to-use Bloom filter interface.

module Data.BloomFilter.Easy
    (
    -- * Easy creation and querying
      Bloom
    , easyList
    , B.elem
    , B.notElem
    , B.length

    -- ** Example: a spell checker
    -- $example

    -- * Useful defaults for creation
    , safeSuggestSizing
    , suggestSizing
    ) where

import Data.BloomFilter (Bloom)
import Data.BloomFilter.Hash (Hashable, cheapHashes)
import Data.BloomFilter.Util (nextPowerOfTwo)
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy as LB
import qualified Data.BloomFilter as B

-- | Create a Bloom filter with the given false positive rate and
-- members.  The hash functions used are computed by the @cheapHashes@
-- function from the 'Data.BloomFilter.Hash' module.
easyList :: (Hashable a)
         => Double              -- ^ desired false positive rate (0 < /e/ < 1)
         -> [a]                 -- ^ values to populate with
         -> Bloom a
{-# SPECIALIZE easyList :: Double -> [String] -> Bloom String #-}
{-# SPECIALIZE easyList :: Double -> [LB.ByteString] -> Bloom LB.ByteString #-}
{-# SPECIALIZE easyList :: Double -> [SB.ByteString] -> Bloom SB.ByteString #-}
easyList errRate xs = B.fromList (cheapHashes numHashes) numBits xs
    where capacity = length xs
          (numBits, numHashes)
              | capacity > 0 = suggestSizing capacity errRate
              | otherwise    = (1, 1)

-- | Suggest a good combination of filter size and number of hash
-- functions for a Bloom filter, based on its expected maximum
-- capacity and a desired false positive rate.
--
-- The false positive rate is the rate at which queries against the
-- filter should return @True@ when an element is not actually
-- present.  It should be a fraction between 0 and 1, so a 1% false
-- positive rate is represented by 0.01.
safeSuggestSizing
    :: Int              -- ^ expected maximum capacity
    -> Double           -- ^ desired false positive rate (0 < /e/ < 1)
    -> Either String (Int, Int)
safeSuggestSizing capacity errRate
    | capacity <= 0                = Left "invalid capacity"
    | errRate <= 0 || errRate >= 1 = Left "invalid error rate"
    | otherwise =
    let cap = fromIntegral capacity
        (bits :: Double, hashes :: Double) =
            minimum [((-k) * cap / log (1 - (errRate ** (1 / k))), k)
                     | k <- [1..100]]
        roundedBits = nextPowerOfTwo (ceiling bits)
    in if roundedBits <= 0
       then Left  "capacity too large to represent"
       else Right (roundedBits, truncate hashes)

-- | Behaves as 'safeSuggestSizing', but calls 'error' if given
-- invalid or out-of-range inputs.
suggestSizing :: Int            -- ^ expected maximum capacity
              -> Double         -- ^ desired false positive rate (0 < /e/ < 1)
              -> (Int, Int)
suggestSizing cap errs = either fatal id (safeSuggestSizing cap errs)
  where fatal = error . ("Data.BloomFilter.Util.suggestSizing: " ++)

-- $example
--
-- This example reads a dictionary file containing one word per line,
-- constructs a Bloom filter with a 1% false positive rate, and
-- spellchecks its standard input.  Like the Unix @spell@ command, it
-- prints each word that it does not recognize.
--
-- @
--import Data.BloomFilter.Easy (easyList, elemB)
--
--main = do
--  filt <- ('easyList' 0.01 . words) \`fmap\` readFile \"/usr/share/dict/words\"
--  let check word | 'elemB' word filt = \"\"
--                 | otherwise         = word ++ \"\\n\"
--  interact (concat . map check . lines)
-- @