{-# LANGUAGE CPP #-}
-----------------------------------------------------------------------------
-- |
-- Module : Data.HashTab
-- Copyright : (c) The University of Glasgow 2003
-- License : BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer : libraries@haskell.org
-- Stability : provisional
-- Portability : portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457. The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
-----------------------------------------------------------------------------
module Data.HashTab (
-- * Basic hash table operations
HashTable, new, insert, delete, lookup, update,
-- * Converting to and from lists
fromList, toList,
-- * Hash functions
-- $hash_functions
hashInt, hashString,
prime,
-- * Diagnostics
longestChain
) where
-- This module is imported by Data.Typeable, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules
-- (Historical note: in practice this module now imports high-level modules freely.)
import Prelude hiding ( lookup )

import Control.Monad ( mapM, sequence_ )
import Data.Array hiding (bounds)
import Data.Array.Base
import Data.Array.IO
import Data.Bits
import Data.Char ( ord )
import Data.Int ( Int32 )
import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
import Data.List ( maximumBy, partition, concat, foldl, foldl' )
import Data.Maybe
import Data.Tuple ( fst )
-----------------------------------------------------------------------
-- Abstraction over the two array representations used by the table.
-- In the optimised build, reads go through a *frozen* immutable array
-- ('HTArray') while writers first thaw it in place via 'unsafeThaw';
-- in the DEBUG/NHC build both types are the same mutable 'IOArray'
-- and freeze/thaw are no-ops, avoiding the unsafe operations entirely.
readHTArray :: HTArray a -> Int32 -> IO a
readMutArray :: MutArray a -> Int32 -> IO a
writeMutArray :: MutArray a -> Int32 -> a -> IO ()
freezeArray :: MutArray a -> IO (HTArray a)
thawArray :: HTArray a -> IO (MutArray a)
newMutArray :: (Int32, Int32) -> a -> IO (MutArray a)
#if defined(DEBUG) || defined(__NHC__)
-- Safe variant: a single mutable representation with bounds-checked
-- accesses; freeze/thaw are identities.
type MutArray a = IOArray Int32 a
type HTArray a = MutArray a
newMutArray = newArray
readHTArray = readArray
readMutArray = readArray
writeMutArray = writeArray
freezeArray = return
thawArray = return
#else
-- Fast variant: reads use 'unsafeAt' on the frozen array (the strict
-- application forces the lookup eagerly inside IO), and all bounds
-- checks are skipped.
type MutArray a = IOArray Int32 a
type HTArray a = Array Int32 a
newMutArray = newArray
readHTArray arr i = return $! (unsafeAt arr (fromIntegral i))
readMutArray arr i = unsafeRead arr (fromIntegral i)
writeMutArray arr i x = unsafeWrite arr (fromIntegral i) x
freezeArray = unsafeFreeze
thawArray = unsafeThaw
#endif
-- | A mutable hash table mapping keys of type @key@ to values of type
-- @val@.  The state lives behind an 'IORef', so the handle can be
-- shared and updated in place (not thread-safe -- see the TODO below).
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.
-- Internal table state.  The bucket array is kept frozen ('HTArray')
-- for fast reads; writers thaw it in place (unsafely, in the
-- optimised build) before mutating a bucket.
data HT key val
  = HT {
      kcount :: !Int32,                  -- Total number of keys.
      buckets :: !(HTArray [(key,val)]), -- Bucket chains, indexed 0..bmask.
      bmask :: !Int32,                   -- Bucket count minus one; used as a
                                         -- bit mask in 'bucketIndex'.
      hash_fn :: key -> Int32,           -- Hash function supplied to 'new'.
      cmp :: key -> key -> Bool          -- Key equality supplied to 'new'.
    }
-- -----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows. A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
-- keys are highly variable, then you could get away with using 'id'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.
-- | A sample hash function for 'Int', implemented as @(x \`rem\` P)@
-- where P is a suitable prime (currently 1500007).  Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
hashInt key = fromIntegral key `rem` prime
-- | A sample hash function for 'String's.  The implementation is:
--
-- >    hashString = fromIntegral . foldl' f 0
-- >      where f m c = ord c + (m * 128) `rem` fromIntegral prime
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
-- A strict left fold ('foldl'') is used so that long strings do not
-- build a chain of unevaluated @f@ applications; the lazy 'foldl'
-- previously used here leaks space on large inputs.  The accumulated
-- value and result are otherwise identical.
hashString = fromIntegral . foldl' f 0
  where f m c = ord c + (m * 128) `rem` fromIntegral prime
-- | A prime larger than the maximum hash table size
-- Used as the modulus in 'hashInt' and 'hashString'; being larger than
-- tABLE_MAX (1024*1024), reduction by it never discards index bits.
prime :: Int32
prime = 1500007
-- -----------------------------------------------------------------------------
-- Parameters
tABLE_MAX = 1024 * 1024 :: Int32 -- Maximum size of hash table
-- NOTE(review): @tABLE_MIN@ is not defined as a CPP macro in this file,
-- so "#if tABLE_MIN" evaluates to 0 and the #else branch below is what
-- normally compiles.  Presumably the guard lets the constants be
-- overridden with -D flags -- confirm against the build system.
#if tABLE_MIN
#else
tABLE_MIN = 16 :: Int32
hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
hYSTERESIS = 0 :: Int32 -- entries to ignore in load computation
#endif
{- Hysteresis favors long association-list-like behavior for small tables. -}
-- -----------------------------------------------------------------------------
-- Creating a new hash table
-- | Creates a new hash table. The following property should hold for the @eq@
-- and @hash@ functions passed to 'new':
--
-- > eq A B => hash A == hash B
--
new
  :: (key -> key -> Bool)   -- ^ @eq@: An equality comparison on keys
  -> (key -> Int32)         -- ^ @hash@: A hash function on keys
  -> IO (HashTable key val) -- ^ Returns: an empty hash table
new cmpr hash = do
  -- Start with a single segment of tABLE_MIN empty buckets.
  let mask = tABLE_MIN - 1
  mutBkts <- newMutArray (0, mask) []
  bkts <- freezeArray mutBkts
  state <- newIORef HT { buckets = bkts
                       , kcount  = 0
                       , bmask   = mask
                       , hash_fn = hash
                       , cmp     = cmpr
                       }
  return (HashTable state)
-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table
-- | Inserts a key\/value mapping into the hash table.
--
-- Note that 'insert' doesn't remove the old entry from the table -
-- the behaviour is like an association list, where 'lookup' returns
-- the most-recently-inserted mapping for a key in the table. The
-- reason for this is to keep 'insert' as efficient as possible. If
-- you need to update a mapping, then we provide 'update'.
--
insert :: HashTable key val -> key -> val -> IO ()
insert (HashTable ref) key val = do
  ht@HT{ kcount = n, buckets = arr, bmask = msk } <- readIORef ref
  let grown = ht{ kcount = n + 1 }
      slot  = bucketIndex ht key
  -- Read the chain from the frozen array, then thaw in place and cons
  -- the new pair onto the front of the chain.
  chain <- readHTArray arr slot
  marr <- thawArray arr
  writeMutArray marr slot ((key, val) : chain)
  freezeArray marr
  -- Double the table if the average bucket load is now too high.
  final <- if tooBig n msk
              then expandHashTable grown
              else return grown
  writeIORef ref final
-- Resize predicate: true when the key count (less the hysteresis
-- allowance) exceeds hLOAD keys per bucket, where @bkts@ is the
-- bucket-index mask (bucket count minus one).
tooBig :: Int32 -> Int32 -> Bool
tooBig keys bkts = keys - hYSTERESIS > hLOAD * bkts
-- Map a key to its bucket by masking the hash value with the table's
-- current bit mask (the bucket count is always a power of two).
bucketIndex :: HT key val -> key -> Int32
bucketIndex HT{ hash_fn = hashOf, bmask = msk } key = hashOf key .&. msk
-- Double the number of buckets, unless that would exceed tABLE_MAX (in
-- which case the table is returned unchanged and chains simply grow).
-- Each old bucket @i@ is split between new buckets @i@ and
-- @i + oldsize@: with the widened mask, a key's index either stays the
-- same or gains exactly the new high bit.
expandHashTable :: HT key val -> IO (HT key val)
expandHashTable
  table@HT{ buckets=bkts, bmask=mask } = do
   let
    oldsize = mask + 1
    newmask = mask + mask + 1       -- i.e. one more low-order bit set
    newsize = newmask + 1
   if newsize > tABLE_MAX
      then return table
      else do
        newbkts' <- newMutArray (0,newmask) []
        let
         table'=table{ bmask=newmask }
         -- Partition bucket @oldindex@ by where each pair hashes under
         -- the new, wider mask; entries keep their relative order.
         splitBucket oldindex = do
           bucket <- readHTArray bkts oldindex
           let (oldb,newb) = partition ((oldindex==).bucketIndex table' . fst) bucket
           writeMutArray newbkts' oldindex oldb
           writeMutArray newbkts' (oldindex + oldsize) newb
        mapM_ splitBucket [0..mask]
        newbkts <- freezeArray newbkts'
        return ( table'{ buckets=newbkts } )
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table
-- Remove a key from a bucket
-- Remove every pair whose key satisfies the predicate from a bucket
-- chain, returning the number removed together with the surviving
-- pairs (in their original order).
deleteBucket :: (key -> Bool) -> [(key,val)] -> (Int32, [(key, val)])
deleteBucket del bucket = (fromIntegral (length gone), kept)
  where (gone, kept) = partition (del . fst) bucket
-- | Remove an entry from the hash table.
-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()
delete (HashTable ref) key = do
  ht@HT{ buckets = arr, kcount = n, cmp = eq } <- readIORef ref
  let slot = bucketIndex ht key
  -- Thaw in place, filter the chain, and write the survivors back.
  marr <- thawArray arr
  chain <- readMutArray marr slot
  let (removed, chain') = deleteBucket (eq key) chain
  writeMutArray marr slot chain'
  freezeArray marr
  writeIORef ref ht{ kcount = n - removed }
-- -----------------------------------------------------------------------------
-- Updating a mapping in the hash table
-- | Updates an entry in the hash table, returning 'True' if there was
-- already an entry for this key, or 'False' otherwise. After 'update'
-- there will always be exactly one entry for the given key in the table.
--
-- 'insert' is more efficient than 'update' if you don't care about
-- multiple entries, or you know for sure that multiple entries can't
-- occur. However, 'update' is more efficient than 'delete' followed
-- by 'insert'.
update :: HashTable key val -> key -> val -> IO Bool
update (HashTable ref) key val = do
  table@HT{ kcount=k, buckets=bkts, cmp=cmpr, bmask=b } <- readIORef ref
  let indx = bucketIndex table key
  -- Thaw in place, strip any existing entries for this key, then cons
  -- the new pair onto the front of the cleaned chain.
  bkts' <- thawArray bkts
  bucket <- readMutArray bkts' indx
  let (deleted,bucket') = deleteBucket (cmpr key) bucket
      k' = k + 1 - deleted      -- net key count after replace-or-insert
      table1 = table{ kcount=k' }
  writeMutArray bkts' indx ((key,val):bucket')
  freezeArray bkts'
  -- Resize check uses the post-insert count k', whereas 'insert' tests
  -- the pre-insert count; the heuristics deliberately differ by one.
  table2 <-
    if tooBig k' b -- off by one from insert's resize heuristic.
      then expandHashTable table1
      else return table1
  writeIORef ref table2
  return (deleted>0)
-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup (HashTable ref) key = do
  ht@HT{ buckets = arr, cmp = eq } <- readIORef ref
  chain <- readHTArray arr (bucketIndex ht key)
  -- Chains are consed at the front, so the first match is the
  -- most-recently-inserted mapping for this key.
  return (listToMaybe [ v | (k', v) <- chain, eq key k' ])
-- -----------------------------------------------------------------------------
-- Converting to/from lists
-- | Convert a list of key\/value pairs into a hash table. Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: (Eq key) => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash list = do
  table <- new (==) hash
  -- Insert in list order; later duplicates shadow earlier ones on
  -- lookup, matching association-list semantics.
  mapM_ (\(k, v) -> insert table k v) list
  return table
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: (Ord key, Ord val) => HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
  HT{ buckets = arr, bmask = top } <- readIORef ref
  -- Concatenate every bucket chain; no particular ordering of pairs
  -- is promised.
  chains <- mapM (readHTArray arr) [0 .. top]
  return (concat chains)
-- -----------------------------------------------------------------------------
-- Diagnostics
-- | This function is useful for determining whether your hash function
-- is working well for your data set. It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket. If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
  HT{ buckets = arr, bmask = top } <- readIORef ref
  chains <- mapM (readHTArray arr) [0 .. top]
  return (maximumBy byLength chains)
  where
    -- Compare two lists by length without fully computing either
    -- length: walk both spines in lock-step until one runs out.
    byLength (_:xs) (_:ys) = byLength xs ys
    byLength []     []     = EQ
    byLength []     _      = LT
    byLength _      _      = GT