{-# OPTIONS -fno-implicit-prelude #-}
-----------------------------------------------------------------------------
-- |
-- Module : Data.HashTable
-- Copyright : (c) The University of Glasgow 2003
-- License : BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer : libraries@haskell.org
-- Stability : provisional
-- Portability : portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Åke Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457. The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
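-- A small usage sketch (illustrative only; in real code you may need
-- to hide the Prelude's 'lookup'):
--
-- > main = do
-- >   t <- new (==) hashString
-- >   insert t "foo" (1 :: Int)
-- >   insert t "bar" 2
-- >   v <- lookup t "foo"      -- Just 1
-- >   print v
--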
-----------------------------------------------------------------------------
module Data.HashTable (
-- * Basic hash table operations
HashTable, new, insert, delete, lookup,
-- * Converting to and from lists
fromList, toList,
-- * Hash functions
-- $hash_functions
hashInt, hashString,
prime,
-- * Diagnostics
longestChain
) where
-- This module is imported by Data.Dynamic, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules
#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
import Prelude hiding ( lookup )
#endif
import Data.Tuple ( fst )
import Data.Bits
import Data.Maybe
import Data.List ( maximumBy, filter, length, concat )
import Data.Int ( Int32 )
#if defined(__GLASGOW_HASKELL__)
import GHC.Num
import GHC.Real ( Integral(..), fromIntegral )
import GHC.IOBase ( IO, IOArray, newIOArray, readIOArray, writeIOArray,
unsafeReadIOArray, unsafeWriteIOArray,
IORef, newIORef, readIORef, writeIORef )
import GHC.Err ( undefined )
#else
import Data.Char ( ord )
import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
# if defined(__HUGS__)
import Hugs.IOArray ( IOArray, newIOArray, readIOArray, writeIOArray,
unsafeReadIOArray, unsafeWriteIOArray )
# elif defined(__NHC__)
import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray)
# endif
#endif
import Control.Monad ( when, mapM, sequence_ )
-----------------------------------------------------------------------
myReadArray :: IOArray Int32 a -> Int32 -> IO a
myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
#if defined(DEBUG) || defined(__NHC__)
myReadArray = readIOArray
myWriteArray = writeIOArray
#else
myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif
-- | A hash table mapping keys of type @key@ to values of type @val@.
--
-- The implementation will grow the hash table as necessary, trying to
-- maintain a reasonable average load per bucket in the table.
--
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.
data HT key val
= HT {
split :: !Int32, -- Next bucket to split when expanding
max_bucket :: !Int32, -- Max bucket of smaller table
mask1 :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
mask2 :: !Int32, -- Mask for doing the mod of h_2 (larger table)
kcount :: !Int32, -- Number of keys
bcount :: !Int32, -- Number of buckets
dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
hash_fn :: key -> Int32,
cmp :: key -> key -> Bool
}
{-
ALTERNATIVE IMPLEMENTATION:
This works out slightly slower, because there's a tradeoff between
allocating a complete new HT structure each time a modification is
made (in the version above), and allocating new Int32s each time one
of them is modified, as below. Using FastMutInt instead of IORef
Int32 helps, but yields an implementation which has about the same
performance as the version above (and is more complex).
data HashTable key val
= HashTable {
split :: !(IORef Int32), -- Next bucket to split when expanding
max_bucket :: !(IORef Int32), -- Max bucket of smaller table
mask1 :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
mask2 :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
kcount :: !(IORef Int32), -- Number of keys
bcount :: !(IORef Int32), -- Number of buckets
dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
hash_fn :: key -> Int32,
cmp :: key -> key -> Bool
}
-}
-- -----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows. A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your key space consists of integral values whose low-order bits
-- vary widely between keys, you may get away with using 'fromIntegral'
-- (or even 'id', for 'Int32' keys) as the hash function.
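--
-- For instance (a sketch; @hashId@ is hypothetical and not part of
-- this module):
--
-- > hashId :: Int -> Int32
-- > hashId = fromIntegral   -- the low-order bits already vary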
--
-- We provide some sample hash functions for 'Int' and 'String' below.
-- | A sample hash function for 'Int', implemented as simply @(x `rem` P)@
-- where P is a suitable prime (currently 1500007). Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
hashInt = (`rem` prime) . fromIntegral
-- | A sample hash function for 'String's. The implementation is:
--
-- > hashString = fromIntegral . foldr f 0
-- > where f c m = ord c + (m * 128) `rem` 1500007
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
hashString = fromIntegral . foldr f 0
where f c m = ord c + (m * 128) `rem` fromIntegral prime
-- | A prime larger than the maximum hash table size
prime :: Int32
prime = 1500007
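-- As an illustration, a hash function for a compound key could mix the
-- two sample functions.  This @hashPair@ is a sketch, not part of the
-- module:
--
-- > hashPair :: (Int, String) -> Int32
-- > hashPair (i, s) = (hashInt i * 31 + hashString s) `rem` prime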
-- -----------------------------------------------------------------------------
-- Parameters
sEGMENT_SIZE = 1024 :: Int32 -- Size of a single hash table segment
sEGMENT_SHIFT = 10 :: Int -- derived
sEGMENT_MASK = 0x3ff :: Int32 -- derived
dIR_SIZE = 1024 :: Int32 -- Size of the segment directory
-- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
-- -----------------------------------------------------------------------------
-- Creating a new hash table
-- | Creates a new hash table.
new
:: (key -> key -> Bool) -- ^ An equality comparison on keys
-> (key -> Int32) -- ^ A hash function on keys
-> IO (HashTable key val) -- ^ Returns: an empty hash table
new cmp hash_fn = do
-- make a new hash table with a single, empty, segment
dir <- newIOArray (0,dIR_SIZE-1) undefined
segment <- newIOArray (0,sEGMENT_SIZE-1) []
myWriteArray dir 0 segment
let
split = 0
max = sEGMENT_SIZE
mask1 = (sEGMENT_SIZE - 1)
mask2 = (2 * sEGMENT_SIZE - 1)
kcount = 0
bcount = sEGMENT_SIZE
ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
}
table <- newIORef ht
return (HashTable table)
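-- For example, creating a table keyed by 'Int' (a sketch; @val@ stands
-- for whatever value type you store):
--
-- > table <- new (==) hashInt    -- :: IO (HashTable Int val)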
-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table
-- | Inserts a key\/value mapping into the hash table.
insert :: HashTable key val -> key -> val -> IO ()
insert (HashTable ref) key val = do
table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
let table1 = table{ kcount = k+1 }
table2 <-
if (k > hLOAD * b)
then expandHashTable table1
else return table1
writeIORef ref table2
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
myWriteArray segment segment_offset ((key,val):bucket)
return ()
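-- Note that 'insert' simply conses the new binding onto the front of
-- the bucket: inserting an existing key shadows the old binding,
-- 'lookup' returns the most recently inserted value, and 'delete'
-- removes every binding for the key.  For example:
--
-- > insert t "k" 1
-- > insert t "k" 2
-- > lookup t "k"        -- yields Just 2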
bucketIndex :: HT key val -> key -> IO Int32
bucketIndex HT{ hash_fn=hash_fn,
split=split,
mask1=mask1,
mask2=mask2 } key = do
let
h = fromIntegral (hash_fn key)
small_bucket = h .&. mask1
large_bucket = h .&. mask2
--
if small_bucket < split
then return large_bucket
else return small_bucket
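-- A worked example of the index calculation (illustrative numbers):
-- with mask1 = 1023, mask2 = 2047 and split = 10, a key hashing to
-- 1033 has small_bucket = 1033 .&. 1023 = 9; since 9 < split, that
-- bucket has already been split, so the larger-table bucket
-- 1033 .&. 2047 = 1033 is used instead.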
tableLocation :: HT key val -> key -> IO (Int32,Int32)
tableLocation table key = do
bucket_index <- bucketIndex table key
let
segment_index = bucket_index `shiftR` sEGMENT_SHIFT
segment_offset = bucket_index .&. sEGMENT_MASK
--
return (segment_index,segment_offset)
expandHashTable :: HT key val -> IO (HT key val)
expandHashTable
table@HT{ dir=dir,
split=split,
max_bucket=max,
mask2=mask2 } = do
let
oldsegment = split `shiftR` sEGMENT_SHIFT
oldindex = split .&. sEGMENT_MASK
newbucket = max + split
newsegment = newbucket `shiftR` sEGMENT_SHIFT
newindex = newbucket .&. sEGMENT_MASK
--
when (newindex == 0) $
do segment <- newIOArray (0,sEGMENT_SIZE-1) []
myWriteArray dir newsegment segment
--
let table' =
if (split+1) < max
then table{ split = split+1 }
-- we've expanded all the buckets in this table, so start from
-- the beginning again.
else table{ split = 0,
max_bucket = max * 2,
mask1 = mask2,
mask2 = mask2 `shiftL` 1 .|. 1 }
let
split_bucket old new [] = do
segment <- myReadArray dir oldsegment
myWriteArray segment oldindex old
segment <- myReadArray dir newsegment
myWriteArray segment newindex new
split_bucket old new ((k,v):xs) = do
h <- bucketIndex table' k
if h == newbucket
then split_bucket old ((k,v):new) xs
else split_bucket ((k,v):old) new xs
--
segment <- myReadArray dir oldsegment
bucket <- myReadArray segment oldindex
split_bucket [] [] bucket
return table'
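-- To illustrate one expansion step (with the initial sizes): when
-- max_bucket = 1024 and split = 0, splitting moves every entry of
-- bucket 0 whose hash has bit 10 set into the new bucket 1024 and
-- advances split to 1; after bucket 1023 has been split the table
-- wraps, doubling max_bucket and widening both masks by one bit.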
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table
-- | Removes an entry from the hash table.  Note that the stored key
-- count is not decremented, so deletions never shrink the table.
delete :: HashTable key val -> key -> IO ()
delete (HashTable ref) key = do
table@HT{ dir=dir, cmp=cmp } <- readIORef ref
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
return ()
-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup (HashTable ref) key = do
table@HT{ dir=dir, cmp=cmp } <- readIORef ref
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
case [ val | (key',val) <- bucket, cmp key key' ] of
[] -> return Nothing
(v:_) -> return (Just v)
-- -----------------------------------------------------------------------------
-- Converting to/from lists
-- | Convert a list of key\/value pairs into a hash table. Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash_fn list = do
table <- new (==) hash_fn
sequence_ [ insert table k v | (k,v) <- list ]
return table
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
--
let
max_segment = (max + split - 1) `quot` sEGMENT_SIZE
--
segments <- mapM (segmentContents dir) [0 .. max_segment]
return (concat segments)
where
segmentContents dir seg_index = do
segment <- myReadArray dir seg_index
bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
return (concat bs)
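-- A round-trip sketch ('toList' makes no ordering guarantees):
--
-- > t  <- fromList hashString [("one",1), ("two",2)]
-- > xs <- toList t            -- some permutation of the input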
-- -----------------------------------------------------------------------------
-- Diagnostics
-- | This function is useful for determining whether your hash function
-- is working well for your data set. It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket. If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
--
let
max_segment = (max + split - 1) `quot` sEGMENT_SIZE
--
--trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
return (maximumBy lengthCmp segments)
where
segmentMaxChainLength dir seg_index = do
segment <- myReadArray dir seg_index
bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
return (maximumBy lengthCmp bs)
lengthCmp x y = length x `compare` length y
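-- For example, a quick diagnostic (a sketch; 'when' is from
-- Control.Monad):
--
-- > chain <- longestChain t
-- > when (length chain > 10) $
-- >   putStrLn "consider trying a different hash function"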