File: HashTable.hs

package info (click to toggle)
hugs98 98.200311-4
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 12,964 kB
  • ctags: 8,084
  • sloc: ansic: 67,521; haskell: 61,497; xml: 4,566; sh: 3,264; cpp: 1,936; yacc: 1,094; makefile: 915; cs: 883; sed: 10
file content (376 lines) | stat: -rw-r--r-- 12,640 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
{-# OPTIONS -fno-implicit-prelude #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Data.HashTable
-- Copyright   :  (c) The University of Glasgow 2003
-- License     :  BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  provisional
-- Portability :  portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Ake Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457.  The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
-----------------------------------------------------------------------------

module Data.HashTable (
	-- * Basic hash table operations
	HashTable, new, insert, delete, lookup,
	-- * Converting to and from lists
	fromList, toList,
	-- * Hash functions
	-- $hash_functions
	hashInt, hashString,
	prime,
	-- * Diagnostics
	longestChain
 ) where

-- This module is imported by Data.Dynamic, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules

#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
import Prelude	hiding	( lookup )
#endif
import Data.Tuple	( fst )
import Data.Bits
import Data.Maybe
import Data.List	( maximumBy, filter, length, concat )
import Data.Int		( Int32 )

#if defined(__GLASGOW_HASKELL__)
import GHC.Num
import GHC.Real		( Integral(..), fromIntegral )

import GHC.IOBase	( IO, IOArray, newIOArray, readIOArray, writeIOArray,
			  unsafeReadIOArray, unsafeWriteIOArray,
			  IORef, newIORef, readIORef, writeIORef )
import GHC.Err		( undefined )
#else
import Data.Char	( ord )
import Data.IORef	( IORef, newIORef, readIORef, writeIORef )
#  if defined(__HUGS__)
import Hugs.IOArray	( IOArray, newIOArray, readIOArray, writeIOArray,
			  unsafeReadIOArray, unsafeWriteIOArray )
#  elif defined(__NHC__)
import NHC.IOExtras	( IOArray, newIOArray, readIOArray, writeIOArray)
#  endif
#endif
import Control.Monad	( when, mapM, sequence_ )


-----------------------------------------------------------------------
-- Read/write wrappers over the segment arrays.  The bounds-checked
-- primitives are used for debug builds and for nhc98 (which provides no
-- unchecked array operations); otherwise the unsafe (unchecked) versions
-- are used, since every index passed in is computed from masks that keep
-- it within the array bounds.
myReadArray  :: IOArray Int32 a -> Int32 -> IO a
myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
#if defined(DEBUG) || defined(__NHC__)
myReadArray  = readIOArray
myWriteArray = writeIOArray
#else
-- fromIntegral converts the Int32 index to the Int the unsafe ops expect
myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif

-- | A hash table mapping keys of type @key@ to values of type @val@.
--
-- The implementation will grow the hash table as necessary, trying to
-- maintain a reasonable average load per bucket in the table.
--
-- The table is a mutable HT record behind a single IORef; every update
-- reads the record, modifies it, and writes it back.
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar, since a plain IORef gives no
-- protection against interleaved updates from concurrent threads.

-- Internal state of the hash table (Larson's linear-hashing scheme).
-- The live buckets are indices 0 .. max_bucket + split - 1, stored in
-- fixed-size segments reached through the 'dir' directory array.
data HT key val
  = HT {
        split  :: !Int32, -- Next bucket to split when expanding
        max_bucket :: !Int32, -- Max bucket of smaller table
        mask1  :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
        mask2  :: !Int32, -- Mask for doing the mod of h_2 (larger table)
        kcount :: !Int32, -- Number of keys
        bcount :: !Int32, -- Number of buckets
        dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
        hash_fn :: key -> Int32,      -- hash function supplied to 'new'
        cmp    :: key -> key -> Bool  -- key equality supplied to 'new'
   }

{-
ALTERNATIVE IMPLEMENTATION:

This works out slightly slower, because there's a tradeoff between
allocating a complete new HT structure each time a modification is
made (in the version above), and allocating new Int32s each time one
of them is modified, as below.  Using FastMutInt instead of IORef
Int32 helps, but yields an implementation which has about the same
performance as the version above (and is more complex).

data HashTable key val
  = HashTable {
	split  :: !(IORef Int32), -- Next bucket to split when expanding
	max_bucket :: !(IORef Int32), -- Max bucket of smaller table
	mask1  :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
	mask2  :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
	kcount :: !(IORef Int32), -- Number of keys
	bcount :: !(IORef Int32), -- Number of buckets
	dir    :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
	hash_fn :: key -> Int32,
	cmp    :: key -> key -> Bool
   }
-}


-- -----------------------------------------------------------------------------
-- Sample hash functions

-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows.  A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your keyspace is integrals such that the low-order bits between
-- keys are highly variable, then you could get away with using 'id'
-- as the hash function.
--
-- We provide some sample hash functions for 'Int' and 'String' below.

-- | A sample hash function for 'Int', implemented as simply @(x `mod` P)@
-- where P is a suitable prime (currently 1500007).  Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
-- Narrow to Int32 first, then reduce modulo the table prime.  (`rem`
-- preserves the sign of its first argument, but the bucket-index masks
-- only look at the low-order bits, so a negative result is harmless.)
hashInt n = fromIntegral n `rem` prime

-- | A sample hash function for 'String's.  The implementation is:
--
-- >    hashString = fromIntegral . foldr f 0
-- >      where f c m = ord c + (m * 128) `rem` 1500007
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
-- Horner-style accumulation over the characters (base 128), taking the
-- running product modulo 'prime' at each step to bound the accumulator.
hashString str = fromIntegral (foldr step 0 str)
  where
    -- NB: `rem` binds tighter than (+), so only the product is reduced.
    step c acc = ord c + ((acc * 128) `rem` fromIntegral prime)

-- | A prime larger than the maximum hash table size
-- (sEGMENT_SIZE * dIR_SIZE = 1048576 < 1500007); used as the modulus by
-- the sample hash functions above.
prime :: Int32
prime = 1500007

-- -----------------------------------------------------------------------------
-- Parameters

sEGMENT_SIZE  = 1024  :: Int32  -- Size of a single hash table segment
sEGMENT_SHIFT = 10    :: Int  -- derived: log2 sEGMENT_SIZE (Int, since shift counts are Ints)
sEGMENT_MASK  = 0x3ff :: Int32  -- derived: sEGMENT_SIZE - 1, extracts the in-segment offset

dIR_SIZE = 1024  :: Int32  -- Size of the segment directory
	-- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE

hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
	-- ('insert' triggers one expansion step once kcount > hLOAD * bcount)

-- -----------------------------------------------------------------------------
-- Creating a new hash table

-- | Creates a new hash table
new
  :: (key -> key -> Bool)    -- ^ An equality comparison on keys
  -> (key -> Int32)	     -- ^ A hash function on keys
  -> IO (HashTable key val)  -- ^ Returns: an empty hash table

-- Build a table with a single, empty segment installed in directory
-- slot 0.  The remaining directory slots hold 'undefined' until
-- expandHashTable allocates their segments, so they must never be read
-- before being written.
new cmp hash_fn = do
  -- BUG FIX: the directory's valid indices are 0 .. dIR_SIZE-1 (the
  -- documented maximum is sEGMENT_SIZE * dIR_SIZE buckets), so allocate
  -- exactly dIR_SIZE slots — the previous bounds (0,dIR_SIZE) allocated
  -- one slot too many, inconsistent with the segment allocation below.
  dir     <- newIOArray (0,dIR_SIZE-1) undefined
  segment <- newIOArray (0,sEGMENT_SIZE-1) []
  myWriteArray dir 0 segment

  let
    -- Initial geometry: one full segment of empty buckets; no keys yet.
    -- mask1/mask2 select the low-order bits for the smaller/larger
    -- virtual tables of the linear-hashing scheme.
    ht = HT { dir        = dir,
              split      = 0,
              max_bucket = sEGMENT_SIZE,
              mask1      = sEGMENT_SIZE - 1,
              mask2      = 2 * sEGMENT_SIZE - 1,
              kcount     = 0,
              bcount     = sEGMENT_SIZE,
              hash_fn    = hash_fn,
              cmp        = cmp
            }

  table <- newIORef ht
  return (HashTable table)

-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table

-- | Inserts an key\/value mapping into the hash table.
insert :: HashTable key val -> key -> val -> IO ()

-- Prepends the new pair to its bucket's chain (duplicate keys are not
-- replaced; 'lookup' returns the most recently inserted one first).
insert (HashTable ref) key val = do
  table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
  let table1 = table{ kcount = k+1 }
  table2 <-
	if (k > hLOAD * b)
	   then expandHashTable table1
	   else return table1
  writeIORef ref table2
  -- BUG FIX: locate the bucket with the *post-expansion* table (table2),
  -- not the stale snapshot 'table'.  If the expansion step just split
  -- this key's small-table bucket, the old split/mask values would place
  -- the pair in the pre-split bucket, where subsequent lookups (which
  -- use the updated split) would never find it.
  -- ('dir' from the old record is still valid: the directory array is
  -- mutated in place and never replaced.)
  (segment_index,segment_offset) <- tableLocation table2 key
  segment <- myReadArray dir segment_index
  bucket <- myReadArray segment segment_offset
  myWriteArray segment segment_offset ((key,val):bucket)
  return ()

bucketIndex :: HT key val -> key -> IO Int32
-- Map a key to its current bucket index.  Buckets below 'split' have
-- already been split, so for those the larger table's mask applies.
bucketIndex HT{ hash_fn=hash, split=boundary, mask1=m1, mask2=m2 } key =
    return (if small < boundary then large else small)
  where
    h     = fromIntegral (hash key)
    small = h .&. m1   -- index in the smaller virtual table
    large = h .&. m2   -- index in the larger virtual table

tableLocation :: HT key val -> key -> IO (Int32,Int32)
-- Resolve a key to (directory slot, offset within that segment).
tableLocation table key = do
  ix <- bucketIndex table key
  return (ix `shiftR` sEGMENT_SHIFT, ix .&. sEGMENT_MASK)

expandHashTable :: HT key val -> IO (HT key val)
-- One step of Larson's incremental expansion: split the bucket at index
-- 'split' in two, redistributing its chain between the old bucket and a
-- new bucket at index max_bucket + split.  Called from 'insert' when the
-- average load exceeds hLOAD.
expandHashTable
      table@HT{ dir=dir,
		split=split,
		max_bucket=max,
		mask2=mask2 } = do
  let
      -- directory slot / in-segment offset of the bucket being split
      oldsegment = split `shiftR` sEGMENT_SHIFT
      oldindex   = split .&. sEGMENT_MASK

      newbucket  = max + split
      newsegment = newbucket `shiftR` sEGMENT_SHIFT
      newindex   = newbucket .&. sEGMENT_MASK
  --
  -- offset 0 means the new bucket starts a fresh segment: allocate it
  -- and install it in the directory before anything touches it.
  when (newindex == 0) $
	do segment <- newIOArray (0,sEGMENT_SIZE-1) []
	   myWriteArray dir newsegment segment
  --
  let table' =
  	if (split+1) < max
     	    then table{ split = split+1 }
     		-- we've expanded all the buckets in this table, so start from
		-- the beginning again.
     	    else table{ split = 0,
			max_bucket = max * 2,
			mask1 = mask2,
			mask2 = mask2 `shiftL` 1 .|. 1 }
  let
    -- Partition the old chain between the old and new buckets; the
    -- terminal case writes both chains back.  Rehashing uses the
    -- *updated* table' so the post-split masks decide each pair's home.
    split_bucket old new [] = do
	segment <- myReadArray dir oldsegment
	myWriteArray segment oldindex old
	segment <- myReadArray dir newsegment
	myWriteArray segment newindex new
    split_bucket old new ((k,v):xs) = do
	h <- bucketIndex table' k
	if h == newbucket
		then split_bucket old ((k,v):new) xs
		else split_bucket ((k,v):old) new xs
  --
  segment <- myReadArray dir oldsegment
  bucket <- myReadArray segment oldindex
  split_bucket [] [] bucket
  return table'

-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table

-- | Remove an entry from the hash table.
delete :: HashTable key val -> key -> IO ()

-- Removes every pair whose key compares equal under the table's 'cmp';
-- deleting an absent key is a no-op.
delete (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (seg_ix, seg_off) <- tableLocation table key
  segment <- myReadArray dir seg_ix
  chain   <- myReadArray segment seg_off
  let survivors = [ p | p@(k',_) <- chain, not (key `cmp` k') ]
  myWriteArray segment seg_off survivors
  return ()

-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table

-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)

-- Returns the first matching value in the bucket chain (the most
-- recently inserted one), or Nothing when no key compares equal.
lookup (HashTable ref) key = do
  table@HT{ dir=dir, cmp=cmp } <- readIORef ref
  (seg_ix, seg_off) <- tableLocation table key
  segment <- myReadArray dir seg_ix
  chain   <- myReadArray segment seg_off
  return (listToMaybe [ v | (k', v) <- chain, cmp key k' ])

-- -----------------------------------------------------------------------------
-- Converting to/from lists

-- | Convert a list of key\/value pairs into a hash table.  Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
-- Builds a fresh table and inserts the pairs left to right, so later
-- duplicates shadow earlier ones on 'lookup'.
fromList hash_fn pairs = do
  table <- new (==) hash_fn
  sequence_ (map (\(k,v) -> insert table k v) pairs)
  return table

-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
-- Flattens every chain of every allocated segment.  Unused buckets in
-- the last segment hold [] and contribute nothing to the result.
toList (HashTable ref) = do
  HT{ dir=dir, max_bucket=mb, split=sp } <- readIORef ref
  let
    -- index of the segment holding the highest live bucket (mb + sp - 1)
    top_segment = (mb + sp - 1) `quot` sEGMENT_SIZE
  contents <- mapM (segmentPairs dir) [0 .. top_segment]
  return (concat contents)
 where
   segmentPairs dir seg_ix = do
     segment <- myReadArray dir seg_ix
     chains <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
     return (concat chains)

-- -----------------------------------------------------------------------------
-- Diagnostics

-- | This function is useful for determining whether your hash function
-- is working well for your data set.  It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket.  If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
-- Scans every allocated segment, takes the longest chain within each,
-- then the longest of those per-segment champions.
longestChain (HashTable ref) = do
  HT{ dir=dir, max_bucket=mb, split=sp } <- readIORef ref
  let
    -- index of the segment holding the highest live bucket (mb + sp - 1)
    top_segment = (mb + sp - 1) `quot` sEGMENT_SIZE
  champions <- mapM (longestIn dir) [0 .. top_segment]
  return (maximumBy byLength champions)
 where
   longestIn dir seg_ix = do
     segment <- myReadArray dir seg_ix
     chains <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
     return (maximumBy byLength chains)

   byLength xs ys = compare (length xs) (length ys)