{-# OPTIONS -fno-implicit-prelude #-}
-----------------------------------------------------------------------------
-- |
-- Module : Data.HashTable
-- Copyright : (c) The University of Glasgow 2003
-- License : BSD-style (see the file libraries/base/LICENSE)
--
-- Maintainer : libraries@haskell.org
-- Stability : provisional
-- Portability : portable
--
-- An implementation of extensible hash tables, as described in
-- Per-Åke Larson, /Dynamic Hash Tables/, CACM 31(4), April 1988,
-- pp. 446--457. The implementation is also derived from the one
-- in GHC's runtime system (@ghc\/rts\/Hash.{c,h}@).
--
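-- A small usage sketch (illustrative only; in real code you may need
-- to hide the Prelude's 'lookup'):
--
-- > main = do
-- >   t <- new (==) hashString
-- >   insert t "foo" (1 :: Int)
-- >   insert t "bar" 2
-- >   v <- lookup t "foo"      -- Just 1
-- >   print v
--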
-----------------------------------------------------------------------------
module Data.HashTable (
-- * Basic hash table operations
HashTable, new, insert, delete, lookup,
-- * Converting to and from lists
fromList, toList,
-- * Hash functions
-- $hash_functions
hashInt, hashString,
prime,
-- * Diagnostics
longestChain
) where
-- This module is imported by Data.Dynamic, which is pretty low down in the
-- module hierarchy, so don't import "high-level" modules
#ifdef __GLASGOW_HASKELL__
import GHC.Base
#else
import Prelude hiding ( lookup )
#endif
import Data.Tuple ( fst )
import Data.Bits
import Data.Maybe
import Data.List ( maximumBy, filter, length, concat )
import Data.Int ( Int32 )
#if defined(__GLASGOW_HASKELL__)
import GHC.Num
import GHC.Real ( Integral(..), fromIntegral )
import GHC.IOBase ( IO, IOArray, newIOArray, readIOArray, writeIOArray,
unsafeReadIOArray, unsafeWriteIOArray,
IORef, newIORef, readIORef, writeIORef )
import GHC.Err ( undefined )
#else
import Data.Char ( ord )
import Data.IORef ( IORef, newIORef, readIORef, writeIORef )
# if defined(__HUGS__)
import Hugs.IOArray ( IOArray, newIOArray, readIOArray, writeIOArray,
unsafeReadIOArray, unsafeWriteIOArray )
# elif defined(__NHC__)
import NHC.IOExtras ( IOArray, newIOArray, readIOArray, writeIOArray)
# endif
#endif
import Control.Monad ( when, mapM, sequence_ )
-----------------------------------------------------------------------
myReadArray :: IOArray Int32 a -> Int32 -> IO a
myWriteArray :: IOArray Int32 a -> Int32 -> a -> IO ()
#if defined(DEBUG) || defined(__NHC__)
myReadArray = readIOArray
myWriteArray = writeIOArray
#else
myReadArray arr i = unsafeReadIOArray arr (fromIntegral i)
myWriteArray arr i x = unsafeWriteIOArray arr (fromIntegral i) x
#endif
-- | A hash table mapping keys of type @key@ to values of type @val@.
--
-- The implementation will grow the hash table as necessary, trying to
-- maintain a reasonable average load per bucket in the table.
--
newtype HashTable key val = HashTable (IORef (HT key val))
-- TODO: the IORef should really be an MVar.
data HT key val
= HT {
split :: !Int32, -- Next bucket to split when expanding
max_bucket :: !Int32, -- Max bucket of smaller table
mask1 :: !Int32, -- Mask for doing the mod of h_1 (smaller table)
mask2 :: !Int32, -- Mask for doing the mod of h_2 (larger table)
kcount :: !Int32, -- Number of keys
bcount :: !Int32, -- Number of buckets
dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
hash_fn :: key -> Int32,
cmp :: key -> key -> Bool
}
{-
ALTERNATIVE IMPLEMENTATION:
This works out slightly slower, because there's a tradeoff between
allocating a complete new HT structure each time a modification is
made (in the version above), and allocating new Int32s each time one
of them is modified, as below. Using FastMutInt instead of IORef
Int32 helps, but yields an implementation which has about the same
performance as the version above (and is more complex).
data HashTable key val
= HashTable {
split :: !(IORef Int32), -- Next bucket to split when expanding
max_bucket :: !(IORef Int32), -- Max bucket of smaller table
mask1 :: !(IORef Int32), -- Mask for doing the mod of h_1 (smaller table)
mask2 :: !(IORef Int32), -- Mask for doing the mod of h_2 (larger table)
kcount :: !(IORef Int32), -- Number of keys
bcount :: !(IORef Int32), -- Number of buckets
dir :: !(IOArray Int32 (IOArray Int32 [(key,val)])),
hash_fn :: key -> Int32,
cmp :: key -> key -> Bool
}
-}
-- -----------------------------------------------------------------------------
-- Sample hash functions
-- $hash_functions
--
-- This implementation of hash tables uses the low-order /n/ bits of the hash
-- value for a key, where /n/ varies as the hash table grows. A good hash
-- function therefore will give an even distribution regardless of /n/.
--
-- If your key space consists of integral values whose low-order bits
-- vary widely between keys, you may get away with using 'fromIntegral'
-- (or even 'id', for 'Int32' keys) as the hash function.
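--
-- For instance (a sketch; @hashId@ is hypothetical and not part of
-- this module):
--
-- > hashId :: Int -> Int32
-- > hashId = fromIntegral   -- the low-order bits already vary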
--
-- We provide some sample hash functions for 'Int' and 'String' below.
-- | A sample hash function for 'Int', implemented as simply @(x `rem` P)@
-- where P is a suitable prime (currently 1500007). Should give
-- reasonable results for most distributions of 'Int' values, except
-- when the keys are all multiples of the prime!
--
hashInt :: Int -> Int32
hashInt = (`rem` prime) . fromIntegral
-- | A sample hash function for 'String's. The implementation is:
--
-- > hashString = fromIntegral . foldr f 0
-- > where f c m = ord c + (m * 128) `rem` 1500007
--
-- which seems to give reasonable results.
--
hashString :: String -> Int32
hashString = fromIntegral . foldr f 0
where f c m = ord c + (m * 128) `rem` fromIntegral prime
-- | A prime larger than the maximum hash table size
prime :: Int32
prime = 1500007
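-- As an illustration, a hash function for a compound key could mix the
-- two sample functions.  This @hashPair@ is a sketch, not part of the
-- module:
--
-- > hashPair :: (Int, String) -> Int32
-- > hashPair (i, s) = (hashInt i * 31 + hashString s) `rem` prime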
-- -----------------------------------------------------------------------------
-- Parameters
sEGMENT_SIZE = 1024 :: Int32 -- Size of a single hash table segment
sEGMENT_SHIFT = 10 :: Int -- derived
sEGMENT_MASK = 0x3ff :: Int32 -- derived
dIR_SIZE = 1024 :: Int32 -- Size of the segment directory
-- Maximum hash table size is sEGMENT_SIZE * dIR_SIZE
hLOAD = 4 :: Int32 -- Maximum average load of a single hash bucket
-- -----------------------------------------------------------------------------
-- Creating a new hash table
-- | Creates a new hash table.
new
:: (key -> key -> Bool) -- ^ An equality comparison on keys
-> (key -> Int32) -- ^ A hash function on keys
-> IO (HashTable key val) -- ^ Returns: an empty hash table
new cmp hash_fn = do
-- make a new hash table with a single, empty, segment
dir <- newIOArray (0,dIR_SIZE-1) undefined
segment <- newIOArray (0,sEGMENT_SIZE-1) []
myWriteArray dir 0 segment
let
split = 0
max = sEGMENT_SIZE
mask1 = (sEGMENT_SIZE - 1)
mask2 = (2 * sEGMENT_SIZE - 1)
kcount = 0
bcount = sEGMENT_SIZE
ht = HT { dir=dir, split=split, max_bucket=max, mask1=mask1, mask2=mask2,
kcount=kcount, bcount=bcount, hash_fn=hash_fn, cmp=cmp
}
table <- newIORef ht
return (HashTable table)
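-- For example, creating a table keyed by 'Int' (a sketch; @val@ stands
-- for whatever value type you store):
--
-- > table <- new (==) hashInt    -- :: IO (HashTable Int val)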
-- -----------------------------------------------------------------------------
-- Inserting a key\/value pair into the hash table
-- | Inserts a key\/value mapping into the hash table.
insert :: HashTable key val -> key -> val -> IO ()
insert (HashTable ref) key val = do
table@HT{ kcount=k, bcount=b, dir=dir } <- readIORef ref
let table1 = table{ kcount = k+1 }
table2 <-
if (k > hLOAD * b)
then expandHashTable table1
else return table1
writeIORef ref table2
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
myWriteArray segment segment_offset ((key,val):bucket)
return ()
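-- Note that 'insert' simply conses the new binding onto the front of
-- the bucket: inserting an existing key shadows the old binding,
-- 'lookup' returns the most recently inserted value, and 'delete'
-- removes every binding for the key.  For example:
--
-- > insert t "k" 1
-- > insert t "k" 2
-- > lookup t "k"        -- yields Just 2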
bucketIndex :: HT key val -> key -> IO Int32
bucketIndex HT{ hash_fn=hash_fn,
split=split,
mask1=mask1,
mask2=mask2 } key = do
let
h = fromIntegral (hash_fn key)
small_bucket = h .&. mask1
large_bucket = h .&. mask2
--
if small_bucket < split
then return large_bucket
else return small_bucket
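-- A worked example of the index calculation (illustrative numbers):
-- with mask1 = 1023, mask2 = 2047 and split = 10, a key hashing to
-- 1033 has small_bucket = 1033 .&. 1023 = 9; since 9 < split, that
-- bucket has already been split, so the larger-table bucket
-- 1033 .&. 2047 = 1033 is used instead.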
tableLocation :: HT key val -> key -> IO (Int32,Int32)
tableLocation table key = do
bucket_index <- bucketIndex table key
let
segment_index = bucket_index `shiftR` sEGMENT_SHIFT
segment_offset = bucket_index .&. sEGMENT_MASK
--
return (segment_index,segment_offset)
expandHashTable :: HT key val -> IO (HT key val)
expandHashTable
table@HT{ dir=dir,
split=split,
max_bucket=max,
mask2=mask2 } = do
let
oldsegment = split `shiftR` sEGMENT_SHIFT
oldindex = split .&. sEGMENT_MASK
newbucket = max + split
newsegment = newbucket `shiftR` sEGMENT_SHIFT
newindex = newbucket .&. sEGMENT_MASK
--
when (newindex == 0) $
do segment <- newIOArray (0,sEGMENT_SIZE-1) []
myWriteArray dir newsegment segment
--
let table' =
if (split+1) < max
then table{ split = split+1 }
-- we've expanded all the buckets in this table, so start from
-- the beginning again.
else table{ split = 0,
max_bucket = max * 2,
mask1 = mask2,
mask2 = mask2 `shiftL` 1 .|. 1 }
let
split_bucket old new [] = do
segment <- myReadArray dir oldsegment
myWriteArray segment oldindex old
segment <- myReadArray dir newsegment
myWriteArray segment newindex new
split_bucket old new ((k,v):xs) = do
h <- bucketIndex table' k
if h == newbucket
then split_bucket old ((k,v):new) xs
else split_bucket ((k,v):old) new xs
--
segment <- myReadArray dir oldsegment
bucket <- myReadArray segment oldindex
split_bucket [] [] bucket
return table'
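-- To illustrate one expansion step (with the initial sizes): when
-- max_bucket = 1024 and split = 0, splitting moves every entry of
-- bucket 0 whose hash has bit 10 set into the new bucket 1024 and
-- advances split to 1; after bucket 1023 has been split the table
-- wraps, doubling max_bucket and widening both masks by one bit.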
-- -----------------------------------------------------------------------------
-- Deleting a mapping from the hash table
-- | Removes an entry from the hash table.  Note that the stored key
-- count is not decremented, so deletions never shrink the table.
delete :: HashTable key val -> key -> IO ()
delete (HashTable ref) key = do
table@HT{ dir=dir, cmp=cmp } <- readIORef ref
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
myWriteArray segment segment_offset (filter (not.(key `cmp`).fst) bucket)
return ()
-- -----------------------------------------------------------------------------
-- Looking up an entry in the hash table
-- | Looks up the value of a key in the hash table.
lookup :: HashTable key val -> key -> IO (Maybe val)
lookup (HashTable ref) key = do
table@HT{ dir=dir, cmp=cmp } <- readIORef ref
(segment_index,segment_offset) <- tableLocation table key
segment <- myReadArray dir segment_index
bucket <- myReadArray segment segment_offset
case [ val | (key',val) <- bucket, cmp key key' ] of
[] -> return Nothing
(v:_) -> return (Just v)
-- -----------------------------------------------------------------------------
-- Converting to/from lists
-- | Convert a list of key\/value pairs into a hash table. Equality on keys
-- is taken from the Eq instance for the key type.
--
fromList :: Eq key => (key -> Int32) -> [(key,val)] -> IO (HashTable key val)
fromList hash_fn list = do
table <- new (==) hash_fn
sequence_ [ insert table k v | (k,v) <- list ]
return table
-- | Converts a hash table to a list of key\/value pairs.
--
toList :: HashTable key val -> IO [(key,val)]
toList (HashTable ref) = do
HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
--
let
max_segment = (max + split - 1) `quot` sEGMENT_SIZE
--
segments <- mapM (segmentContents dir) [0 .. max_segment]
return (concat segments)
where
segmentContents dir seg_index = do
segment <- myReadArray dir seg_index
bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
return (concat bs)
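-- A round-trip sketch ('toList' makes no ordering guarantees):
--
-- > t  <- fromList hashString [("one",1), ("two",2)]
-- > xs <- toList t            -- some permutation of the input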
-- -----------------------------------------------------------------------------
-- Diagnostics
-- | This function is useful for determining whether your hash function
-- is working well for your data set. It returns the longest chain
-- of key\/value pairs in the hash table for which all the keys hash to
-- the same bucket. If this chain is particularly long (say, longer
-- than 10 elements), then it might be a good idea to try a different
-- hash function.
--
longestChain :: HashTable key val -> IO [(key,val)]
longestChain (HashTable ref) = do
HT{ dir=dir, max_bucket=max, split=split } <- readIORef ref
--
let
max_segment = (max + split - 1) `quot` sEGMENT_SIZE
--
--trace ("maxChainLength: max = " ++ show max ++ ", split = " ++ show split ++ ", max_segment = " ++ show max_segment) $ do
segments <- mapM (segmentMaxChainLength dir) [0 .. max_segment]
return (maximumBy lengthCmp segments)
where
segmentMaxChainLength dir seg_index = do
segment <- myReadArray dir seg_index
bs <- mapM (myReadArray segment) [0 .. sEGMENT_SIZE-1]
return (maximumBy lengthCmp bs)
lengthCmp x y = length x `compare` length y
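-- For example, a quick diagnostic (a sketch; 'when' is from
-- Control.Monad):
--
-- > chain <- longestChain t
-- > when (length chain > 10) $
-- >   putStrLn "consider trying a different hash function"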