File: libchash.h

package info (click to toggle)
sparsehash 2.0.3-2
  • links: PTS
  • area: main
  • in suites: bookworm, bullseye, forky, sid, trixie
  • size: 1,756 kB
  • sloc: cpp: 6,600; sh: 3,960; ansic: 1,317; makefile: 240
file content (253 lines) | stat: -rw-r--r-- 12,423 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
/* Copyright (c) 1998 - 2005, Google Inc.
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ---
 * Author: Craig Silverstein
 *
 *  This library is intended to be used for in-memory hash tables,
 *  though it provides rudimentary permanent-storage capabilities.
 *  It attempts to be fast, portable, and small.  The best algorithm
 *  to fulfill these goals is an internal probing hashing algorithm,
 *  as in Knuth, _Art of Computer Programming_, vol III.  Unlike
 *  chained (open) hashing, it doesn't require a pointer for every
 *  item, yet it is still constant time lookup in practice.
 *
 *  Also to save space, we let the contents (both data and key) that
 *  you insert be a union: if the key/data is small, we store it
 *  directly in the hashtable, otherwise we store a pointer to it.
 *  To keep you from having to figure out which, use KEY_PTR and
 *  PTR_KEY to convert between the arguments to these functions and
 *  a pointer to the real data.  For instance:
 *     char key[] = "ab", *key2;
 *     HTItem *bck; HashTable *ht;
 *     HashInsert(ht, PTR_KEY(ht, key), 0);
 *     bck = HashFind(ht, PTR_KEY(ht, "ab"));
 *     key2 = KEY_PTR(ht, bck->key);
 *
 *  There are a rich set of operations supported:
 *     AllocateHashTable() -- Allocates a hashtable structure and
 *                            returns it.
 *        cchKey: if it's a positive number, then each key is a
 *                fixed-length record of that length.  If it's 0,
 *                the key is assumed to be a \0-terminated string.
 *        fSaveKey: normally, you are responsible for allocating
 *                  space for the key.  If this is 1, we make a
 *                  copy of the key for you.
 *     ClearHashTable() -- Removes everything from a hashtable
 *     FreeHashTable() -- Frees memory used by a hashtable
 *
 *     HashFind() -- takes a key (use PTR_KEY) and returns the
 *                   HTItem containing that key, or NULL if the
 *                   key is not in the hashtable.
 *     HashFindLast() -- returns the item found by last HashFind()
 *     HashFindOrInsert() -- inserts the key/data pair if the key
 *                           is not already in the hashtable, or
 *                           returns the appropraite HTItem if it is.
 *     HashFindOrInsertItem() -- takes key/data as an HTItem.
 *     HashInsert() -- adds a key/data pair to the hashtable.  What
 *                     it does if the key is already in the table
 *                     depends on the value of SAMEKEY_OVERWRITE.
 *     HashInsertItem() -- takes key/data as an HTItem.
 *     HashDelete() -- removes a key/data pair from the hashtable,
 *                     if it's there.  RETURNS 1 if it was there,
 *                     0 else.
 *        If you use sparse tables and never delete, the full data
 *        space is available.  Otherwise we steal -2 (maybe -3),
 *        so you can't have data fields with those values.
 *     HashDeleteLast() -- deletes the item returned by the last Find().
 *
 *     HashFirstBucket() -- used to iterate over the buckets in a 
 *                          hashtable.  DON'T INSERT OR DELETE WHILE
 *                          ITERATING!  You can't nest iterations.
 *     HashNextBucket() -- RETURNS NULL at the end of iterating.
 *
 *     HashSetDeltaGoalSize() -- if you're going to insert 1000 items
 *                               at once, call this fn with arg 1000.
 *                               It grows the table more intelligently.
 *
 *     HashSave() -- saves the hashtable to a file.  It saves keys ok,
 *                   but it doesn't know how to interpret the data field,
 *                   so if the data field is a pointer to some complex
 *                   structure, you must send a function that takes a
 *                   file pointer and a pointer to the structure, and
 *                   write whatever you want to write.  It should return
 *                   the number of bytes written.  If the file is NULL,
 *                   it should just return the number of bytes it would
 *                   write, without writing anything.
 *                      If your data field is just an integer, not a
 *                   pointer, just send NULL for the function.
 *     HashLoad() -- loads a hashtable.  It needs a function that takes
 *                   a file and the size of the structure, and expects
 *                   you to read in the structure and return a pointer
 *                   to it.  You must do memory allocation, etc.  If
 *                   the data is just a number, send NULL.
 *     HashLoadKeys() -- unlike HashLoad(), doesn't load the data off disk
 *                       until needed.  This saves memory, but if you look
 *                       up the same key a lot, it does a disk access each
 *                       time.
 *        You can't do Insert() or Delete() on hashtables that were loaded
 *        from disk.
 */

#include <sparsehash/internal/sparseconfig.h>
#include <sys/types.h>         /* includes definition of "ulong", we hope */
#define ulong u_long

#define MAGIC_KEY             "CHsh"   /* when we save the file */

#ifndef LOG_WORD_SIZE                  /* 5 for 32 bit words, 6 for 64 */
#if defined (__LP64__) || defined (_LP64)
#define LOG_WORD_SIZE          6       /* log_2(sizeof(ulong)) [in bits] */
#else
#define LOG_WORD_SIZE          5       /* log_2(sizeof(ulong)) [in bits] */
#endif
#endif

   /* The following gives a speed/time tradeoff: how many buckets are  *
    * in each bin.  0 gives 32 buckets/bin, which is a good number.    */
#ifndef LOG_BM_WORDS
#define LOG_BM_WORDS        0      /* each group has 2^L_B_W * 32 buckets */
#endif

   /* The following are all parameters that affect performance. */
#ifndef JUMP
#define JUMP(key, offset)   ( ++(offset) )  /* ( 1 ) for linear hashing */
#endif
#ifndef Table
#define Table(x)            Sparse##x       /* Dense##x for dense tables */
#endif
#ifndef FAST_DELETE
#define FAST_DELETE         0      /* if it's 1, we never shrink the ht */
#endif
#ifndef SAMEKEY_OVERWRITE
#define SAMEKEY_OVERWRITE   1      /* overwrite item with our key on insert? */
#endif
#ifndef OCCUPANCY_PCT
#define OCCUPANCY_PCT       0.5    /* large PCT means smaller and slower */
#endif
#ifndef MIN_HASH_SIZE
#define MIN_HASH_SIZE       512    /* ht size when first created */
#endif
   /* When deleting a bucket, we can't just empty it (future hashes  *
    * may fail); instead we set the data field to DELETED.  Thus you *
    * should set DELETED to a data value you never use.  Better yet, *
    * if you don't need to delete, define INSERT_ONLY.               */
#ifndef INSERT_ONLY
#define DELETED                   -2UL
#define IS_BCK_DELETED(bck)       ( (bck) && (bck)->data == DELETED )
#define SET_BCK_DELETED(ht, bck)  do { (bck)->data = DELETED;                \
                                       FREE_KEY(ht, (bck)->key); } while ( 0 )
#else
#define IS_BCK_DELETED(bck)       0
#define SET_BCK_DELETED(ht, bck)  \
   do { fprintf(stderr, "Deletion not supported for insert-only hashtable\n");\
        exit(2); } while ( 0 )
#endif

   /* We need the following only for dense buckets (Dense##x above).  *
    * If you need to, set this to a value you'll never use for data.  */
#define EMPTY -3UL                /* steal more of the bck->data space */


   /* This is what an item is.  Either can be cast to a pointer. */
typedef struct {
   ulong data;        /* 4 bytes for data: either a pointer or an integer */
   ulong key;         /* 4 bytes for the key: either a pointer or an int */
} HTItem;

struct Table(Bin);                            /* defined in chash.c, I hope */
struct Table(Iterator);
typedef struct Table(Bin)       Table;        /* Expands to SparseBin, etc */
typedef struct Table(Iterator)  TableIterator;

   /* for STORES_PTR to work ok, cchKey MUST BE DEFINED 1st, cItems 2nd! */
typedef struct HashTable {
   ulong cchKey;        /* the length of the key, or if it's \0 terminated */
   ulong cItems;        /* number of items currently in the hashtable */
   ulong cDeletedItems; /* # of buckets holding DELETE in the hashtable */
   ulong cBuckets;      /* size of the table */
   Table *table;        /* The actual contents of the hashtable */
   int fSaveKeys;       /* 1 if we copy keys locally; 2 if keys in one block */
   int cDeltaGoalSize;  /* # of coming inserts (or deletes, if <0) we expect */
   HTItem *posLastFind; /* position of last Find() command */
   TableIterator *iter; /* used in First/NextBucket */

   FILE *fpData;        /* if non-NULL, what item->data points into */
   char * (*dataRead)(FILE *, int);   /* how to load data from disk */
   HTItem bckData;      /* holds data after being loaded from disk */
} HashTable;

   /* Small keys are stored and passed directly, but large keys are
    * stored and passed as pointers.  To make it easier to remember
    * what to pass, we provide two functions:
    *   PTR_KEY: give it a pointer to your data, and it returns
    *            something appropriate to send to Hash() functions or
    *            be stored in a data field.
    *   KEY_PTR: give it something returned by a Hash() routine, and
    *            it returns a (char *) pointer to the actual data.
    */
#define HashKeySize(ht)   ( ((ulong *)(ht))[0] )  /* this is how we inline */
#define HashSize(ht)      ( ((ulong *)(ht))[1] )  /* ...a la C++ :-) */

#define STORES_PTR(ht)    ( HashKeySize(ht) == 0 || \
			    HashKeySize(ht) > sizeof(ulong) )
#define KEY_PTR(ht, key)  ( STORES_PTR(ht) ? (char *)(key) : (char *)&(key) )
#ifdef DONT_HAVE_TO_WORRY_ABOUT_BUS_ERRORS
#define PTR_KEY(ht, ptr)  ( STORES_PTR(ht) ? (ulong)(ptr) : *(ulong *)(ptr) )
#else
#define PTR_KEY(ht, ptr)  ( STORES_PTR(ht) ? (ulong)(ptr) : HTcopy((char *)ptr))
#endif


   /* Function prototypes */
unsigned long HTcopy(char *pul);         /* for PTR_KEY, not for users */

struct HashTable *AllocateHashTable(int cchKey, int fSaveKeys);
void ClearHashTable(struct HashTable *ht);
void FreeHashTable(struct HashTable *ht);

HTItem *HashFind(struct HashTable *ht, ulong key);
HTItem *HashFindLast(struct HashTable *ht);
HTItem *HashFindOrInsert(struct HashTable *ht, ulong key, ulong dataInsert);
HTItem *HashFindOrInsertItem(struct HashTable *ht, HTItem *pItem);

HTItem *HashInsert(struct HashTable *ht, ulong key, ulong data);
HTItem *HashInsertItem(struct HashTable *ht, HTItem *pItem);

int HashDelete(struct HashTable *ht, ulong key);
int HashDeleteLast(struct HashTable *ht);

HTItem *HashFirstBucket(struct HashTable *ht);
HTItem *HashNextBucket(struct HashTable *ht);

int HashSetDeltaGoalSize(struct HashTable *ht, int delta);

void HashSave(FILE *fp, struct HashTable *ht, int (*write)(FILE *, char *));
struct HashTable *HashLoad(FILE *fp, char * (*read)(FILE *, int));
struct HashTable *HashLoadKeys(FILE *fp, char * (*read)(FILE *, int));