File: HashTable.h

package info (click to toggle)
snap-aligner 1.0.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,988 kB
  • sloc: cpp: 36,500; ansic: 5,239; python: 227; makefile: 85; sh: 28
file content (219 lines) | stat: -rw-r--r-- 7,362 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*++

Module Name:

	HashTable.h

Abstract:

    Headers for large closed hash table that used to be general, but now just handles
    seeds for SNAP.

Authors:

    Bill Bolosky, March, 2011

Environment:


--*/

#pragma once

#include "Compat.h"
#include "GenericFile_Blob.h"
#include "Genome.h"


class SNAPHashTable {
    public:

        typedef _uint64 ValueType;  // Values can be smaller than this, but they're expanded in the interface
        typedef _uint64 KeyType;    // Likewise for keys.

        SNAPHashTable(
            _int64		i_tableSize,
            unsigned    i_keySizeInBytes,
            unsigned    i_valueSizeInBytes,
            unsigned    i_valueCount,
            _uint64		i_invalidValueValue);

        //
        // Load from file.
        //
        static SNAPHashTable *loadFromBlob(GenericFile_Blob *loadFile);
		static SNAPHashTable *loadFromGenericFile(GenericFile *loadFile);

        ~SNAPHashTable();

        bool saveToFile(const char *saveFileName, size_t *bytesWritten);

        bool saveToFile(FILE *saveFile, size_t *bytesWritten);

        //
        // Fails if either the table is full or key already exists.  Inserts ALL
        // values for a key.
        //
        bool Insert(KeyType key, ValueType *data);

        size_t GetUsedElementCount() const {return usedElementCount;}
        size_t GetTableSize() const {return tableSize;}

        unsigned GetKeySizeInBytes() const {return keySizeInBytes;}
        unsigned GetValueSizeInBytes() const {return valueSizeInBytes;}
        unsigned GetValueCount() const {return valueCount;}

		void *getEntryValues(_uint64 whichEntry) 
		{
			_ASSERT(whichEntry < GetTableSize());
			return getEntry(whichEntry);
		}

        static inline _uint64 hash(_uint64 key) {
            //
            // Hash the key.  Use the hash finalizer from the 64 bit MurmurHash3, http://code.google.com/p/smhasher/wiki/MurmurHash3,
            // which is public domain code.
            //
           
            key ^= key >> 33;
            key *= 0xff51afd7ed558ccd;
            key ^= key >> 33;
            key *= 0xc4ceb9fe1a85ec53;
            key ^= key >> 33;
            
            return key;
        }

        inline ValueType *GetFirstValueForKey(KeyType key) const {
            _ASSERT(keySizeInBytes == 8 || (key & ~((((_uint64)1) << (keySizeInBytes * 8)) - 1)) == 0);    // High bits of the key aren't set.
            _uint64 tableIndex = hash(key) % tableSize;
            void *entry = getEntry(tableIndex);
            if (isKeyEqual(entry, key) && !doesEntryHaveInvalidValue(entry)) {
                return (ValueType *)entry;
            } else {
                unsigned nProbes = 0;
                void* entry;
                do {
                    nProbes++;
                    if (nProbes > tableSize + QUADRATIC_CHAINING_DEPTH) {
                        return NULL;
                    }
                    if (nProbes < QUADRATIC_CHAINING_DEPTH) {
                        tableIndex = (tableIndex + nProbes * nProbes) % tableSize;
                    } else {
                        tableIndex = (tableIndex + 1) % tableSize;
                    }
                    entry = getEntry(tableIndex);
                } while (!isKeyEqual(entry, key) && !doesEntryHaveInvalidValue(entry));

                extern _int64 nProbesInGetEntryForKey;
                nProbesInGetEntryForKey += nProbes;

                if (doesEntryHaveInvalidValue(entry)) {
                    return NULL;
                } else {
                    return (ValueType *)entry;
                }
            }
        }


        inline bool Lookup(KeyType key, unsigned nValuesToFill, ValueType *values) const {
            _ASSERT(nValuesToFill <= valueCount);
            char *entry = (char *)GetFirstValueForKey(key);
            if (NULL == entry) {
                return false;
            }

            for (unsigned i = 0; i < nValuesToFill; i++) {
                values[i] = 0;
                memcpy(values + i, entry + i * valueSizeInBytes, valueSizeInBytes);
            }

            return true;
        }
        //
        // A version of Lookup that works properly when the table is (nearly) full and the key being looked up isn't
        // there.  It's, as you might imagine, slower than Lookup.
        //
        ValueType *SlowLookup(KeyType key);

private:

        SNAPHashTable() {}

        static const unsigned QUADRATIC_CHAINING_DEPTH = 5; // Chain quadratically for this long, then linerarly  Set to 0 for linear chaining
		static SNAPHashTable *loadCommon(GenericFile *loadFile);

        //
        // A hash table entry consists of a set of valueCount values, each of valueSizeInBytes bytes, followed by
        // a key of keySizeInBytes bytes.  The key size must be between 4 and 8 bytes, inclusive.
        //
        // Because the size of the fields and the count of values is variable, we can't use a struct and are stuck
        // with ugly memory manipulation to access the hash table entries.
        //
        // The format is 1 or 2 (valueCount) values of size valueSize, followed by keySize bytes of key.
        //
#if 0
        struct Entry {
            unsigned        value1;
            unsigned        value2;
            unsigned char   key[1]; // Actual size of key determined by keySizeInBytes
        };
#endif // 0

        // Free Entries have the first valueSizeInByes bytes of value == invalidValueValue.  The following methods
        // understand the format and try to make it less opaque to use them.
        
        inline void *getEntry(_uint64 whichEntry) const {
            return ((char *)Table + elementSize * whichEntry);
        }

        inline bool doesEntryHaveInvalidValue(void *entry) const
        {
            return !memcmp(entry, &invalidValueValue, valueSizeInBytes);
        }

        inline ValueType getValueFromEntry(void *entry, unsigned whichValue) const
        {
            _ASSERT(whichValue < valueCount);
            ValueType value = 0;    // Need =0 because valueSizeInBytes might be < sizeeof(ValueType)
            memcpy(&value, (char *)entry + whichValue * valueSizeInBytes, valueSizeInBytes);    // Assumes little-endian
            return value;
        }

        inline bool isKeyEqual(const void *entry, KeyType key) const 
        {
            return !memcmp((const char *)entry + valueSizeInBytes * valueCount, &key, keySizeInBytes);
        }

        inline void clearKey(void *entry)
        {
            memset((char *)entry + valueSizeInBytes * valueCount, 0 , keySizeInBytes);
        }

        inline void setKey(void *entry, KeyType key)
        {
            memcpy((char *)entry + valueSizeInBytes * valueCount, &key, keySizeInBytes);
        }
 
        void *Table;
        size_t tableSize;
        unsigned keySizeInBytes;
        unsigned elementSize;
        size_t usedElementCount;
        bool ownsMemoryForTable;
        unsigned valueSizeInBytes;
        unsigned valueCount;
        ValueType invalidValueValue;
 
        //
        // Returns either the entry for this key, or else the entry where the key would be
        // inserted if it's not in the table.
        //
        void* getEntryForKey(__in KeyType key) const;

        friend class SeedCountIterator;

        static const unsigned magic;
};