File: example_custom_hash.cpp

package info (click to toggle)
bbhash 1.0.0-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 184 kB
  • sloc: cpp: 1,813; makefile: 56; sh: 10
file content (120 lines) | stat: -rw-r--r-- 3,285 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include "BooPHF.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>

#include <algorithm>
#include <random>

using namespace std;



//example with user provided custom hasher for uint64_t type :

class Custom_uint64_Hasher
{
public:
	// BBhash requires a call operator with exactly this signature.
	// BBhash uses the 'seed' parameter to generate two different hash values from the same key,
	// then internally generates a sequence of hash values using xorshifts, with these two
	// first hash values as the starting point.
	//
	// The key is scrambled by alternating "xor with itself shifted right by 33" steps with
	// two 64-bit multiplications; the seed is xor-ed into the scrambled value at the very end.
	uint64_t operator ()   (uint64_t key, uint64_t seed=0) const
	{
		const uint64_t firstMultiplier  = 0xff51afd7ed558ccd;
		const uint64_t secondMultiplier = 0xc4ceb9fe1a85ec53;

		uint64_t mixed = key;
		mixed = (mixed ^ (mixed >> 33)) * firstMultiplier;
		mixed = (mixed ^ (mixed >> 33)) * secondMultiplier;
		mixed ^= mixed >> 33;

		return mixed ^ seed;
	}
};


// Tell BBhash to use the custom hasher by passing it as the second template argument:
// keys are u_int64_t and are hashed with Custom_uint64_Hasher.
using boophf_t = boomphf::mphf<u_int64_t, Custom_uint64_Hasher>;


// Example driver: generates a set of unique 64-bit keys, builds a BooPHF minimal perfect
// hash function over the first <nelem> of them using the custom hasher, reports build time
// and size, and performs one example lookup.
//
// Usage: <program> <nelem> <nthreads>
//   nelem    : number of keys handed to the mphf (unsigned; base auto-detected, e.g. 0x.. for hex)
//   nthreads : thread count passed to the mphf constructor (decimal, must be >= 1)
//
// Returns EXIT_SUCCESS on success; EXIT_FAILURE on wrong/invalid arguments, allocation
// failure, or when fewer than <nelem> unique keys are available after de-duplication.
int main (int argc, char* argv[]){

	if(argc !=3 ){
		printf("Usage :\n");
		printf("%s <nelem> <nthreads> \n",argv[0]);
		return EXIT_FAILURE;
	}

	//PARAMETERS
	// strtoull silently negates values written with a leading '-', so reject those explicitly,
	// along with empty input and trailing garbage.
	char* parseEnd = nullptr;
	const u_int64_t nelem = strtoull(argv[1], &parseEnd, 0);
	if(parseEnd == argv[1] || *parseEnd != '\0' || strchr(argv[1], '-') != nullptr){
		fprintf(stderr, "invalid <nelem> : %s\n", argv[1]);
		return EXIT_FAILURE;
	}

	parseEnd = nullptr;
	const long requestedThreads = strtol(argv[2], &parseEnd, 10);
	const uint nthreads = static_cast<uint>(requestedThreads);
	// the round-trip comparison rejects values that do not fit into 'uint'
	if(parseEnd == argv[2] || *parseEnd != '\0' || requestedThreads < 1
	   || static_cast<long>(nthreads) != requestedThreads){
		fprintf(stderr, "invalid <nthreads> : %s\n", argv[2]);
		return EXIT_FAILURE;
	}

	/////  generation of random keys
	// 'rab' extra keys are generated so that nelem keys are still left if duplicates are removed
	const u_int64_t rab = 100;
	const u_int64_t total = nelem + rab;
	const size_t slots = static_cast<size_t>(total);
	if(total < nelem || slots != total){ // unsigned wrap-around, or does not fit into size_t
		fprintf(stderr, "<nelem> is too large : %s\n", argv[1]);
		return EXIT_FAILURE;
	}

	std::mt19937_64 rng(std::mt19937_64::default_seed); //default seed, so every run uses the same keys

	u_int64_t* data = static_cast<u_int64_t*>(calloc(slots, sizeof(u_int64_t)));
	if(data == nullptr){
		fprintf(stderr, "could not allocate memory for %llu keys\n", static_cast<unsigned long long>(total));
		return EXIT_FAILURE;
	}

	// slot 0 is not overwritten and keeps the zero written by calloc, so key 0 is always part of the set
	for (u_int64_t i = 1; i < total; i++){
		data[i] = rng();
	}
	printf("de-duplicating items \n");

	// sort, then compact runs of equal keys to the front of the array
	std::sort(data, data + total);
	const u_int64_t uniqueCount = static_cast<u_int64_t>(std::unique(data, data + total) - data);
	printf("found %llu duplicated items  \n", static_cast<unsigned long long>(total - uniqueCount));

	// the mphf is built over data[0 .. nelem), which only holds distinct keys if enough unique ones remain
	if(uniqueCount < nelem){
		fprintf(stderr, "only %llu unique keys available, %llu required\n",
		        static_cast<unsigned long long>(uniqueCount), static_cast<unsigned long long>(nelem));
		free(data);
		return EXIT_FAILURE;
	}

	//////////////////
	// at this point, the first nelem entries of data are random unique keys

	// wall-clock time in seconds, with microsecond resolution
	auto wallClockSeconds = [](){
		struct timeval now;
		gettimeofday(&now, nullptr);
		return now.tv_sec + (now.tv_usec/1000000.0);
	};

	printf("Construct a BooPHF with  %llu elements  \n", static_cast<unsigned long long>(nelem));

	const double t_begin = wallClockSeconds();

	// mphf takes as input a c++ range. A simple array of keys can be wrapped with boomphf::range
	// but could be from a user defined iterator (enabling keys to be read from a file or from some complex non-contiguous structure)
	const u_int64_t* keysBegin = data;
	auto data_iterator = boomphf::range(keysBegin, keysBegin + nelem);

	double gammaFactor = 2.0; // lowest bit/elem is achieved with gamma=1, higher values lead to larger mphf but faster construction/query
	// gamma = 2 is a good tradeoff (leads to approx 3.7 bits/key )

	//build the mphf (automatic storage: released when main returns, no delete needed)
	boophf_t bphf(nelem, data_iterator, nthreads, gammaFactor);

	const double elapsed = wallClockSeconds() - t_begin;

	printf("BooPHF constructed perfect hash for %llu keys in %.2fs\n", static_cast<unsigned long long>(nelem), elapsed);
	printf("boophf  bits/elem : %f\n", static_cast<double>(bphf.totalBitSize())/nelem);

	//query mphf like this
	const uint64_t idx = bphf.lookup(data[0]);
	printf(" example query  %llu ----->  %llu \n", static_cast<unsigned long long>(data[0]), static_cast<unsigned long long>(idx));

	free(data);
	return EXIT_SUCCESS;
}