File: siphash-sse41.c

Package: haskell-hashable 1.2.1.0-5
/*
 * The original code was developed by Samuel Neves, and has been
 * only lightly modified.
 *
 * Used with permission.
 */
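/*
 * SipHash-2-4 (as the function name indicates) implemented with SSE4.1
 * intrinsics: the four 64-bit state words live in two 128-bit registers,
 * and the round counts come from SIPHASH_ROUNDS / SIPHASH_FINALROUNDS in
 * siphash.h.
 */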
#pragma GCC target("sse4.1")

#include <smmintrin.h>
#include "siphash.h"

// Specialized for siphash, do not reuse
#define rotate16(x) _mm_shufflehi_epi16((x), _MM_SHUFFLE(2,1,0,3))
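// rotate16 rotates only the upper 64-bit lane left by 16 (by shuffling its
// 16-bit words); that is enough because HALF_ROUND's blend keeps only the
// upper 64 bits of the 16-bit rotation.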

#define _mm_roti_epi64(x, c) (((c) == 16) ? rotate16((x)) : _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c))))
//#define _mm_roti_epi64(x, c)  _mm_xor_si128(_mm_slli_epi64((x), (c)), _mm_srli_epi64((x), 64-(c)))


u64 hashable_siphash24_sse41(u64 _k0, u64 _k1, const unsigned char *m, size_t n)
{
	__m128i v0, v1, v02, v13;
	__m128i k0;
	__m128i mi, mask, len, h;
	const __m128i zero = _mm_setzero_si128();
	size_t i, k;
	union { u64 gpr; __m128i xmm; } hash;
	unsigned char key[16];

	((u64 *)key)[0] = _k0;
	((u64 *)key)[1] = _k1;

	k0 = _mm_loadu_si128((__m128i*)(key + 0));

	v0 = _mm_xor_si128(k0, _mm_set_epi32(0x646f7261, 0x6e646f6d, 0x736f6d65, 0x70736575));
	v1 = _mm_xor_si128(k0, _mm_set_epi32(0x74656462, 0x79746573, 0x6c796765, 0x6e657261));

	v02 = _mm_unpacklo_epi64(v0, v1);
	v13 = _mm_unpackhi_epi64(v0, v1);
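
	/* v02 now holds SipHash state words (v0, v2) in its (low, high) 64-bit
	   lanes and v13 holds (v1, v3).  Each COMPRESS below is one SipHash
	   round; the epi32 shuffle rotates v0 left by 32 bits and swaps it with
	   v2 so that the second HALF_ROUND pairs the words the way the scalar
	   algorithm does. */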

#define HALF_ROUND(a,b,s,t) \
do \
{ \
	__m128i b1,b2; \
	a = _mm_add_epi64(a, b);  \
	b1 = _mm_roti_epi64(b, s); b2 = _mm_roti_epi64(b, t); b = _mm_blend_epi16(b1, b2, 0xF0); \
	b = _mm_xor_si128(b, a);  \
} while(0)

#define COMPRESS(v02,v13) \
	do \
	{ \
		HALF_ROUND(v02,v13,13,16); \
		v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
		HALF_ROUND(v02,v13,17,21); \
		v02 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \
	} while(0)

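	/* Absorb the input eight bytes at a time: each word is xored into v3
	   (high lane of v13) before the rounds and into v0 (low lane of v02)
	   after them. */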
	for(i = 0; i < (n-n%8); i += 8)
	{
		mi = _mm_loadl_epi64((__m128i*)(m + i));
		v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
		for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
		v02 = _mm_xor_si128(v02, mi);
	}

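	/* Tail: keep only the remaining n%8 message bytes of the final load and
	   put the low byte of the length into the most significant byte, as the
	   SipHash padding rule requires. */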
	mi = _mm_loadl_epi64((__m128i*)(m + i));
	len = _mm_set_epi32(0, 0, (n&0xff) << 24, 0);
	mask = _mm_srli_epi64(_mm_set_epi32(0, 0, 0xffffffff, 0xffffffff), 8*(8-n%8));
	mi = _mm_xor_si128(_mm_and_si128(mi, mask), len);

	v13 = _mm_xor_si128(v13, _mm_unpacklo_epi64(zero, mi));
	for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v02,v13);
	v02 = _mm_xor_si128(v02, mi);

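	/* Finalization: xor 0xff into v2, run the final rounds, then fold
	   v0^v1^v2^v3 into the low 64 bits of the register. */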
	v02 = _mm_xor_si128(v02, _mm_set_epi32(0, 0xff, 0, 0));
	for(k = 0; k < SIPHASH_FINALROUNDS; ++k) COMPRESS(v02,v13);

	v0 = _mm_xor_si128(v02, v13);
	v0 = _mm_xor_si128(v0, _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(zero), _mm_castsi128_ps(v0))));
	hash.xmm = v0;

#undef COMPRESS
#undef HALF_ROUND
	//return _mm_extract_epi32(v0, 0) | (((u64)_mm_extract_epi32(v0, 1)) << 32);
	return hash.gpr;
}
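
/*
 * Example use (illustrative sketch, not part of the package): this assumes
 * that siphash.h declares hashable_siphash24_sse41 and the u64 type, and
 * that the translation unit is compiled for a CPU with SSE4.1.  The key
 * values below are arbitrary demo constants.
 *
 *   #include <string.h>
 *   #include "siphash.h"
 *
 *   u64 hash_message(const char *msg)
 *   {
 *       u64 k0 = 0x0706050403020100ULL;   // arbitrary demo key half
 *       u64 k1 = 0x0f0e0d0c0b0a0908ULL;   // arbitrary demo key half
 *       return hashable_siphash24_sse41(k0, k1,
 *                                       (const unsigned char *)msg,
 *                                       strlen(msg));
 *   }
 */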