File: sse-42.c

package info (click to toggle)
haskell-hashtables 1.4.2-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 420 kB
  • sloc: haskell: 4,662; ansic: 590; makefile: 14; sh: 4
file content (172 lines) | stat: -rw-r--r-- 5,061 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#include "defs.h"

#include <smmintrin.h>
#include <stdio.h>

/* Straight-line branchless SSE 4.2 code for searching an array of uint16_t
   hash codes. */

static inline int32_t mask(int32_t a, int32_t b) { return -(a == b); }

#if defined(__GNUC__)
static inline int32_t first_bit_set(int32_t a) {
    return __builtin_ffs(a) - 1;
}
#else
static uint8_t de_bruijn_table[] = {
    0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
    31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
};

static inline int32_t first_bit_set(int32_t a) {
    int32_t zero_case = mask(0, a);
    uint32_t x = (uint32_t) (a & -a);
    x *= 0x077CB531;
    x >>= 27;
    return zero_case | de_bruijn_table[x];
}
#endif

static inline __m128i fill(small_hash_t v) {
    int32_t v1 = (((int)v) << 16) | v;
    __m128i x = _mm_cvtsi32_si128(0);
    x = _mm_insert_epi32(x, v1, 0);
    return _mm_shuffle_epi32(x, _MM_SHUFFLE(0,0,0,0));
}

#ifndef SIDD_UWORD_OPS
#define SIDD_UWORD_OPS _SIDD_UWORD_OPS
#endif

#ifndef SIDD_CMP_EQUAL_EACH
#define SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH
#endif

#ifndef SIDD_BIT_MASK
#define SIDD_BIT_MASK _SIDD_BIT_MASK
#endif

#define _MODE (SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH)

static inline __m128i cmp_mask(__m128i a, __m128i b) {
    const int mode = SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH | SIDD_BIT_MASK;
    return _mm_cmpistrm(a, b, mode);
}

static inline int32_t line_result(uint32_t m, int start) {
    int32_t p  = first_bit_set((int32_t) m);
    int32_t mm = mask(p, -1);
    return mm | (start + p);
}

#define DUMP(xval) do {                                       \
    uint16_t xval##_x0 = _mm_extract_epi16(xval, 0);          \
    uint16_t xval##_x1 = _mm_extract_epi16(xval, 1);          \
    uint16_t xval##_x2 = _mm_extract_epi16(xval, 2);          \
    uint16_t xval##_x3 = _mm_extract_epi16(xval, 3);          \
    uint16_t xval##_x4 = _mm_extract_epi16(xval, 4);          \
    uint16_t xval##_x5 = _mm_extract_epi16(xval, 5);          \
    uint16_t xval##_x6 = _mm_extract_epi16(xval, 6);          \
    uint16_t xval##_x7 = _mm_extract_epi16(xval, 7);          \
    printf("  % 10s: %04x-%04x-%04x-%04x-%04x-%04x-%04x-%04x\n", \
           #xval, xval##_x0, xval##_x1, xval##_x2, xval##_x3, \
           xval##_x4, xval##_x5, xval##_x6, xval##_x7);       \
  } while(0);


int line_search(small_hash_t* array, int start0, small_hash_t v1) {
    int offset = start0 & 31;
    int start  = start0 & ~31;
    __m128i* p = (__m128i*) &array[start];
    __m128i x1, val1, val2, val3, val4;
    __m128i m1, m2, m3, m4, dmask;

    x1 = fill(v1);

    val1 = *p++;
    m1 = cmp_mask(x1, val1);
    val2 = *p++;
    m2 = _mm_slli_si128(cmp_mask(x1, val2), 1);
    val3 = *p++;
    m3 = _mm_slli_si128(cmp_mask(x1, val3), 2);
    val4 = *p;
    m4 = _mm_slli_si128(cmp_mask(x1, val4), 3);

    dmask = _mm_or_si128(_mm_or_si128(m1, m2),
                         _mm_or_si128(m3, m4));
    uint32_t imask = _mm_extract_epi32(dmask, 0);

    const uint32_t p2 = 1 << offset;
    const uint32_t dest_mask = imask & ~(p2 - 1);

    return line_result(dest_mask, start);
}

int line_search_2(small_hash_t* array, int start0, small_hash_t v1,
                  small_hash_t v2) {
    int offset = start0 & 31;
    int start  = start0 & ~31;
    __m128i* p = (__m128i*) &array[start];
    __m128i x1, x2, val1, val2, val3, val4;
    __m128i m1, m2, m3, m4, dmask;

    x1 = fill(v1);
    x2 = fill(v2);

#define M(v) _mm_or_si128(cmp_mask(x1,(v)), \
                          cmp_mask(x2,(v)))
    val1 = *p++;
    m1 = M(val1);
    val2 = *p++;
    m2 = _mm_slli_si128(M(val2), 1);
    val3 = *p++;
    m3 = _mm_slli_si128(M(val3), 2);
    val4 = *p;
    m4 = _mm_slli_si128(M(val4), 3);
#undef M

    dmask = _mm_or_si128(_mm_or_si128(m1, m2),
                         _mm_or_si128(m3, m4));
    uint32_t imask = _mm_extract_epi32(dmask, 0);

    const uint32_t p2 = 1 << offset;
    const uint32_t dest_mask = imask & ~(p2 - 1);

    return line_result(dest_mask, start);
}

int line_search_3(small_hash_t* array, int start0, small_hash_t v1,
                  small_hash_t v2, small_hash_t v3) {
    int offset = start0 & 31;
    int start  = start0 & ~31;
    __m128i* p = (__m128i*) &array[start];
    __m128i x1, x2, x3, val1, val2, val3, val4;
    __m128i m1, m2, m3, m4, dmask;

    x1 = fill(v1);
    x2 = fill(v2);
    x3 = fill(v3);

#define M(v) _mm_or_si128(                  \
        cmp_mask(x1,(v)),                   \
        _mm_or_si128(cmp_mask(x2,(v)),      \
                     cmp_mask(x3,(v))))
    val1 = *p++;
    m1 = M(val1);
    val2 = *p++;
    m2 = _mm_slli_si128(M(val2), 1);
    val3 = *p++;
    m3 = _mm_slli_si128(M(val3), 2);
    val4 = *p;
    m4 = _mm_slli_si128(M(val4), 3);
#undef M

    dmask = _mm_or_si128(_mm_or_si128(m1, m2),
                         _mm_or_si128(m3, m4));
    uint32_t imask = _mm_extract_epi32(dmask, 0);

    const uint32_t p2 = 1 << offset;
    const uint32_t dest_mask = imask & ~(p2 - 1);

    return line_result(dest_mask, start);
}