File: chunkset_avx512.c

/* chunkset_avx512.c -- AVX512 inline functions to copy small data chunks.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "zbuild.h"
#include "zmemory.h"

#ifdef X86_AVX512

#include "avx2_tables.h"
#include <immintrin.h>
#include "x86_intrins.h"

typedef __m256i chunk_t;
typedef __m128i halfchunk_t;
typedef __mmask32 mask_t;
typedef __mmask16 halfmask_t;

#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define HAVE_CHUNKMEMSET_16
#define HAVE_CHUNK_MAG
#define HAVE_HALF_CHUNK
#define HAVE_MASKED_READWRITE
#define HAVE_CHUNKCOPY
#define HAVE_HALFCHUNKCOPY

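/* _bzhi_u32(x, len) keeps the low `len` bits of x and zeroes the rest, so these helpers
 * build a mask with `len` bits set (e.g. gen_mask(5) == 0x1F), one bit per byte lane,
 * for the masked loads and stores used throughout this file. */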
static inline halfmask_t gen_half_mask(unsigned len) {
    return (halfmask_t)_bzhi_u32(0xFFFF, len);
}

static inline mask_t gen_mask(unsigned len) {
    return (mask_t)_bzhi_u32(0xFFFFFFFF, len);
}

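/* The chunkmemset_N helpers broadcast a 2, 4, 8 or 16 byte value read from `from` across
 * the whole 32-byte chunk, covering match distances equal to that element size. */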
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm256_set1_epi16(zng_memread_2(from));
}

static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm256_set1_epi32(zng_memread_4(from));
}

static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
    *chunk = _mm256_set1_epi64x(zng_memread_8(from));
}

static inline void chunkmemset_16(uint8_t *from, chunk_t *chunk) {
    /* Unfortunately there seems to be a compiler bug in Visual Studio 2015 where the load
     * for this memory-to-register broadcast is dumped to the stack with an aligned move.
     * The vbroadcasti128 instruction is 2 cycles cheaper, and the dump to the stack goes
     * away when compiling with optimizations, but for the sake of working properly in a
     * debugger, let's take the 2 cycle penalty */
#if defined(_MSC_VER) && _MSC_VER <= 1900
    halfchunk_t half = _mm_loadu_si128((__m128i*)from);
    *chunk = _mm256_inserti128_si256(_mm256_castsi128_si256(half), half, 1);
#else
    *chunk = _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)from));
#endif
}

static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    *chunk = _mm256_loadu_si256((__m256i *)s);
}

static inline void storechunk(uint8_t *out, chunk_t *chunk) {
    _mm256_storeu_si256((__m256i *)out, *chunk);
}

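/* Copy `len` bytes in 32-byte chunks. Lengths shorter than one chunk are done with a
 * single masked load/store, so nothing is read or written past `len`. Otherwise a full
 * chunk is stored up front (safe because at least one chunk remains) while the pointers
 * advance only by the leading remainder, and the rest is copied whole chunk by whole
 * chunk. */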
static inline uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "chunkcopy should never have a length 0");

    chunk_t chunk;
    uint32_t rem = len % sizeof(chunk_t);

    if (len < sizeof(chunk_t)) {
        mask_t rem_mask = gen_mask(rem);
        chunk = _mm256_maskz_loadu_epi8(rem_mask, from);
        _mm256_mask_storeu_epi8(out, rem_mask, chunk);
        return out + rem;
    }

    loadchunk(from, &chunk);
    rem = (rem == 0) ? sizeof(chunk_t) : rem;
    storechunk(out, &chunk);
    out += rem;
    from += rem;
    len -= rem;

    while (len > 0) {
        loadchunk(from, &chunk);
        storechunk(out, &chunk);
        out += sizeof(chunk_t);
        from += sizeof(chunk_t);
        len -= sizeof(chunk_t);
    }

    return out;
}

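/* Build a 32-byte chunk holding the repeating pattern for a match distance of `dist`
 * bytes (the permutation table is indexed by dist - 3). Masked loads pull in exactly
 * `dist` bytes of history so nothing is read out of bounds, the shuffle replicates them
 * across the vector, and *chunk_rem receives the table's remainder value so the caller
 * can keep the pattern aligned across chunks. */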
static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m256i ret_vec;
    *chunk_rem = lut_rem.remval;

    /* See the AVX2 implementation for more detailed comments. This is the same approach,
     * plus some masked loads to avoid an out-of-bounds read on the heap */

    if (dist < 16) {
        const __m256i permute_xform =
            _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                             16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
        __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
        halfmask_t load_mask = gen_half_mask(dist);
        __m128i ret_vec0 = _mm_maskz_loadu_epi8(load_mask, buf);
        perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
        ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
    } else {
        halfmask_t load_mask = gen_half_mask(dist - 16);
        __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
        __m128i ret_vec1 = _mm_maskz_loadu_epi8(load_mask, (__m128i*)(buf + 16));
        __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
        halfmask_t xlane_mask = _mm_cmp_epi8_mask(perm_vec1, _mm_set1_epi8(15), _MM_CMPINT_LE);
        __m128i latter_half = _mm_mask_shuffle_epi8(ret_vec1, xlane_mask, ret_vec0, perm_vec1);
        ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
    }

    return ret_vec;
}

static inline void storehalfchunk(uint8_t *out, halfchunk_t *chunk) {
    _mm_storeu_si128((__m128i *)out, *chunk);
}

static inline chunk_t halfchunk2whole(halfchunk_t *chunk) {
    /* We zero extend mostly to appease some memory sanitizers. These bytes are ultimately
     * unlikely to actually be written or read */
    return _mm256_zextsi128_si256(*chunk);
}

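/* 16-byte counterpart of GET_CHUNK_MAG: masked-load `dist` bytes of history and shuffle
 * them into a repeating pattern filling one half chunk, with *chunk_rem taken from
 * half_rem_vals. */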
static inline halfchunk_t GET_HALFCHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
    lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
    __m128i perm_vec, ret_vec;
    halfmask_t load_mask = gen_half_mask(dist);
    ret_vec = _mm_maskz_loadu_epi8(load_mask, buf);
    *chunk_rem = half_rem_vals[dist - 3];

    perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
    ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);

    return ret_vec;
}

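/* Copy only the leading partial half chunk: `len` is reduced modulo 16 bytes (a full 16
 * when it divides evenly) and exactly that many bytes are moved with one masked
 * load/store before returning the advanced output pointer. */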
static inline uint8_t* HALFCHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) {
    Assert(len > 0, "halfchunkcopy should never have a length 0");
    halfchunk_t chunk;

    uint32_t rem = len % sizeof(halfchunk_t);
    if (rem == 0) {
        rem = sizeof(halfchunk_t);
    }

    halfmask_t rem_mask = gen_half_mask(rem);
    chunk = _mm_maskz_loadu_epi8(rem_mask, from);
    _mm_mask_storeu_epi8(out, rem_mask, chunk);

    return out + rem;
}

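/* Names under which the shared templates included below instantiate the generic chunk
 * copy routines and the inflate fast path for this AVX512 build. */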
#define CHUNKSIZE        chunksize_avx512
#define CHUNKUNROLL      chunkunroll_avx512
#define CHUNKMEMSET      chunkmemset_avx512
#define CHUNKMEMSET_SAFE chunkmemset_safe_avx512

#include "chunkset_tpl.h"

#define INFLATE_FAST     inflate_fast_avx512

#include "inffast_tpl.h"

#endif