/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#include <aws/checksums/private/crc_priv.h>
#include <aws/common/assert.h>
#include <aws/common/cpuid.h>
#include <aws/common/macros.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#if defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64)
# include <wmmintrin.h>
AWS_ALIGNED_TYPEDEF(const uint64_t, aligned_512_u64[8], 64);
// This macro uses casting to ensure the compiler actually uses the unaligned load instructions
# define load_zmm(ptr) _mm512_loadu_si512((const uint8_t *)(const void *)(ptr))
/*
* s_checksums_crc32c_avx512_impl(): compute the crc32c of the buffer. The
* buffer length must be at least 256 bytes; only the largest multiple-of-64
* prefix is consumed, and the caller handles any remaining tail. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://download.intel.com/design/intarch/papers/323102.pdf
*/
static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length, uint32_t previous_crc) {
AWS_ASSERT(
length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc.");
uint32_t crc = previous_crc;
/*
* Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6
* are similar to those given at the end of the paper
*
* k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1
* k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1
* k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1
* k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1
* k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1
* k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1
*/
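/*
* Note: P(x) here is the CRC32C (Castagnoli) polynomial 0x11EDC6F41. Each
* constant pair below is broadcast into all four 128-bit lanes of a zmm
* register so every lane can be folded independently with a single
* vpclmulqdq.
*/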
static aligned_512_u64 k1k2 = {
0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
static aligned_512_u64 k3k4 = {
0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8};
static aligned_512_u64 k9k10 = {
0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092};
static aligned_512_u64 k1k4 = {
0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, 0x493c7d27, 0x00000000, 0x00000000};
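/* Unlike the broadcast tables above, k1k4 holds three distinct constant
* pairs plus a zero pair: one pair per 128-bit lane, used by the final
* 512-bit to 128-bit fold further down. */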
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a1;
/*
* There's at least one block of 256.
*/
x1 = load_zmm(input + 0x00);
x2 = load_zmm(input + 0x40);
x3 = load_zmm(input + 0x80);
x4 = load_zmm(input + 0xC0);
// Load the crc into a zmm register and XOR with the first 64 bytes of input
x5 = _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi32_si128((int)crc), 0);
x1 = _mm512_xor_si512(x1, x5);
x0 = load_zmm(k1k2);
input += 256;
length -= 256;
/*
* Parallel fold blocks of 256, if any.
*/
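/*
* Each iteration folds the 256-byte accumulator (x1..x4) forward over the
* next 256 bytes of input: imm 0x00 multiplies the low 64 bits of every
* 128-bit lane by k1 and imm 0x11 multiplies the high 64 bits by k2 (the
* x^(2048 +/- 32) constants, i.e. a 256-byte fold distance), and
* _mm512_ternarylogic_epi64 with 0x96 (the truth table of a three-way XOR)
* combines both products with the freshly loaded data.
*/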
while (length >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = load_zmm(input + 0x00);
y6 = load_zmm(input + 0x40);
y7 = load_zmm(input + 0x80);
y8 = load_zmm(input + 0xC0);
x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
input += 256;
length -= 256;
}
/*
* Fold 256 bytes into 64 bytes.
*/
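/*
* The four 64-byte lanes are reduced pairwise: x1 is folded into x3 and x2
* into x4 (using k9k10, presumably the 128-byte fold constants), then x3 is
* folded into x4 with k3/k4 (a 64-byte fold distance), leaving a single
* 64-byte accumulator in x1.
*/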
x0 = load_zmm(k9k10);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
x0 = load_zmm(k3k4);
y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
/*
* Single fold blocks of 64, if any.
*/
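/* x0 still holds k3/k4, whose 512-bit (64-byte) fold distance matches the
* 64-byte stride of this loop. */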
while (length >= 64) {
x2 = load_zmm(input);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
input += 64;
length -= 64;
}
/*
* Fold 512-bits to 128-bits.
*/
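/*
* k1k4 carries a different constant pair in each 128-bit lane, so a single
* pair of multiplies folds lanes 0-2 of x1 forward by decreasing distances
* to line them up with lane 3 (whose k pair is zero and which therefore
* passes through unchanged). The extracts below then XOR the four aligned
* lanes into one 128-bit value.
*/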
x0 = load_zmm(k1k4);
x4 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x3 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_xor_si512(x3, x4);
a1 = _mm_xor_si128(_mm512_extracti32x4_epi32(x1, 3), _mm512_extracti32x4_epi32(x2, 0));
a1 = _mm_ternarylogic_epi64(a1, _mm512_extracti32x4_epi32(x2, 1), _mm512_extracti32x4_epi32(x2, 2), 0x96);
/*
* Fold 128-bits to 32-bits.
*/
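/* The SSE4.2 crc32 instruction performs the final 128-bit to 32-bit
* reduction: feeding the two 64-bit halves of a1 through _mm_crc32_u64,
* starting from a crc of 0, produces the CRC of the fully folded message. */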
uint64_t val;
val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
}
#endif /* defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64) */
static bool detection_performed = false;
static bool detected_sse42 = false;
static bool detected_avx512 = false;
static bool detected_clmul = false;
static bool detected_vpclmulqdq = false;
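/*
* Entry point: computes the CRC32C of `input` using the best instruction set
* available at runtime. CPU features are probed once (see the flags above),
* the incoming CRC is bit-flipped exactly once here, small inputs are handled
* a byte at a time, the buffer is advanced to an 8-byte boundary, and the bulk
* of the data is then dispatched to the AVX-512/VPCLMULQDQ path (>= 256
* bytes), the CLMUL/SSE4.2 path, or the scalar crc32 intrinsic loops below.
*/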
uint32_t aws_checksums_crc32c_intel_avx512_with_sse_fallback(const uint8_t *input, int length, uint32_t previous_crc) {
if (AWS_UNLIKELY(!detection_performed)) {
detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2);
detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512);
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ);
/* Simply set the flag to true so that HW detection is skipped next time.
Memory barriers are not used, since the worst that can happen is a
fallback to the non-HW-accelerated code. */
detection_performed = true;
}
/* This is the entry point, so the bit flip is performed exactly once here; the subfunctions and branches
* below must not repeat it. */
uint32_t crc = ~previous_crc;
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (length < (int)sizeof(slice_ptr_int_type)) {
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
}
return ~crc;
}
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
int input_alignment = (uintptr_t)(input) & 0x7;
/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
int leading = (8 - input_alignment) & 0x7;
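/* For example, an input address ending in 0x...d has alignment 5, so
* leading = (8 - 5) & 0x7 = 3 unaligned bytes are consumed one at a time below. */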
/* reduce the length by the leading unaligned bytes we are about to process */
length -= leading;
/* spin through the leading unaligned input bytes (if any) one-by-one */
while (leading-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
}
#if defined(AWS_HAVE_AVX512_INTRINSICS) && defined(AWS_ARCH_INTEL_X64)
int chunk_size = length & ~63;
if (detected_avx512 && detected_vpclmulqdq && detected_clmul) {
if (length >= 256) {
crc = s_checksums_crc32c_avx512_impl(input, length, crc);
/* check remaining data */
length -= chunk_size;
if (!length) {
return ~crc;
}
/* Fall into the default crc32 for the remaining data. */
input += chunk_size;
}
}
#endif
#if defined(AWS_ARCH_INTEL_X64) && !defined(_MSC_VER)
if (detected_sse42 && detected_clmul) {
// This function is an entry point in its own right: it inverts the crc passed in,
// does its work, and inverts the result again on return. To keep
// aws_checksums_crc32c_clmul_sse42 a standalone function (which it has to be,
// due to the way it's implemented), it's better that it doesn't need to know it
// is being used as a fallback within a larger computation.
return aws_checksums_crc32c_clmul_sse42(input, length, ~crc);
}
#endif
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
while (length >= (int)sizeof(slice_ptr_int_type)) {
crc = (uint32_t)crc_intrin_fn(crc, *(slice_ptr_int_type *)(input));
input += sizeof(slice_ptr_int_type);
length -= (int)sizeof(slice_ptr_int_type);
}
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input);
input++;
}
return ~crc;
}
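/*
* Illustrative usage sketch, not part of the library: the guard macro
* AWS_CHECKSUMS_CRC32C_USAGE_EXAMPLE is hypothetical and exists only so this
* example is not normally compiled. It shows the calling convention implied
* above: start from a CRC of 0 and chain calls by passing the previous return
* value back in as previous_crc.
*/
#ifdef AWS_CHECKSUMS_CRC32C_USAGE_EXAMPLE
# include <inttypes.h>
# include <stdio.h>
static void s_crc32c_usage_example(void) {
    const uint8_t part1[] = "hello, ";
    const uint8_t part2[] = "world";
    /* CRC32C of the first chunk, starting from 0. */
    uint32_t crc = aws_checksums_crc32c_intel_avx512_with_sse_fallback(part1, (int)(sizeof(part1) - 1), 0);
    /* Continue the same CRC over the second chunk by passing the running value back in. */
    crc = aws_checksums_crc32c_intel_avx512_with_sse_fallback(part2, (int)(sizeof(part2) - 1), crc);
    printf("crc32c = 0x%08" PRIx32 "\n", crc);
}
#endif /* AWS_CHECKSUMS_CRC32C_USAGE_EXAMPLE */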