File: visualc_crc32c_sse42.c

package info (click to toggle)
aws-crt-python 0.20.4%2Bdfsg-1~bpo12%2B1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm-backports
  • size: 72,656 kB
  • sloc: ansic: 381,805; python: 23,008; makefile: 6,251; sh: 4,536; cpp: 699; ruby: 208; java: 77; perl: 73; javascript: 46; xml: 11
file content (77 lines) | stat: -rw-r--r-- 2,690 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#include <aws/checksums/private/crc_priv.h>
#include <intrin.h>

#if defined(_M_X64) || defined(_M_IX86)

/*
 * Word-sized "slice" types used by aws_checksums_crc32c_hw below.
 * slice_ptr_int_type is the unsigned integer fed to the CRC intrinsic on each
 * step of the bulk loop, and slice_ptr_type is a pointer to it.
 */
#    if defined(_M_X64)
/* x64 build: the bulk loop consumes 8 bytes per step via _mm_crc32_u64. */
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    else
/* Any other target that passes the enclosing guard (_M_IX86): 4 bytes per step via _mm_crc32_u32. */
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    endif

/**
 * This implements crc32c via the intel sse 4.2 instructions.
 *  This is separate from the straight asm version, because visual c does not allow
 *  inline assembly for x64.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) {
    uint32_t crc = ~previousCrc32;
    int length_to_process = length;

    slice_ptr_type temp = (slice_ptr_type)data;

    /*to eek good performance out of the intel implementation, we need to only hit the hardware
      once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are
      8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch)

      first calculate how many bytes we need to burn before we are aligned.
      for a 64 bit arch this is:
      (8 - <how far we are past a boundary>) mod 8
      32 bit:
      (4 - <how far we are past a boundary>) mod 4 */
    uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) %
                               sizeof(slice_ptr_int_type);

    /*for every byte we need to burn off, just do them a byte at a time.
      increment the temp pointer by one byte at a time until we get it on an alignment boundary */
    while (alignment_offset != 0 && length_to_process) {
        uint8_t *byte_pos = (uint8_t *)temp;
        crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++);
        temp = (slice_ptr_type)byte_pos;
        --alignment_offset;
        --length_to_process;
    }

    /*now whatever is left is properly aligned on a boundary*/
    uint32_t slices = length_to_process / sizeof(temp);
    uint32_t remainder = length_to_process % sizeof(temp);

    while (slices--) {
#    if defined(_M_X64)
        crc = (uint32_t)_mm_crc32_u64(crc, *temp++);
#    else
        crc = _mm_crc32_u32(crc, *temp++);
#    endif
    }

    /* process the remaining parts that can't be done on the slice size. */
    uint8_t *remainderPos = (uint8_t *)temp;

    while (remainder--) {
        crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++);
    }

    return ~crc;
}

/**
 * CRC32 entry point for the "hw" interface on this build.
 * This translation unit has no intrinsic-based CRC32 path, so the call is
 * forwarded unchanged to aws_checksums_crc32_sw.
 *
 * @param input         bytes to checksum.
 * @param length        number of bytes at input.
 * @param previousCrc32 running CRC passed through to the callee.
 * @return whatever aws_checksums_crc32_sw returns for the same arguments.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const uint32_t forwarded_crc = aws_checksums_crc32_sw(input, length, previousCrc32);
    return forwarded_crc;
}
#endif /* x64 || x86 */