File: adler32_rvv.c

package info (click to toggle)
node-yarnpkg 4.1.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 24,752 kB
  • sloc: javascript: 38,953; ansic: 26,035; cpp: 7,247; sh: 2,829; makefile: 724; perl: 493
file content (136 lines) | stat: -rw-r--r-- 5,152 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* adler32_rvv.c - RVV version of adler32
 * Copyright (C) 2023 SiFive, Inc. All rights reserved.
 * Contributed by Alex Chiang <alex.chiang@sifive.com>
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

#ifdef RISCV_RVV

#include <riscv_vector.h>
#include <stdint.h>

#include "zbuild.h"
#include "adler32_p.h"

static inline uint32_t adler32_rvv_impl(uint32_t adler, uint8_t* restrict dst, const uint8_t *src, size_t len, int COPY) {
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    /* in case user likes doing a byte at a time, keep it fast */
    if (len == 1) {
        if (COPY) memcpy(dst, src, 1);
        return adler32_len_1(adler, src, sum2);
    }

    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (src == NULL)
        return 1L;

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16) {
        if (COPY) memcpy(dst, src, len);
        return adler32_len_16(adler, src, len, sum2);
    }

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
     * accumulators to boost performance.
     *
     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
     * vl > 256 (255 * 256 <= UINT16_MAX).
     *
     * We accumulate 8-bit data into a 16-bit accumulator and then
     * move the data into the 32-bit accumulator at the last iteration.
     */
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
            if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            src += vl;
            if (COPY) dst += vl;
            subprob -= vl;
        }
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /* do modulo once each block of NMAX size */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* the left len <= 256 now, we can use 16-bit accum safely */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(src, vl);
        if (COPY) __riscv_vse8_v_u8m1(dst, v_buf8, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        src += vl;
        if (COPY) dst += vl;
        left -= vl;
    }
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);

    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum) % BASE;

    sum2 += (sum2_sum + adler * ((len - left) % BASE));

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum) % BASE;

    adler += adler_sum;

    sum2 %= BASE;
    adler %= BASE;

    while (left--) {
        if (COPY) *dst++ = *src;
        adler += *src++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}

Z_INTERNAL uint32_t adler32_fold_copy_rvv(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
    return adler32_rvv_impl(adler, dst, src, len, 1);
}

Z_INTERNAL uint32_t adler32_rvv(uint32_t adler, const uint8_t *buf, size_t len) {
    return adler32_rvv_impl(adler, NULL, buf, len, 0);
}

#endif // RISCV_RVV