File: crc32_refl_by8_sse.asm

package info (click to toggle)
intel-ipsec-mb 1.3-2
  • links: PTS
  • area: main
  • in suites: bookworm
  • size: 16,792 kB
  • sloc: ansic: 100,269; asm: 87,579; pascal: 6,014; javascript: 5,562; makefile: 1,230; python: 1,088
file content (415 lines) | stat: -rw-r--r-- 14,444 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
;;
;; Copyright (c) 2020-2022, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; Authors of original CRC implementation:
;;     Erdinc Ozturk
;;     Vinodh Gopal
;;     James Guilford
;;     Greg Tucker
;;
;; Reference paper titled:
;;     "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;;     URL: http://download.intel.com/design/intarch/papers/323102.pdf

%include "include/os.asm"
%include "include/memcpy.asm"
%include "include/reg_sizes.asm"
%include "include/crc32_refl.inc"
%include "include/clear_regs.asm"

;; Name of the exported routine. A file that includes this one may pre-define
;; CRC32_REFL_FN to emit the same code under a different symbol name.
%ifndef CRC32_REFL_FN
%define CRC32_REFL_FN crc32_refl_by8_sse
%endif

;; 64-bit code; memory operands without an explicit base register default to
;; RIP-relative addressing.
[bits 64]
default rel

;; Integer argument registers.
;; With LINUX defined these are the first four System V AMD64 integer argument
;; registers; otherwise they are the four Microsoft x64 argument registers.
%ifdef LINUX
%define arg1            rdi
%define arg2            rsi
%define arg3            rdx
%define arg4            rcx
%else
%define arg1            rcx
%define arg2            rdx
%define arg3            r8
%define arg4            r9
%endif

;; mksection is not defined in this file (presumably a section-switching macro
;; from include/os.asm -- confirm).
mksection .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; CRC32_REFL_FN (default symbol name: crc32_refl_by8_sse)
;;
;; Computes a CRC32 over a byte buffer with PCLMULQDQ-based folding, processing
;; up to 8 x 16 bytes per loop iteration. Going by the file and symbol names
;; this is the bit-reflected CRC32 variant; the polynomial is determined solely
;; by the constants table passed in arg4.
;;
;; arg1 - initial CRC value (only the low 32 bits are used)
;; arg2 - buffer pointer (no alignment requirement; all data loads are unaligned)
;; arg3 - buffer size in bytes (compared and counted as a signed 64-bit value)
;; arg4 - pointer to CRC constants, indexed with the crc32_const_* offsets
;;        (defined outside this file, presumably in include/crc32_refl.inc).
;;        The constants are read with movdqa, so every accessed entry must be
;;        16-byte aligned.
;; Returns CRC value through EAX
;;
;; The initial CRC is bitwise-inverted on entry and the result is
;; bitwise-inverted again before returning. A zero-length buffer therefore
;; returns the low 32 bits of arg1 unchanged.
;;
;; Modifies: rax, arg1 (low dword inverted), arg2, arg3, flags,
;;           xmm0-xmm10, xmm12, xmm13. No stack memory is used.
;;           With SAFE_DATA defined, clear_all_xmms_sse_asm is invoked before
;;           returning (presumably clears every XMM register -- see
;;           include/clear_regs.asm).
;; NOTE(review): xmm6-xmm13 are overwritten without being saved. In the
;;           Microsoft x64 convention xmm6-xmm15 are callee-saved, so on
;;           non-LINUX builds the callers of this internal symbol presumably
;;           save/restore them -- confirm against the callers.
;;
;; Dispatch by buffer size:
;;     >= 256      : 128-byte folding loop, then the 16-byte tail handling
;;     32 .. 255   : 16-byte folding loop
;;     17 .. 31    : one 16-byte block plus a partial block
;;     16          : single block, straight to the final reduction
;;     4 .. 15     : partial block, shifted, then the final reduction
;;     1 .. 3      : partial block, shifted, straight to Barrett reduction
;;     0           : return the initial CRC
align 32
MKGLOBAL(CRC32_REFL_FN,function,internal)
CRC32_REFL_FN:
        ;; work on the inverted CRC; it is inverted back at .cleanup
        not             DWORD(arg1)

        ;; check if smaller than 256B (signed compare)
        cmp             arg3, 256
        jl              .less_than_256

        ;; load the initial crc value into the low 32 bits of xmm10
        movd            xmm10, DWORD(arg1)      ; initial crc

        ;; load the first 128B of data into xmm0..xmm7
        movdqu          xmm0, [arg2 + 16 * 0]
        movdqu          xmm1, [arg2 + 16 * 1]
        movdqu          xmm2, [arg2 + 16 * 2]
        movdqu          xmm3, [arg2 + 16 * 3]
        movdqu          xmm4, [arg2 + 16 * 4]
        movdqu          xmm5, [arg2 + 16 * 5]
        movdqu          xmm6, [arg2 + 16 * 6]
        movdqu          xmm7, [arg2 + 16 * 7]

        ;; XOR the initial_crc value into the first 4 bytes of data,
        ;; then reuse xmm10 for the 8x128b folding constants
        pxor            xmm0, xmm10
        movdqa          xmm10, [arg4 + crc32_const_fold_8x128b]

        ;; subtract 256 instead of 128 to save one instruction from the loop
        ;; (arg3 is now >= 0; the loop's own "sub arg3, 128" sets the flags
        ;; that decide whether another full 128B block is available)
        sub             arg3, 256

        ;; In this section of the code, there are ((128 * x) + y) bytes of
        ;; buffer, where 0 <= y < 128 and x >= 2.
        ;; The fold_128_B_loop loop will fold 128 bytes at a time until
        ;; only the 128 folded bytes in xmm0..xmm7 plus y bytes in memory
        ;; are left.

        ;; Fold 128 bytes at a time.
        ;; This section of the code folds 8 xmm registers in parallel.
        ;; For each register: both 64-bit halves are carry-less multiplied
        ;; by the two halves of the constant in xmm10 and the two products
        ;; are XORed with the next 16 bytes of data.
        ;; xmm8/xmm13 hold the second product, xmm9/xmm12 the new data.
.fold_128_B_loop:
        add             arg2, 128
        movdqu          xmm9, [arg2 + 16 * 0]
        movdqu          xmm12, [arg2 + 16 * 1]
        movdqa          xmm8, xmm0
        pclmulqdq       xmm8, xmm10, 0x10
        pclmulqdq       xmm0, xmm10 , 0x1
        movdqa          xmm13, xmm1
        pclmulqdq       xmm13, xmm10, 0x10
        pclmulqdq       xmm1, xmm10 , 0x1
        pxor            xmm0, xmm9
        xorps           xmm0, xmm8
        pxor            xmm1, xmm12
        xorps           xmm1, xmm13

        movdqu          xmm9, [arg2 + 16 * 2]
        movdqu          xmm12, [arg2 + 16 * 3]
        movdqa          xmm8, xmm2
        pclmulqdq       xmm8, xmm10, 0x10
        pclmulqdq       xmm2, xmm10 , 0x1
        movdqa          xmm13, xmm3
        pclmulqdq       xmm13, xmm10, 0x10
        pclmulqdq       xmm3, xmm10 , 0x1
        pxor            xmm2, xmm9
        xorps           xmm2, xmm8
        pxor            xmm3, xmm12
        xorps           xmm3, xmm13

        movdqu          xmm9, [arg2 + 16 * 4]
        movdqu          xmm12, [arg2 + 16 * 5]
        movdqa          xmm8, xmm4
        pclmulqdq       xmm8, xmm10, 0x10
        pclmulqdq       xmm4, xmm10 , 0x1
        movdqa          xmm13, xmm5
        pclmulqdq       xmm13, xmm10, 0x10
        pclmulqdq       xmm5, xmm10 , 0x1
        pxor            xmm4, xmm9
        xorps           xmm4, xmm8
        pxor            xmm5, xmm12
        xorps           xmm5, xmm13

        movdqu          xmm9, [arg2 + 16 * 6]
        movdqu          xmm12, [arg2 + 16 * 7]
        movdqa          xmm8, xmm6
        pclmulqdq       xmm8, xmm10, 0x10
        pclmulqdq       xmm6, xmm10 , 0x1
        movdqa          xmm13, xmm7
        pclmulqdq       xmm13, xmm10, 0x10
        pclmulqdq       xmm7, xmm10 , 0x1
        pxor            xmm6, xmm9
        xorps           xmm6, xmm8
        pxor            xmm7, xmm12
        xorps           xmm7, xmm13

        ;; loop while at least one more full 128B block remains in memory
        sub             arg3, 128
        jge             .fold_128_B_loop

        add             arg2, 128
        ;; At this point, the buffer pointer is pointing at the last
        ;; y bytes of the buffer, where 0 <= y < 128, and arg3 = y - 128.
        ;; The 128B of folded data is in 8 of the xmm registers:
        ;;     xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

        ;; fold the 8 xmm registers into 1 xmm register with different constants:
        ;; xmm0..xmm6 are each multiplied by their own constant and
        ;; accumulated into xmm7
        movdqa          xmm10, [arg4 + crc32_const_fold_7x128b]
        movdqa          xmm8, xmm0
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm0, xmm10, 0x10
        pxor            xmm7, xmm8
        xorps           xmm7, xmm0

        movdqa          xmm10, [arg4 + crc32_const_fold_6x128b]
        movdqa          xmm8, xmm1
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm1, xmm10, 0x10
        pxor            xmm7, xmm8
        xorps           xmm7, xmm1

        movdqa          xmm10, [arg4 + crc32_const_fold_5x128b]
        movdqa          xmm8, xmm2
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm2, xmm10, 0x10
        pxor            xmm7, xmm8
        pxor            xmm7, xmm2

        movdqa          xmm10, [arg4 + crc32_const_fold_4x128b]
        movdqa          xmm8, xmm3
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm3, xmm10, 0x10
        pxor            xmm7, xmm8
        xorps           xmm7, xmm3

        movdqa          xmm10, [arg4 + crc32_const_fold_3x128b]
        movdqa          xmm8, xmm4
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm4, xmm10, 0x10
        pxor            xmm7, xmm8
        pxor            xmm7, xmm4

        movdqa          xmm10, [arg4 + crc32_const_fold_2x128b]
        movdqa          xmm8, xmm5
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm5, xmm10, 0x10
        pxor            xmm7, xmm8
        xorps           xmm7, xmm5

        ;; xmm10 keeps the 1x128b constant from here on; the 16B loop and
        ;; the partial-block code below both rely on it
        movdqa          xmm10, [arg4 + crc32_const_fold_1x128b]
        movdqa          xmm8, xmm6
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm6, xmm10, 0x10
        pxor            xmm7, xmm8
        pxor            xmm7, xmm6

        ;; Instead of 128, we add 128-16 to the loop counter to save 1
        ;; instruction from the loop below (arg3 becomes y - 16).
        ;; Instead of a cmp instruction, the flags set by the add are used
        ;; directly by jl: taken when fewer than 16 bytes remain in memory.
        add             arg3, 128 - 16
        jl              .final_reduction_for_128

        ;; There are 16 + y bytes left to reduce.
        ;; 16 bytes is in register xmm7 and the rest is in memory
        ;; we can fold 16 bytes at a time if y>=16
        ;; continue folding 16B at a time
        ;;
        ;; Entry conditions (also used by the jump from .less_than_256):
        ;;     xmm7  = current 16B of folded data
        ;;     xmm10 = crc32_const_fold_1x128b constant
        ;;     arg2  = next unread byte
        ;;     arg3  = (bytes left in memory) - 16, non-negative

.16B_reduction_loop:
        movdqa          xmm8, xmm7
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pxor            xmm7, xmm8
        movdqu          xmm0, [arg2]
        pxor            xmm7, xmm0
        add             arg2, 16
        sub             arg3, 16
        ;; Instead of a cmp instruction, we utilize the flags with the jge instruction.
        ;; Equivalent of check if there is any more 16B in the buffer to be folded.
        jge             .16B_reduction_loop

        ;; Now we have 16+z bytes left to reduce, where 0<= z < 16.
        ;; First, we reduce the data in the xmm7 register

.final_reduction_for_128:
        ;; undo the -16 bias: arg3 = z, the number of bytes still in memory
        add             arg3, 16
        je              .128_done

        ;; Here we are getting data that is less than 16 bytes.
        ;; Since we know that there was data before the pointer, we can offset
        ;; the input pointer before the actual point, to receive exactly 16 bytes.
        ;; After that the registers need to be adjusted.
        ;;
        ;; Entry conditions (also used by the jump from .less_than_32):
        ;;     xmm7  = current 16B of folded data
        ;;     xmm10 = crc32_const_fold_1x128b constant
        ;;     arg2  = first of the remaining bytes, arg3 = their count (1..15)
.get_last_two_xmms:

        movdqa          xmm2, xmm7
        ;; last 16 bytes of the buffer; overlaps data that was already folded
        movdqu          xmm1, [arg2 - 16 + arg3]

        ;; Get rid of the extra data that was loaded before.
        ;; Load the shift constant: 16 bytes starting at table offset arg3.
        lea             rax, [rel pshufb_shf_table]
        movdqu          xmm0, [rax + arg3]

        ;; xmm7 = xmm7 shifted left by (16 - arg3) bytes, zero filled
        pshufb          xmm7, xmm0
        ;; flipping bit 7 of every control byte turns the left-shift control
        ;; into the complementary right-shift-by-arg3 control
        pxor            xmm0, [rel mask3]
        ;; xmm2 = old xmm7 shifted right by arg3 bytes, zero filled
        pshufb          xmm2, xmm0

        ;; bytes whose control byte has bit 7 set (the top arg3 bytes) are
        ;; taken from xmm1, i.e. from the remaining bytes of the buffer
        pblendvb        xmm2, xmm1              ; xmm0 is implicit

        ;; fold the shifted xmm7 and XOR in the assembled final block
        movdqa          xmm8, xmm7
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pxor            xmm7, xmm8
        pxor            xmm7, xmm2

.128_done:
        ;; compute crc of a 128-bit value held in xmm7
        movdqa          xmm10, [arg4 + crc32_const_fold_128b_to_64b]
        movdqa          xmm0, xmm7

        ;; 64b fold: multiply the low qword by the low qword of the constant
        ;; and XOR the product with the high qword of the data
        pclmulqdq       xmm7, xmm10, 0
        psrldq          xmm0, 8
        pxor            xmm7, xmm0

        ;; 32b fold: move the low dword up by 4 bytes, multiply the low qword
        ;; by the high qword of the constant and XOR with the previous value
        movdqa          xmm0, xmm7
        pslldq          xmm7, 4
        pclmulqdq       xmm7, xmm10, 0x10
        pxor            xmm7, xmm0

        ;; barrett reduction
        ;; (also entered directly by the 1..3 byte paths, with the data
        ;; already positioned in xmm7)
.barrett:
        pand            xmm7, [rel mask2]       ; clear the low 32 bits
        movdqa          xmm1, xmm7
        movdqa          xmm2, xmm7
        movdqa          xmm10, [arg4 + crc32_const_reduce_64b_to_32b]

        pclmulqdq       xmm7, xmm10, 0
        pxor            xmm7, xmm2
        pand            xmm7, [rel mask]        ; keep only the low 64 bits
        movdqa          xmm2, xmm7
        pclmulqdq       xmm7, xmm10, 0x10
        pxor            xmm7, xmm2
        pxor            xmm7, xmm1
        pextrd          eax, xmm7, 2            ; result is dword 2 (bits 95:64)

        ;; Common exit. EAX holds the inverted CRC on entry to this label.
.cleanup:
%ifdef SAFE_DATA
        clear_all_xmms_sse_asm
%endif
        not             eax
        ret

        ;; ---- buffer size < 256 ----
align 32
.less_than_256:
        ;; check if there is enough buffer to be able to fold 16B at a time
        cmp             arg3, 32
        jl              .less_than_32

        ;; if there is, load the constants
        movdqa          xmm10, [arg4 + crc32_const_fold_1x128b]

        movd            xmm0, DWORD(arg1)       ; get the initial crc value
        movdqu          xmm7, [arg2]            ; load the plaintext
        pxor            xmm7, xmm0

        ;; update the buffer pointer
        add             arg2, 16

        ;; update the counter
        ;; - subtract 32 instead of 16 to save one instruction from the loop
        ;;   (matches the "(bytes left in memory) - 16" convention of the loop)
        sub             arg3, 32
        jmp             .16B_reduction_loop

        ;; ---- buffer size < 32 ----
align 32
.less_than_32:
        ;; Move initial crc to the return value.
        ;; This is necessary for zero-length buffers.
        mov             eax, DWORD(arg1)
        test            arg3, arg3
        je              .cleanup

        movd            xmm0, DWORD(arg1)       ; get the initial crc value

        ;; one cmp feeds both branches: == 16 and < 16
        cmp             arg3, 16
        je              .exact_16_left
        jl              .less_than_16_left

        ;; 17..31 bytes: take the first 16 bytes as the running value and
        ;; handle the remaining 1..15 bytes as a partial block
        movdqu          xmm7, [arg2]            ; load the plaintext
        pxor            xmm7, xmm0              ; xor the initial crc value
        add             arg2, 16
        sub             arg3, 16
        movdqa          xmm10, [arg4 + crc32_const_fold_1x128b]
        jmp             .get_last_two_xmms

        ;; ---- buffer size 1..15 ----
align 32
.less_than_16_left:
        ;; simd_load_sse_15_1 is not defined in this file (presumably from
        ;; include/memcpy.asm: loads arg3 bytes from [arg2] into the low
        ;; bytes of xmm7 and zeroes the rest -- confirm)
        simd_load_sse_15_1 xmm7, arg2, arg3
        pxor            xmm7, xmm0              ; xor the initial crc value

        cmp             arg3, 4
        jl              .only_less_than_4

        ;; 4..15 bytes: shift the data left by (16 - arg3) bytes so that it
        ;; occupies the top of the register, then do the final reduction
        lea             rax, [rel pshufb_shf_table]
        movdqu          xmm0, [rax + arg3]
        pshufb          xmm7,xmm0
        jmp             .128_done

        ;; ---- buffer size exactly 16 ----
align 32
.exact_16_left:
        movdqu          xmm7, [arg2]
        pxor            xmm7, xmm0              ; xor the initial crc value
        jmp             .128_done

        ;; ---- buffer size 1..3 ----
        ;; The data is shifted left by (8 - size) bytes and the 128-bit folds
        ;; are skipped by jumping straight to the Barrett reduction.
.only_less_than_4:
        cmp             arg3, 3
        jl              .only_less_than_3
        pslldq          xmm7, 5                 ; size == 3
        jmp             .barrett

.only_less_than_3:
        cmp             arg3, 2
        jl              .only_less_than_2
        pslldq          xmm7, 6                 ; size == 2
        jmp             .barrett

.only_less_than_2:
        pslldq          xmm7, 7                 ; size == 1
        jmp             .barrett

;; Read-only constants used by the tail handling and Barrett reduction code.
;; All are 16-byte aligned; mask, mask2 and mask3 are used directly as
;; 128-bit memory operands of SSE instructions.
mksection .rodata

;; Keeps the low 64 bits of an xmm register and clears the high 64 bits
;; (applied between the two Barrett multiplications).
align 16
mask:
        dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000

;; Clears the low 32 bits of an xmm register and keeps the upper 96 bits
;; (applied on entry to the Barrett reduction).
align 16
mask2:
        dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF

;; Bit 7 of every byte. XORed into a pshufb control loaded from
;; pshufb_shf_table to turn a "shift left by (16 - n)" control into the
;; complementary "shift right by n" control; the result also serves as the
;; pblendvb selection mask.
align 16
mask3:
        dq     0x8080808080808080, 0x8080808080808080

;; Byte-shift controls for pshufb. The code loads 16 bytes starting at
;; offset n (n = 1..15, unaligned). The first (16 - n) control bytes then have
;; bit 7 set (destination byte is zeroed) and the last n control bytes are
;; 0..n-1, so pshufb moves the low n source bytes to the top of the register,
;; i.e. shifts left by (16 - n) bytes.
align 16
pshufb_shf_table:
;; use these values for shift constants for the pshufb instruction
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

;; stack-noexec is not defined in this file (presumably expands to a section
;; marking the stack as non-executable on ELF targets -- confirm in include/os.asm).
mksection stack-noexec