;;
;; Copyright (c) 2018-2024, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef MB_MGR_AES_CMAC_SUBMIT_FLUSH_SSE_INC
%define MB_MGR_AES_CMAC_SUBMIT_FLUSH_SSE_INC

%include "include/os.inc"
%include "include/imb_job.inc"
%include "include/mb_mgr_datastruct.inc"

%include "include/reg_sizes.inc"
%include "include/memcpy.inc"
%include "include/const.inc"
;%define DO_DBGPRINT
%include "include/dbgprint.inc"

%define NUM_LANES 8

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1	rdi
%define arg2	rsi
%else
%define arg1	rcx
%define arg2	rdx
%endif

%define state	arg1
%define job	arg2
%define len2	arg2

%define job_rax          rax

; idx needs to be in rbp
%define len              rbp
%define idx              rbp
%define tmp              rbp

%define lane             r8

%define iv               r9
%define m_last           r10
%define n                r11

%define unused_lanes     rbx
%define r                rbx

%define tmp3             r12
%define tmp4             r13
%define tmp2             r14

%define good_lane        r15
%define rbits            r15

; STACK_size needs to be an odd multiple of 8
; This routine and its callee clobber all GPRs
struc STACK
_gpr_save:	resq	8
_rsp_save:	resq	1
endstruc

;;; ===========================================================================
;;; ===========================================================================
;;; MACROS
;;; ===========================================================================
;;; ===========================================================================

;;; ===========================================================================
;;; AES CMAC job submit & flush
;;; ===========================================================================
;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 2
%define %%SUBMIT_FLUSH %1 ;; [in] "SUBMIT" or "FLUSH" selection
%define %%AES_CBC_MAC  %2 ;; [in] function to process CBC MAC algorithm on all lanes
        mov	rax, rsp
        sub	rsp, STACK_size
        and	rsp, -16

	mov	[rsp + _gpr_save + 8*0], rbx
	mov	[rsp + _gpr_save + 8*1], rbp
	mov	[rsp + _gpr_save + 8*2], r12
	mov	[rsp + _gpr_save + 8*3], r13
	mov	[rsp + _gpr_save + 8*4], r14
	mov	[rsp + _gpr_save + 8*5], r15
%ifndef LINUX
	mov	[rsp + _gpr_save + 8*6], rsi
	mov	[rsp + _gpr_save + 8*7], rdi
%endif
	mov	[rsp + _rsp_save], rax	; original SP

        ;; Find free lane
 	mov	unused_lanes, [state + _aes_cmac_unused_lanes]
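        ;; unused_lanes is a stack of 4-bit lane ids packed into a GPR:
        ;; pop = value & 0xF then shr 4; push = shl 4 then or with the id.
        ;; Assuming the manager seeds it as 0xF76543210 elsewhere (0xF
        ;; sentinel on top of lanes 7..0), popping lane 0 leaves 0xF7654321.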

%ifidn %%SUBMIT_FLUSH, SUBMIT

 	mov	lane, unused_lanes
        and	lane, 0xF
 	shr	unused_lanes, 4
 	mov	[state + _aes_cmac_unused_lanes], unused_lanes

        ;; Copy job info into lane
 	mov	[state + _aes_cmac_job_in_lane + lane*8], job
        ;; Copy keys into lane args
 	mov	tmp, [job + _key_expanded]
 	mov	[state + _aes_cmac_args_keys + lane*8], tmp
        mov     tmp, lane
        shl     tmp, 4  ; lane*16

        ;; Zero the IV; it accumulates the running CBC-MAC digest
        pxor    xmm0, xmm0
        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0

        lea     m_last, [state + _aes_cmac_scratch + tmp]

        ;; Calculate byte length: the message length is given in bits
        ;; for CMAC (3GPP), so round up to bytes and keep the remainder bits
        mov     len, [job + _msg_len_to_hash_in_bits]
        mov     rbits, len
        add     len, 7      ; inc len if there are remainder bits
        shr     len, 3
        and     rbits, 7
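        ;; e.g. 65 bits: len = (65 + 7) >> 3 = 9 bytes, rbits = 65 & 7 = 1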

        ;; Compute the number of blocks: n = ceil(len / 16)
        mov     n, len
        add     n, 0xf
        shr     n, 4

        ;; r = len mod 16 (bytes in a trailing partial block)
        mov     r, len
        and     r, 0xf
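        ;; e.g. len = 25 bytes: n = (25 + 15) >> 4 = 2, r = 25 & 0xF = 9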

        or      n, n   ; at least one block?
        jz      %%_lt_one_block

        ;; One or more blocks, potentially partial
        mov     word [state + _aes_cmac_init_done + lane*2], 0

        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _aes_cmac_args_in + lane*8], tmp2

        ;; len = (n-1)*16
        lea     tmp2, [n - 1]
        shl     tmp2, 4
        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0
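        ;; _aes_cmac_lens keeps one 16-bit length per lane; XPINSRW
        ;; (include/const.inc) writes the value into this lane's word
        ;; slot. Only the first n-1 blocks are queued here; M_last is
        ;; built and hashed separately below.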

        ;; check remainder bits
        or      rbits, rbits
        jnz     %%_not_complete_block_3gpp

        ;; check if complete block
        or      r, r
        jz      %%_complete_block

%%_not_complete_block:
        ;; M_last = padding(M_n) XOR K2
        lea     tmp, [rel padding_0x80_tab16 + 16]
        sub     tmp, r
        movdqu  xmm0, [tmp]
        movdqa  [m_last], xmm0
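        ;; padding_0x80_tab16 is presumably 16 zero bytes, 0x80, then 15
        ;; more zeros: reading 16 bytes at (table + 16 - r) puts 0x80 at
        ;; byte r with zeros above it; the memcpy below then fills bytes
        ;; 0..r-1 with the message tail (NIST SP 800-38B padding)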

        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        memcpy_sse_16 m_last, tmp, r, tmp4, tmp3

        ;; XOR padded block with K2 (skey2)
        mov     tmp3, [job + _skey2]
        movdqa  xmm1, [m_last]
        movdqu  xmm0, [tmp3]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

%%_step_5:
        ;; Find min length
        movdqa  xmm0, [state + _aes_cmac_lens]
        phminposuw xmm1, xmm0

        cmp	byte [state + _aes_cmac_unused_lanes], 0xf
        jne	%%_return_null

%else ; end SUBMIT

        ;; Check at least one job
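        ;; bit (NUM_LANES*4 + 3) is the top bit of the 9th nibble; with
        ;; the seeding assumed above it is set only once all 8 lane ids
        ;; are back on the stack, i.e. there is no job to flush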
        bt      unused_lanes, ((NUM_LANES * 4) + 3)
	jc      %%_return_null

      	;; Find a lane with a non-null job
	xor	good_lane, good_lane
	cmp	qword [state + _aes_cmac_job_in_lane + 1*8], 0
	cmovne	good_lane, [rel one]
	cmp	qword [state + _aes_cmac_job_in_lane + 2*8], 0
	cmovne	good_lane, [rel two]
	cmp	qword [state + _aes_cmac_job_in_lane + 3*8], 0
	cmovne	good_lane, [rel three]
	cmp	qword [state + _aes_cmac_job_in_lane + 4*8], 0
	cmovne	good_lane, [rel four]
	cmp	qword [state + _aes_cmac_job_in_lane + 5*8], 0
	cmovne	good_lane, [rel five]
	cmp	qword [state + _aes_cmac_job_in_lane + 6*8], 0
	cmovne	good_lane, [rel six]
	cmp	qword [state + _aes_cmac_job_in_lane + 7*8], 0
	cmovne	good_lane, [rel seven]
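	;; lane 0 is the default; each cmovne overwrites good_lane, so it
	;; ends up as the highest-numbered lane holding a job (any one works)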

	; Copy good_lane to empty lanes
	mov	tmp2, [state + _aes_cmac_args_in + good_lane*8]
	mov	tmp3, [state + _aes_cmac_args_keys + good_lane*8]
	shl	good_lane, 4 ; multiply by 16
	movdqa	xmm2, [state + _aes_cmac_args_IV + good_lane]
	movdqa	xmm0, [state + _aes_cmac_lens]
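	;; the copy below makes idle lanes alias the good lane's src/keys/IV
	;; so the 8-lane kernel can run safely; OR-ing len_masks (presumably
	;; 0xFFFF in the lane's own word) pins idle-lane lens at 0xFFFF so
	;; they never win phminposuw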

%assign I 0
%rep NUM_LANES
	cmp	qword [state + _aes_cmac_job_in_lane + I*8], 0
	jne	APPEND(%%_skip_,I)
	mov	[state + _aes_cmac_args_in + I*8], tmp2
	mov	[state + _aes_cmac_args_keys + I*8], tmp3
	movdqa	[state + _aes_cmac_args_IV + I*16], xmm2
	por	xmm0, [rel len_masks + 16*I]
APPEND(%%_skip_,I):
%assign I (I+1)
%endrep
        ;; Find min length
        phminposuw xmm1, xmm0
        jmp     %%_cmac_round

%%_cmac_round_flush:
        ;; - good lane already known
        ;; - copy good_lane input pointer to empty lanes
        ;; - lens updated and phminposuw executed
        mov     tmp2, [state + _aes_cmac_args_in + good_lane*8]
        xor     tmp3, tmp3
%assign I 0
%rep NUM_LANES
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], tmp3
        jne     APPEND(%%_skip2_,I)
        mov     [state + _aes_cmac_args_in + I*8], tmp2
APPEND(%%_skip2_,I):
%assign I (I+1)
%endrep

%endif ; end FLUSH

%%_cmac_round:
	pextrw	len2, xmm1, 0	; min value
	pextrw	idx, xmm1, 1	; min index (0...7)
        or	len2, len2
	je	%%_len_is_0

	pshufb	xmm1, [rel dupw]        ; duplicate words across all lanes
        psubw	xmm0, xmm1
	movdqa	[state + _aes_cmac_lens], xmm0
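	;; broadcast the min length to every word and subtract it from all
	;; lanes: finished lanes sit at zero, the rest keep their remainders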

        ; "state" and "args" are the same address, arg1
	; len2 is arg2
	call    %%AES_CBC_MAC
	; state and idx are intact

        movdqa  xmm0, [state + _aes_cmac_lens]  ; preload lens
%%_len_is_0:
        ; Check if job complete
        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
        jnz     %%_copy_complete_digest

        ; Finish step 6
        mov     word [state + _aes_cmac_init_done + idx*2], 1
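        ;; init_done = 1 records that the (n-1)-block pass is complete;
        ;; the XPINSRW below queues M_last as one final 16-byte block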

        ; Reset NULL lane lens to UINT16_MAX
%ifidn %%SUBMIT_FLUSH, FLUSH
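        ;; pcmpeqq yields all-ones per qword where job_in_lane == NULL;
        ;; len_shuf_masks (assumed) shuffles those into the lanes' len
        ;; word slots so the ORs force NULL-lane lens to 0xFFFF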
        pxor    xmm1, xmm1
        pcmpeqq xmm1, [state + _aes_cmac_job_in_lane + 0]
        pshufb  xmm1, [rel len_shuf_masks + 0]

        pxor    xmm2, xmm2
        pcmpeqq xmm2, [state + _aes_cmac_job_in_lane + 16]
        pshufb  xmm2, [rel len_shuf_masks + 16]

        por     xmm1, xmm2
        por     xmm0, xmm1

        pxor    xmm3, xmm3
        pcmpeqq xmm3, [state + _aes_cmac_job_in_lane + 32]
        pshufb  xmm3, [rel len_shuf_masks + 32]

        pxor    xmm4, xmm4
        pcmpeqq xmm4, [state + _aes_cmac_job_in_lane + 48]
        pshufb  xmm4, [rel len_shuf_masks + 48]

        por     xmm3, xmm4
        por     xmm0, xmm3
%endif ; %%SUBMIT_FLUSH == FLUSH

        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        phminposuw xmm1, xmm0 ; find min length

        mov     tmp3, idx
        shl     tmp3, 4  ; idx*16
        lea     m_last, [state + _aes_cmac_scratch + tmp3]
        mov     [state + _aes_cmac_args_in + idx*8], m_last

%ifidn %%SUBMIT_FLUSH, SUBMIT
        jmp     %%_cmac_round
%else
        mov     good_lane, idx
        jmp     %%_cmac_round_flush
%endif

%%_copy_complete_digest:
        ; Job complete, copy digest to auth tag (AT) output
 	mov	job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     tmp4, idx
        shl     tmp4, 4
        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
        mov     tmp2, [job_rax + _auth_tag_output]

        cmp     tmp4, 16
        jne     %%_ne_16_copy

        ;; 16 byte AT copy
        movdqu  xmm0, [tmp3]
        movdqu  [tmp2], xmm0
        jmp     %%_update_lanes

%%_ne_16_copy:
        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv

%%_update_lanes:
        ; Update unused lanes
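        ;; push the freed lane id: shift the nibble stack left by 4 and
        ;; place idx in the low nibble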
        mov	unused_lanes, [state + _aes_cmac_unused_lanes]
        shl	unused_lanes, 4
 	or	unused_lanes, idx
 	mov	[state + _aes_cmac_unused_lanes], unused_lanes

        ; Set return job
        mov	job_rax, [state + _aes_cmac_job_in_lane + idx*8]

 	mov	qword [state + _aes_cmac_job_in_lane + idx*8], 0
 	or	dword [job_rax + _status], IMB_STATUS_COMPLETED_AUTH

%ifdef SAFE_DATA
        pxor    xmm0, xmm0
%ifidn %%SUBMIT_FLUSH, SUBMIT
        ;; Clear digest (in memory for IV) and scratch memory of returned job
        movdqa  [tmp3], xmm0

        shl     idx, 4
        movdqa  [state + _aes_cmac_scratch + idx], xmm0

%else
        ;; Clear digest and scratch memory of returned job and "NULL lanes"
%assign I 0
%rep NUM_LANES
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(%%_skip_clear_,I)
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm0
        movdqa  [state + _aes_cmac_scratch + I*16], xmm0
APPEND(%%_skip_clear_,I):
%assign I (I+1)
%endrep
%endif ;; SUBMIT

%endif ;; SAFE_DATA

%%_return:
	mov	rbx, [rsp + _gpr_save + 8*0]
	mov	rbp, [rsp + _gpr_save + 8*1]
	mov	r12, [rsp + _gpr_save + 8*2]
	mov	r13, [rsp + _gpr_save + 8*3]
	mov	r14, [rsp + _gpr_save + 8*4]
	mov	r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
	mov	rsi, [rsp + _gpr_save + 8*6]
	mov	rdi, [rsp + _gpr_save + 8*7]
%endif
	mov	rsp, [rsp + _rsp_save]	; original SP
	ret

%%_return_null:
	xor	job_rax, job_rax
	jmp	%%_return

%ifidn %%SUBMIT_FLUSH, SUBMIT
%%_complete_block:

        ;; Block size aligned
        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp2, tmp3

        ;; M_last = M_n XOR K1
        mov     tmp3, [job + _skey1]
        movdqu  xmm0, [tmp3]
        movdqu  xmm1, [tmp2]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

        jmp     %%_step_5

%%_lt_one_block:
        ;; Single partial block
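        ;; the whole message fits in one block: skip the (n-1)-block pass
        ;; (init_done = 1) and hash the padded M_last from scratch space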
        mov     word [state + _aes_cmac_init_done + lane*2], 1
        mov     [state + _aes_cmac_args_in + lane*8], m_last

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        mov     n, 1
        jmp     %%_not_complete_block

%%_not_complete_block_3gpp:
        ;; bit pad last block
        ;; xor with skey2
        ;; copy to m_last

        ;; load pointer to src
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        ;; check if partial block
        or      r, r
        jz      %%_load_full_block_3gpp

        simd_load_sse_15_1 xmm0, tmp, r
        dec     r

%%_update_mlast_3gpp:
        ;; set last byte padding mask
        ;; shift into correct xmm idx

        ;; save and restore rcx on windows
%ifndef LINUX
	mov	tmp, rcx
%endif
        mov     rcx, rbits
        mov     tmp3, 0xff
        shr     tmp3, cl
        movq    xmm2, tmp3
        XPSLLB  xmm2, r, xmm1, tmp2
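        ;; XPSLLB (include/const.inc) shifts xmm2 left by r bytes to line
        ;; the bit mask up with the final data byte; its pshufb shift
        ;; table is left in xmm1 and reused below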

        ;; pad final byte
        pandn   xmm2, xmm0
%ifndef LINUX
	mov	rcx, tmp
%endif
        ;; set OR mask to pad final bit
        mov     tmp2, tmp3
        shr     tmp2, 1
        xor     tmp2, tmp3 ; XOR to get OR mask
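        ;; e.g. rbits = 3: tmp3 = 0xff >> 3 = 0x1f, tmp2 = 0x0f,
        ;; xor = 0x10 - the single '1' pad bit just after the message bits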
        movq    xmm3, tmp2
        ;; xmm1 contains shift table from previous shift
        pshufb  xmm3, xmm1

        ;; load skey2 address
        mov     tmp3, [job + _skey2]
        movdqu  xmm1, [tmp3]

        ;; set final padding bit
        por     xmm2, xmm3

        ;; XOR last partial block with skey2
        ;; update mlast
        pxor    xmm2, xmm1
        movdqa  [m_last], xmm2

        jmp     %%_step_5

%%_load_full_block_3gpp:
        movdqu  xmm0, [tmp]
        mov     r, 0xf
        jmp     %%_update_mlast_3gpp
%endif
%endmacro

%endif ;; MB_MGR_AES_CMAC_SUBMIT_FLUSH_SSE_INC