// File: chachaAVX2_amd64.s
// Package: golang-github-aead-chacha20 0.0~git20180709.8b13a72-5 (Debian; suites: forky, sid, trixie)
// Stats: 406 lines, 8,187 bytes; project sloc: asm 1,539, makefile 2.
// (Web source-viewer scaffolding and the duplicated line-number gutter removed.)
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build amd64,!gccgo,!appengine,!nacl

#include "const.s"
#include "macro.s"

// Stack-frame slot layout. The prologue rounds SP up to a 32-byte boundary,
// so every slot below is 32-byte aligned and safe for VMOVDQA.

// Vector constant copied from ·two_AVX2<>: adds 2 to the block counters.
#define TWO 0(SP)

// Shuffle mask copied from ·rol16_AVX2<>: rotate each 32-bit word left by 16.
#define C16 32(SP)

// Shuffle mask copied from ·rol8_AVX2<>: rotate each 32-bit word left by 8.
#define C8 64(SP)

// Saved initial ChaCha state, one 16-byte row broadcast per 32-byte slot.
#define STATE_0 96(SP)
#define STATE_1 128(SP)
#define STATE_2 160(SP)
#define STATE_3 192(SP)

// Scratch spill slots used when all 16 Y registers are live.
#define TMP_0 224(SP)
#define TMP_1 256(SP)

// func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
//
// XORs src into dst with the ChaCha keystream derived from *state, using
// AVX2 to process up to 512 bytes (8 ChaCha blocks) per main-loop
// iteration. When len(src) is not a multiple of 64, the final keystream
// block is written to *block and the trailing bytes are XORed from it one
// by one; the returned int is that leftover byte count (0 when the length
// is a multiple of 64). NOTE(review): return-value semantics inferred from
// the finalize path — confirm against the Go caller.
//
// The quarter-round/shuffle/XOR macros (CHACHA_QROUND_AVX, XOR_AVX2, …)
// are defined in macro.s; the vector constants live in const.s.
TEXT ·xorKeyStreamAVX2(SB), 4, $320-80
	MOVQ dst_base+0(FP), DI  // DI = &dst[0]
	MOVQ src_base+24(FP), SI // SI = &src[0]
	MOVQ block+48(FP), BX    // BX = 64-byte partial-keystream buffer
	MOVQ state+56(FP), AX    // AX = 64-byte ChaCha state (16 LE words)
	MOVQ rounds+64(FP), DX   // DX = round count (even: 8, 12 or 20)
	MOVQ src_len+32(FP), CX  // CX = bytes remaining to process

	// Save the real SP in R8, then round SP up to a 32-byte boundary so
	// the VMOVDQA spills into TWO..TMP_1 are aligned.
	MOVQ SP, R8
	ADDQ $32, SP
	ANDQ $-32, SP

	// Broadcast each 16-byte quarter of the state into both 128-bit
	// lanes of a Y register: Y0 = words 0-3, Y1 = words 4-7,
	// Y2 = words 8-11, Y3 = words 12-15 (counter/nonce row).
	// ($0x22 selects src2's low lane for both halves; $0x33 its high lane.)
	VMOVDQU    0(AX), Y2
	VMOVDQU    32(AX), Y3
	VPERM2I128 $0x22, Y2, Y0, Y0
	VPERM2I128 $0x33, Y2, Y1, Y1
	VPERM2I128 $0x22, Y3, Y2, Y2
	VPERM2I128 $0x33, Y3, Y3, Y3

	TESTQ CX, CX
	JZ    done

	// ·one_AVX2<> presumably increments the counter only in the upper
	// 128-bit lane, so Y3's two lanes hold counters for two consecutive
	// blocks — TODO confirm against const.s.
	VMOVDQU ·one_AVX2<>(SB), Y4
	VPADDD  Y4, Y3, Y3

	// Keep a pristine copy of the working state for the final state-add.
	VMOVDQA Y0, STATE_0
	VMOVDQA Y1, STATE_1
	VMOVDQA Y2, STATE_2
	VMOVDQA Y3, STATE_3

	// Load the rotate masks and the "+2 blocks" counter step. Register
	// copies (Y14/Y15) serve the narrow paths; stack copies (C16/C8/TWO)
	// serve the 512-byte path, which needs every Y register for state.
	VMOVDQU ·rol16_AVX2<>(SB), Y4
	VMOVDQU ·rol8_AVX2<>(SB), Y5
	VMOVDQU ·two_AVX2<>(SB), Y6
	VMOVDQA Y4, Y14
	VMOVDQA Y5, Y15
	VMOVDQA Y4, C16
	VMOVDQA Y5, C8
	VMOVDQA Y6, TWO

	// Dispatch on the remaining length (bounds are inclusive via JBE).
	CMPQ CX, $64
	JBE  between_0_and_64
	CMPQ CX, $192
	JBE  between_64_and_192
	CMPQ CX, $320
	JBE  between_192_and_320
	CMPQ CX, $448
	JBE  between_320_and_448

at_least_512:
	// Build four double-block working states (8 blocks total):
	// (Y0-Y3), (Y4-Y7), (Y8-Y11), (Y12-Y15) with counters offset by
	// 0, +2, +4 and +6 respectively.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VPADDQ  TWO, Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ  TWO, Y7, Y11
	VMOVDQA Y0, Y12
	VMOVDQA Y1, Y13
	VMOVDQA Y2, Y14
	VPADDQ  TWO, Y11, Y15

	MOVQ DX, R9              // R9 = remaining rounds (2 consumed per pass)

chacha_loop_512:
	// All 16 Y registers hold state, so the quarter-round scratch
	// register alternates between Y8 and Y0, each parked in TMP_0 while
	// it is borrowed. One pass = one column round + one diagonal round
	// (the SHUFFLE macros rotate rows into/out of diagonal form).
	VMOVDQA Y8, TMP_0
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
	VMOVDQA TMP_0, Y8
	VMOVDQA Y0, TMP_0
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)

	CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
	VMOVDQA TMP_0, Y0
	VMOVDQA Y8, TMP_0
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
	VMOVDQA TMP_0, Y8
	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
	SUBQ    $2, R9
	JA      chacha_loop_512  // loop while R9 > 0 (rounds is even)

	// Add the saved input state and XOR 128-byte chunks into dst.
	// XOR_AVX2 needs two scratch registers (Y12/Y13 here), so the fourth
	// pair's first two rows are parked in TMP_0/TMP_1 first. Y3 is
	// re-materialized from STATE_3 and stepped by TWO after each chunk to
	// rebuild each pair's counter row.
	VMOVDQA Y12, TMP_0
	VMOVDQA Y13, TMP_1
	VPADDD  STATE_0, Y0, Y0
	VPADDD  STATE_1, Y1, Y1
	VPADDD  STATE_2, Y2, Y2
	VPADDD  STATE_3, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
	VMOVDQA STATE_0, Y0
	VMOVDQA STATE_1, Y1
	VMOVDQA STATE_2, Y2
	VMOVDQA STATE_3, Y3
	VPADDQ  TWO, Y3, Y3      // Y3 = counters for pair 2

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDQ TWO, Y3, Y3       // Y3 = counters for pair 3

	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	VPADDQ TWO, Y3, Y3       // Y3 = counters for pair 4

	// Pair 4: restore its first two rows from the spill slots while
	// adding the state.
	VPADDD TMP_0, Y0, Y12
	VPADDD TMP_1, Y1, Y13
	VPADDD Y2, Y14, Y14
	VPADDD Y3, Y15, Y15
	VPADDQ TWO, Y3, Y3       // Y3 = counters past all 8 blocks

	CMPQ CX, $512
	JB   less_than_512

	// Full 512 bytes available: XOR the last 128-byte chunk, persist the
	// advanced counter row, and advance the pointers.
	XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
	VMOVDQA Y3, STATE_3
	ADDQ    $512, SI
	ADDQ    $512, DI
	SUBQ    $512, CX
	CMPQ    CX, $448
	JA      at_least_512

	TESTQ CX, CX
	JZ    done

	// The 512-byte loop used Y14/Y15 as state; the narrow paths expect
	// the rotate masks there, so reload them from the stack copies.
	VMOVDQA C16, Y14
	VMOVDQA C8, Y15

	CMPQ CX, $64
	JBE  between_0_and_64
	CMPQ CX, $192
	JBE  between_64_and_192
	CMPQ CX, $320
	JBE  between_192_and_320
	JMP  between_320_and_448

less_than_512:
	// 448 < CX < 512: XOR_UPPER_AVX2 presumably covers bytes 384..447
	// and EXTRACT_LOWER writes the remaining keystream block to *block
	// for the byte-wise tail — see macro.s.
	XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
	EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4)
	ADDQ $448, SI
	ADDQ $448, DI
	SUBQ $448, CX            // CX = tail length (< 64)
	JMP  finalize

between_320_and_448:
	// Three double-block states (6 blocks): (Y0-Y3), (Y4-Y7), (Y8-Y11)
	// with counters offset by 0, +2 and +4.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VPADDQ  TWO, Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ  TWO, Y7, Y11

	MOVQ DX, R9

chacha_loop_384:
	// Column round + diagonal round per pass; Y13 is scratch and Y14/Y15
	// hold the rotate masks.
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	SUBQ $2, R9
	JA   chacha_loop_384

	// Add state and XOR the first two pairs (256 bytes).
	VPADDD  STATE_0, Y0, Y0
	VPADDD  STATE_1, Y1, Y1
	VPADDD  STATE_2, Y2, Y2
	VPADDD  STATE_3, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
	VMOVDQA STATE_0, Y0
	VMOVDQA STATE_1, Y1
	VMOVDQA STATE_2, Y2
	VMOVDQA STATE_3, Y3
	VPADDQ  TWO, Y3, Y3

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDQ TWO, Y3, Y3

	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $384
	JB   less_than_384

	XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	SUBQ  $384, CX
	TESTQ CX, CX
	JE    done

	ADDQ $384, SI
	ADDQ $384, DI
	JMP  between_0_and_64    // at most 64 bytes can remain here

less_than_384:
	// 320 < CX < 384: XOR one more block, stash the last keystream block
	// in *block for the byte-wise tail.
	XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
	ADDQ $320, SI
	ADDQ $320, DI
	SUBQ $320, CX
	JMP  finalize

between_192_and_320:
	// Two double-block states (4 blocks) in (Y4-Y7) and (Y8-Y11);
	// Y0-Y3 keep the untouched input state for the final add.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VMOVDQA Y3, Y7
	VMOVDQA Y0, Y8
	VMOVDQA Y1, Y9
	VMOVDQA Y2, Y10
	VPADDQ  TWO, Y3, Y11

	MOVQ DX, R9

chacha_loop_256:
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
	SUBQ $2, R9
	JA   chacha_loop_256

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	VPADDQ TWO, Y3, Y3
	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	VPADDD Y0, Y8, Y8
	VPADDD Y1, Y9, Y9
	VPADDD Y2, Y10, Y10
	VPADDD Y3, Y11, Y11
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $256
	JB   less_than_256

	XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
	SUBQ  $256, CX
	TESTQ CX, CX
	JE    done

	ADDQ $256, SI
	ADDQ $256, DI
	JMP  between_0_and_64

less_than_256:
	XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
	EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
	ADDQ $192, SI
	ADDQ $192, DI
	SUBQ $192, CX
	JMP  finalize

between_64_and_192:
	// One double-block state (2 blocks) in Y4-Y7.
	VMOVDQA Y0, Y4
	VMOVDQA Y1, Y5
	VMOVDQA Y2, Y6
	VMOVDQA Y3, Y7

	MOVQ DX, R9

chacha_loop_128:
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
	CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
	CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
	SUBQ $2, R9
	JA   chacha_loop_128

	VPADDD Y0, Y4, Y4
	VPADDD Y1, Y5, Y5
	VPADDD Y2, Y6, Y6
	VPADDD Y3, Y7, Y7
	VPADDQ TWO, Y3, Y3

	CMPQ CX, $128
	JB   less_than_128

	XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	SUBQ  $128, CX
	TESTQ CX, CX
	JE    done

	ADDQ $128, SI
	ADDQ $128, DI
	JMP  between_0_and_64

less_than_128:
	XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
	EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13)
	ADDQ $64, SI
	ADDQ $64, DI
	SUBQ $64, CX
	JMP  finalize

between_0_and_64:
	// Single block: work on the 128-bit X views (lower lanes) only.
	// X14/X15 are the low halves of the rotate masks in Y14/Y15.
	VMOVDQA X0, X4
	VMOVDQA X1, X5
	VMOVDQA X2, X6
	VMOVDQA X3, X7

	MOVQ DX, R9

chacha_loop_64:
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X5, X6, X7)
	CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
	CHACHA_SHUFFLE_AVX(X7, X6, X5)
	SUBQ $2, R9
	JA   chacha_loop_64

	VPADDD  X0, X4, X4
	VPADDD  X1, X5, X5
	VPADDD  X2, X6, X6
	VPADDD  X3, X7, X7
	// Advance the counter row by one block. X0 is dead here (its value
	// was already folded into X4), so it can hold the ·one<> constant.
	VMOVDQU ·one<>(SB), X0
	VPADDQ  X0, X3, X3

	CMPQ CX, $64
	JB   less_than_64

	XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13)
	SUBQ $64, CX             // CX becomes 0 -> returns 0
	JMP  done

less_than_64:
	// Partial block: store the full keystream block into *block and fall
	// through to the byte-wise XOR.
	VMOVDQU X4, 0(BX)
	VMOVDQU X5, 16(BX)
	VMOVDQU X6, 32(BX)
	VMOVDQU X7, 48(BX)

finalize:
	// XOR the remaining CX (< 64, > 0) bytes of src with the keystream
	// stored at BX, byte by byte. CX itself is preserved — it becomes
	// the return value.
	XORQ R11, R12 is what clears CF below; R11/R12 start zeroed so the
	XORQ R11, R11
	XORQ R12, R12
	MOVQ CX, BP

xor_loop:
	MOVB 0(SI), R11          // load src byte (low 8 bits of R11)
	MOVB 0(BX), R12          // load keystream byte
	XORQ R11, R12            // also clears CF, which JA below relies on
	MOVB R12, 0(DI)
	INCQ SI
	INCQ BX
	INCQ DI
	DECQ BP                  // DECQ sets ZF but leaves CF untouched
	JA   xor_loop            // taken while CF=0 (from XORQ) and BP > 0

done:
	// Persist the next-block counter row (low lane of Y3) back into
	// state words 12-15, clear the upper YMM state before returning to
	// Go/SSE code, and restore the unaligned SP saved in the prologue.
	VMOVDQU X3, 48(AX)
	VZEROUPPER
	MOVQ    R8, SP
	MOVQ    CX, ret+72(FP)   // leftover bytes served from *block
	RET