File: macro.s

package: golang-github-aead-chacha20 0.0~git20180709.8b13a72-4
  • area: main
  • in suites: bookworm, bullseye
  • size: 176 kB; sloc: asm: 1,539; makefile: 2
file content: 163 lines, 5,385 bytes
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.

// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl

// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
	MOVO  v, t;       \
	PSLLL $n, t;      \
	PSRLL $(32-n), v; \
	PXOR  t, v

// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
	VPSLLD $n, v, t;      \
	VPSRLD $(32-n), v, v; \
	VPXOR  v, t, v
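
// For reference, a plain Go sketch of the per-lane rotation that
// ROTL_SSE and ROTL_AVX implement (rotl32 is a hypothetical helper,
// not part of this package; math/bits.RotateLeft32 is the stdlib
// equivalent):
//
//	func rotl32(x uint32, n uint) uint32 {
//		return (x << n) | (x >> (32 - n))
//	}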

// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(16, t, v3); \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(12, t, v1); \
	PADDL v1, v0;        \
	PXOR  v0, v3;        \
	ROTL_SSE(8, t, v3);  \
	PADDL v3, v2;        \
	PXOR  v2, v1;        \
	ROTL_SSE(7, t, v1)
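
// A plain Go sketch of the scalar ChaCha quarter-round that
// CHACHA_QROUND_SSE2 (and the SSSE3/AVX variants below) computes on
// 4/8 32 bit lanes in parallel (quarterRound is a hypothetical
// helper, not part of this package's API):
//
//	import "math/bits"
//
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}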

// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r16, v3;      \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(12, t, v1); \
	PADDL  v1, v0;       \
	PXOR   v0, v3;       \
	PSHUFB r8, v3;       \
	PADDL  v3, v2;       \
	PXOR   v2, v1;       \
	ROTL_SSE(7, t, v1)
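
// r16 and r8 are supplied by the caller. For a little-endian 32 bit
// lane, a left rotate by 16 corresponds to the byte index pattern
// 2,3,0,1 per lane and a left rotate by 8 to 3,0,1,2. Illustrative
// Go values for such shuffle masks (an assumed layout, not
// necessarily the constants this package defines):
//
//	var rol16Mask = [16]byte{2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}
//	var rol8Mask = [16]byte{3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14}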

// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for an 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r16, v3, v3; \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(12, t, v1); \
	VPADDD  v0, v1, v0;  \
	VPXOR   v3, v0, v3;  \
	VPSHUFB r8, v3, v3;  \
	VPADDD  v2, v3, v2;  \
	VPXOR   v1, v2, v1;  \
	ROTL_AVX(7, t, v1)

// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
	PSHUFL $0x39, v1, v1; \
	PSHUFL $0x4E, v2, v2; \
	PSHUFL $0x93, v3, v3

// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
	VPSHUFD $0x39, v1, v1; \
	VPSHUFD $0x4E, v2, v2; \
	VPSHUFD $0x93, v3, v3
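
// The PSHUFL/VPSHUFD immediates rotate the 32 bit lanes of each
// register: $0x39 maps [x0 x1 x2 x3] to [x1 x2 x3 x0], $0x4E to
// [x2 x3 x0 x1] and $0x93 to [x3 x0 x1 x2], moving the ChaCha state
// between column and diagonal form. A plain Go sketch of the same
// lane rotation (rotateLanes is a hypothetical helper, not part of
// this package):
//
//	func rotateLanes(row [4]uint32, n int) [4]uint32 {
//		var out [4]uint32
//		for i := range out {
//			out[i] = row[(i+n)%4]
//		}
//		return out
//	}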

// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
	MOVOU 0+off(src), t;  \
	PXOR  v0, t;          \
	MOVOU t, 0+off(dst);  \
	MOVOU 16+off(src), t; \
	PXOR  v1, t;          \
	MOVOU t, 16+off(dst); \
	MOVOU 32+off(src), t; \
	PXOR  v2, t;          \
	MOVOU t, 32+off(dst); \
	MOVOU 48+off(src), t; \
	PXOR  v3, t;          \
	MOVOU t, 48+off(dst)

// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
	VPXOR   0+off(src), v0, t;  \
	VMOVDQU t, 0+off(dst);      \
	VPXOR   16+off(src), v1, t; \
	VMOVDQU t, 16+off(dst);     \
	VPXOR   32+off(src), v2, t; \
	VMOVDQU t, 32+off(dst);     \
	VPXOR   48+off(src), v3, t; \
	VMOVDQU t, 48+off(dst)
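
// A plain Go sketch of what the XOR_* macros compute per block
// (xorKeyStreamRef is a hypothetical helper, not part of this
// package; dst, src and keystream are assumed to have equal length):
//
//	func xorKeyStreamRef(dst, src, keystream []byte) {
//		for i := range keystream {
//			dst[i] = src[i] ^ keystream[i]
//		}
//	}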

// XOR_AVX2 extracts 4x32 byte vectors from src at off,
// xors each vector with the corresponding 128 bit lanes
// of the AVX2 registers v0 - v3 (recombined with
// VPERM2I128: $32 selects the lower lanes of a register
// pair, $49 the upper lanes) and writes the result to
// dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst); \
	VMOVDQU    (64+off)(src), t0; \
	VPERM2I128 $49, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (64+off)(dst); \
	VMOVDQU    (96+off)(src), t0; \
	VPERM2I128 $49, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (96+off)(dst)

// XOR_UPPER_AVX2 extracts 2x32 byte vectors from src at
// off, xors them with the lower 128 bit lanes of the AVX2
// registers v0 - v3 (recombined with VPERM2I128) and
// writes the result to dst at off.
// The AVX2 registers t0 and t1 are used as temp. registers.
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
	VMOVDQU    (0+off)(src), t0;  \
	VPERM2I128 $32, v1, v0, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (0+off)(dst);  \
	VMOVDQU    (32+off)(src), t0; \
	VPERM2I128 $32, v3, v2, t1;   \
	VPXOR      t0, t1, t0;        \
	VMOVDQU    t0, (32+off)(dst)

// EXTRACT_LOWER recombines the upper 128 bit lanes of the
// AVX2 registers v0 - v3 with VPERM2I128 and writes the
// resulting 64 bytes to dst.
// The AVX2 register t0 is used as a temp. register.
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
	VPERM2I128 $49, v1, v0, t0; \
	VMOVDQU    t0, 0(dst);      \
	VPERM2I128 $49, v3, v2, t0; \
	VMOVDQU    t0, 32(dst)