File: fp_amd64.h

package info (click to toggle)
golang-github-cloudflare-circl 1.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,064 kB
  • sloc: asm: 20,492; ansic: 1,292; makefile: 68
file content (351 lines) | stat: -rw-r--r-- 12,639 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
// This code was imported from https://github.com/armfazh/rfc7748_precomputed

// CHECK_BMI2ADX triggers bmi2adx if supported,
// otherwise it fallbacks to legacy code.
#define CHECK_BMI2ADX(label, legacy, bmi2adx) \
    CMPB ·hasBmi2Adx(SB), $0  \
    JE label                  \
    bmi2adx                   \
    RET                       \
    label:                    \
    legacy                    \
    RET

// cselect is a conditional move
// if b=1: it copies y into x;
// if b=0: x remains with the same value;
// if b<> 0,1: undefined.
// Uses: AX, DX, FLAGS
// Instr: x86_64, cmov
#define cselect(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ  0+y, DX; CMOVQNE DX, AX; MOVQ AX,  0+x; \
    MOVQ  8+x, AX; MOVQ  8+y, DX; CMOVQNE DX, AX; MOVQ AX,  8+x; \
    MOVQ 16+x, AX; MOVQ 16+y, DX; CMOVQNE DX, AX; MOVQ AX, 16+x; \
    MOVQ 24+x, AX; MOVQ 24+y, DX; CMOVQNE DX, AX; MOVQ AX, 24+x;

// cswap is a conditional swap
// if b=1: x,y <- y,x;
// if b=0: x,y remain with the same values;
// if b<> 0,1: undefined.
// Uses: AX, DX, R8, FLAGS
// Instr: x86_64, cmov
#define cswap(x,y,b) \
    TESTQ b, b \
    MOVQ  0+x, AX; MOVQ AX, R8; MOVQ  0+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  0+x; MOVQ DX,  0+y; \
    MOVQ  8+x, AX; MOVQ AX, R8; MOVQ  8+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX,  8+x; MOVQ DX,  8+y; \
    MOVQ 16+x, AX; MOVQ AX, R8; MOVQ 16+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 16+x; MOVQ DX, 16+y; \
    MOVQ 24+x, AX; MOVQ AX, R8; MOVQ 24+y, DX; CMOVQNE DX, AX; CMOVQNE R8, DX; MOVQ AX, 24+x; MOVQ DX, 24+y;

// additionLeg adds x and y and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define additionLeg(z,x,y) \
    MOVL $38, AX; \
    MOVL  $0, DX; \
    MOVQ  0+x,  R8;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCQ 24+y, R11; \
    CMOVQCS AX, DX;    \
    ADDQ DX,  R8; \
    ADCQ $0,  R9;  MOVQ  R9,  8+z; \
    ADCQ $0, R10;  MOVQ R10, 16+z; \
    ADCQ $0, R11;  MOVQ R11, 24+z; \
    MOVL $0,  DX; \
    CMOVQCS AX, DX; \
    ADDQ DX,  R8;  MOVQ  R8,  0+z;

// additionAdx adds x and y and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov, adx
#define additionAdx(z,x,y) \
    MOVL $38, AX; \
    XORL  DX, DX; \
    MOVQ  0+x,  R8;  ADCXQ  0+y,  R8; \
    MOVQ  8+x,  R9;  ADCXQ  8+y,  R9; \
    MOVQ 16+x, R10;  ADCXQ 16+y, R10; \
    MOVQ 24+x, R11;  ADCXQ 24+y, R11; \
    CMOVQCS AX, DX ; \
    XORL  AX,  AX; \
    ADCXQ DX,  R8; \
    ADCXQ AX,  R9;  MOVQ  R9,  8+z; \
    ADCXQ AX, R10;  MOVQ R10, 16+z; \
    ADCXQ AX, R11;  MOVQ R11, 24+z; \
    MOVL $38,  DX; \
    CMOVQCS DX, AX; \
    ADDQ  AX,  R8;  MOVQ  R8,  0+z;

// subtraction subtracts y from x and stores in z
// Uses: AX, DX, R8-R11, FLAGS
// Instr: x86_64, cmov
#define subtraction(z,x,y) \
    MOVL   $38,  AX; \
    MOVQ  0+x,  R8;  SUBQ  0+y,  R8; \
    MOVQ  8+x,  R9;  SBBQ  8+y,  R9; \
    MOVQ 16+x, R10;  SBBQ 16+y, R10; \
    MOVQ 24+x, R11;  SBBQ 24+y, R11; \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ  DX,  R8; \
    SBBQ  $0,  R9;  MOVQ  R9,  8+z; \
    SBBQ  $0, R10;  MOVQ R10, 16+z; \
    SBBQ  $0, R11;  MOVQ R11, 24+z; \
    MOVL  $0,  DX; \
    CMOVQCS AX, DX; \
    SUBQ  DX,  R8;  MOVQ  R8,  0+z;

// integerMulAdx multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerMulAdx(z,x,y) \
    MOVL    $0,R15; \
    MOVQ   0+y, DX;       XORL  AX,  AX; \
    MULXQ  0+x, AX,  R8;  MOVQ  AX, 0+z; \
    MULXQ  8+x, AX,  R9;  ADCXQ AX,  R8; \
    MULXQ 16+x, AX, R10;  ADCXQ AX,  R9; \
    MULXQ 24+x, AX, R11;  ADCXQ AX, R10; \
    MOVL $0, AX;;;;;;;;;  ADCXQ AX, R11; \
    MOVQ   8+y, DX;       XORL   AX,  AX; \
    MULXQ  0+x, AX, R12;  ADCXQ  R8,  AX;  MOVQ  AX,  8+z; \
    MULXQ  8+x, AX, R13;  ADCXQ  R9, R12;  ADOXQ AX, R12; \
    MULXQ 16+x, AX, R14;  ADCXQ R10, R13;  ADOXQ AX, R13; \
    MULXQ 24+x, AX, R15;  ADCXQ R11, R14;  ADOXQ AX, R14; \
    MOVL $0, AX;;;;;;;;;  ADCXQ  AX, R15;  ADOXQ AX, R15; \
    MOVQ  16+y, DX;       XORL   AX,  AX; \
    MULXQ  0+x, AX,  R8;  ADCXQ R12,  AX;  MOVQ  AX, 16+z; \
    MULXQ  8+x, AX,  R9;  ADCXQ R13,  R8;  ADOXQ AX,  R8; \
    MULXQ 16+x, AX, R10;  ADCXQ R14,  R9;  ADOXQ AX,  R9; \
    MULXQ 24+x, AX, R11;  ADCXQ R15, R10;  ADOXQ AX, R10; \
    MOVL $0, AX;;;;;;;;;  ADCXQ  AX, R11;  ADOXQ AX, R11; \
    MOVQ  24+y, DX;       XORL   AX,  AX; \
    MULXQ  0+x, AX, R12;  ADCXQ  R8,  AX;  MOVQ  AX, 24+z; \
    MULXQ  8+x, AX, R13;  ADCXQ  R9, R12;  ADOXQ AX, R12;  MOVQ R12, 32+z; \
    MULXQ 16+x, AX, R14;  ADCXQ R10, R13;  ADOXQ AX, R13;  MOVQ R13, 40+z; \
    MULXQ 24+x, AX, R15;  ADCXQ R11, R14;  ADOXQ AX, R14;  MOVQ R14, 48+z; \
    MOVL $0, AX;;;;;;;;;  ADCXQ  AX, R15;  ADOXQ AX, R15;  MOVQ R15, 56+z;

// integerMulLeg multiplies x and y and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerMulLeg(z,x,y) \
    MOVQ  0+y, R8; \
    MOVQ  0+x, AX; MULQ R8; MOVQ AX, 0+z; MOVQ DX, R15; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R13, R15; \
    ADCQ R14, R10;  MOVQ R10, 16+z; \
    ADCQ  AX, R11;  MOVQ R11, 24+z; \
    ADCQ  $0,  DX;  MOVQ  DX, 32+z; \
    MOVQ  8+y, R8; \
    MOVQ  0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15; MOVQ R15,  8+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 16+z,  R9;  MOVQ  R9,  R15; \
    ADCQ 24+z, R10;  MOVQ R10, 24+z; \
    ADCQ 32+z, R11;  MOVQ R11, 32+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 40+z; \
    MOVQ 16+y, R8; \
    MOVQ  0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15;  MOVQ R15, 16+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 24+z,  R9;  MOVQ  R9,  R15; \
    ADCQ 32+z, R10;  MOVQ R10, 32+z; \
    ADCQ 40+z, R11;  MOVQ R11, 40+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 48+z; \
    MOVQ 24+y, R8; \
    MOVQ  0+x, AX; MULQ R8; MOVQ AX, R12; MOVQ DX,  R9; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX, R13; MOVQ DX, R10; \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; \
    MOVQ 24+x, AX; MULQ R8; \
    ADDQ R12, R15; MOVQ R15, 24+z; \
    ADCQ R13,  R9; \
    ADCQ R14, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADCQ 32+z,  R9;  MOVQ  R9, 32+z; \
    ADCQ 40+z, R10;  MOVQ R10, 40+z; \
    ADCQ 48+z, R11;  MOVQ R11, 48+z; \
    ADCQ   $0,  DX;  MOVQ  DX, 56+z;

// integerSqrLeg squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64
#define integerSqrLeg(z,x) \
    MOVQ  0+x, R8; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX,  R9; MOVQ DX, R10; /* A[0]*A[1] */ \
    MOVQ 16+x, AX; MULQ R8; MOVQ AX, R14; MOVQ DX, R11; /* A[0]*A[2] */ \
    MOVQ 24+x, AX; MULQ R8; MOVQ AX, R15; MOVQ DX, R12; /* A[0]*A[3] */ \
    MOVQ 24+x, R8; \
    MOVQ  8+x, AX; MULQ R8; MOVQ AX,  CX; MOVQ DX, R13; /* A[3]*A[1] */ \
    MOVQ 16+x, AX; MULQ R8; /* A[3]*A[2] */ \
    \
    ADDQ R14, R10;\
    ADCQ R15, R11; MOVL $0, R15;\
    ADCQ  CX, R12;\
    ADCQ  AX, R13;\
    ADCQ  $0,  DX; MOVQ DX, R14;\
    MOVQ 8+x, AX; MULQ 16+x;\
    \
    ADDQ AX, R11;\
    ADCQ DX, R12;\
    ADCQ $0, R13;\
    ADCQ $0, R14;\
    ADCQ $0, R15;\
    \
    SHLQ $1, R14, R15; MOVQ R15, 56+z;\
    SHLQ $1, R13, R14; MOVQ R14, 48+z;\
    SHLQ $1, R12, R13; MOVQ R13, 40+z;\
    SHLQ $1, R11, R12; MOVQ R12, 32+z;\
    SHLQ $1, R10, R11; MOVQ R11, 24+z;\
    SHLQ $1,  R9, R10; MOVQ R10, 16+z;\
    SHLQ $1,  R9;      MOVQ  R9,  8+z;\
    \
    MOVQ  0+x,AX; MULQ AX; MOVQ AX, 0+z; MOVQ DX,  R9;\
    MOVQ  8+x,AX; MULQ AX; MOVQ AX, R10; MOVQ DX, R11;\
    MOVQ 16+x,AX; MULQ AX; MOVQ AX, R12; MOVQ DX, R13;\
    MOVQ 24+x,AX; MULQ AX; MOVQ AX, R14; MOVQ DX, R15;\
    \
    ADDQ  8+z,  R9; MOVQ  R9,  8+z;\
    ADCQ 16+z, R10; MOVQ R10, 16+z;\
    ADCQ 24+z, R11; MOVQ R11, 24+z;\
    ADCQ 32+z, R12; MOVQ R12, 32+z;\
    ADCQ 40+z, R13; MOVQ R13, 40+z;\
    ADCQ 48+z, R14; MOVQ R14, 48+z;\
    ADCQ 56+z, R15; MOVQ R15, 56+z;

// integerSqrAdx squares x and stores in z
// Uses: AX, CX, DX, R8-R15, FLAGS
// Instr: x86_64, bmi2, adx
#define integerSqrAdx(z,x) \
    MOVQ   0+x,  DX; /* A[0] */ \
    MULXQ  8+x,  R8, R14; /* A[1]*A[0] */  XORL  R15, R15; \
    MULXQ 16+x,  R9, R10; /* A[2]*A[0] */  ADCXQ R14,  R9; \
    MULXQ 24+x,  AX,  CX; /* A[3]*A[0] */  ADCXQ  AX, R10; \
    MOVQ  24+x,  DX; /* A[3] */ \
    MULXQ  8+x, R11, R12; /* A[1]*A[3] */  ADCXQ  CX, R11; \
    MULXQ 16+x,  AX, R13; /* A[2]*A[3] */  ADCXQ  AX, R12; \
    MOVQ   8+x,  DX; /* A[1] */            ADCXQ R15, R13; \
    MULXQ 16+x,  AX,  CX; /* A[2]*A[1] */  MOVL   $0, R14; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;  ADCXQ R15, R14; \
    XORL  R15, R15; \
    ADOXQ  AX, R10;  ADCXQ  R8,  R8; \
    ADOXQ  CX, R11;  ADCXQ  R9,  R9; \
    ADOXQ R15, R12;  ADCXQ R10, R10; \
    ADOXQ R15, R13;  ADCXQ R11, R11; \
    ADOXQ R15, R14;  ADCXQ R12, R12; \
    ;;;;;;;;;;;;;;;  ADCXQ R13, R13; \
    ;;;;;;;;;;;;;;;  ADCXQ R14, R14; \
    MOVQ  0+x, DX;  MULXQ DX, AX, CX; /* A[0]^2 */ \
    ;;;;;;;;;;;;;;;  MOVQ  AX,  0+z; \
    ADDQ CX,  R8;    MOVQ  R8,  8+z; \
    MOVQ  8+x, DX;  MULXQ DX, AX, CX; /* A[1]^2 */ \
    ADCQ AX,  R9;    MOVQ  R9, 16+z; \
    ADCQ CX, R10;    MOVQ R10, 24+z; \
    MOVQ 16+x, DX;  MULXQ DX, AX, CX; /* A[2]^2 */ \
    ADCQ AX, R11;    MOVQ R11, 32+z; \
    ADCQ CX, R12;    MOVQ R12, 40+z; \
    MOVQ 24+x, DX;  MULXQ DX, AX, CX; /* A[3]^2 */ \
    ADCQ AX, R13;    MOVQ R13, 48+z; \
    ADCQ CX, R14;    MOVQ R14, 56+z;

// reduceFromDouble finds z congruent to x modulo p such that 0<z<2^256
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64
#define reduceFromDoubleLeg(z,x) \
    /* 2*C = 38 = 2^256 */ \
    MOVL $38, AX; MULQ 32+x; MOVQ AX,  R8; MOVQ DX,  R9; /* C*C[4] */ \
    MOVL $38, AX; MULQ 40+x; MOVQ AX, R12; MOVQ DX, R10; /* C*C[5] */ \
    MOVL $38, AX; MULQ 48+x; MOVQ AX, R13; MOVQ DX, R11; /* C*C[6] */ \
    MOVL $38, AX; MULQ 56+x; /* C*C[7] */ \
    ADDQ R12,  R9; \
    ADCQ R13, R10; \
    ADCQ  AX, R11; \
    ADCQ  $0,  DX; \
    ADDQ  0+x,  R8; \
    ADCQ  8+x,  R9; \
    ADCQ 16+x, R10; \
    ADCQ 24+x, R11; \
    ADCQ    $0, DX; \
    MOVL $38, AX; \
    IMULQ AX, DX; /* C*C[4], CF=0, OF=0 */ \
    ADDQ DX,  R8; \
    ADCQ $0,  R9; MOVQ  R9,  8+z; \
    ADCQ $0, R10; MOVQ R10, 16+z; \
    ADCQ $0, R11; MOVQ R11, 24+z; \
    MOVL $0,  DX; \
    CMOVQCS AX, DX; \
    ADDQ DX,  R8; MOVQ  R8,  0+z;

// reduceFromDoubleAdx finds z congruent to x modulo p such that 0<z<2^256
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, bmi2, adx
#define reduceFromDoubleAdx(z,x) \
    MOVL    $38,  DX; /* 2*C = 38 = 2^256 */ \
    MULXQ 32+x,  R8, R10; /* C*C[4] */  XORL AX, AX;     ADOXQ  0+x,  R8; \
    MULXQ 40+x,  R9, R11; /* C*C[5] */  ADCXQ R10,  R9;  ADOXQ  8+x,  R9; \
    MULXQ 48+x, R10, R13; /* C*C[6] */  ADCXQ R11, R10;  ADOXQ 16+x, R10; \
    MULXQ 56+x, R11, R12; /* C*C[7] */  ADCXQ R13, R11;  ADOXQ 24+x, R11; \
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;  ADCXQ  AX, R12;  ADOXQ   AX, R12; \
    IMULQ  DX, R12; /* C*C[4], CF=0, OF=0 */ \
    ADCXQ R12, R8; \
    ADCXQ AX,  R9; MOVQ  R9,  8+z; \
    ADCXQ AX, R10; MOVQ R10, 16+z; \
    ADCXQ AX, R11; MOVQ R11, 24+z; \
    MOVL  $0, R12; \
    CMOVQCS DX, R12; \
    ADDQ R12,  R8; MOVQ  R8,  0+z;

// addSub calculates two operations: x,y = x+y,x-y
// Uses: AX, DX, R8-R15, FLAGS
#define addSub(x,y) \
    MOVL $38, AX; \
    XORL  DX, DX; \
    MOVQ  0+x,  R8;  MOVQ  R8, R12;  ADDQ  0+y,  R8; \
    MOVQ  8+x,  R9;  MOVQ  R9, R13;  ADCQ  8+y,  R9; \
    MOVQ 16+x, R10;  MOVQ R10, R14;  ADCQ 16+y, R10; \
    MOVQ 24+x, R11;  MOVQ R11, R15;  ADCQ 24+y, R11; \
    CMOVQCS AX, DX; \
    XORL AX,  AX; \
    ADDQ DX,  R8; \
    ADCQ $0,  R9; \
    ADCQ $0, R10; \
    ADCQ $0, R11; \
    MOVL $38, DX; \
    CMOVQCS DX, AX; \
    ADDQ  AX, R8; \
    MOVL $38, AX; \
    SUBQ  0+y, R12; \
    SBBQ  8+y, R13; \
    SBBQ 16+y, R14; \
    SBBQ 24+y, R15; \
    MOVL $0, DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    SBBQ $0, R13; \
    SBBQ $0, R14; \
    SBBQ $0, R15; \
    MOVL $0,  DX; \
    CMOVQCS AX, DX; \
    SUBQ DX, R12; \
    MOVQ  R8,  0+x; \
    MOVQ  R9,  8+x; \
    MOVQ R10, 16+x; \
    MOVQ R11, 24+x; \
    MOVQ R12,  0+y; \
    MOVQ R13,  8+y; \
    MOVQ R14, 16+y; \
    MOVQ R15, 24+y;