File: fp511_amd64.s

package info (click to toggle)
golang-github-cloudflare-circl 1.6.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,064 kB
  • sloc: asm: 20,492; ansic: 1,292; makefile: 68
file content (168 lines) | stat: -rw-r--r-- 5,991 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"

// Multiplies 512-bit value by 64-bit value. Uses MULQ instruction to
// multiply 2 64-bit values.
//
// Result: x = (y * z) mod 2^512
//
// Registers used: AX, CX, DX, SI, DI, R8
//
// func mul512Amd64(a, b *Fp, c uint64)
TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
    MOVQ    a+0(FP), DI    // result
    MOVQ    b+8(FP), SI    // multiplicand

    // Check whether to use optimized implementation
    CMPB    ·hasBMI2(SB), $1
    JE      mul512_mulx

    MOVQ c+16(FP), R10  // 64 bit multiplier, used by MULQ
    MOVQ R10, AX; MULQ  0(SI);                            MOVQ DX, R11; MOVQ AX,  0(DI) //x[0]
    MOVQ R10, AX; MULQ  8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX,  8(DI) //x[1]
    MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
    MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
    MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
    MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
    MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
    MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX;                            MOVQ AX, 56(DI) //x[7]
    RET

// Optimized for CPUs with BMI2
mul512_mulx:
    MOVQ     c+16(FP), DX                                  // 64 bit multiplier, used by MULX
    MULXQ    0(SI), AX, R10; MOVQ AX, 0(DI)                // x[0]
    MULXQ    8(SI), AX, R11; ADDQ R10, AX; MOVQ AX,  8(DI) // x[1]
    MULXQ   16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // x[2]
    MULXQ   24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // x[3]
    MULXQ   32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // x[4]
    MULXQ   40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // x[5]
    MULXQ   48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // x[6]
    MULXQ   56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]
    RET

TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17
    MOVQ    x+0(FP), DI
    MOVQ    y+8(FP), SI
    MOVBLZX choice+16(FP), AX       // AL = 0 or 1

    // Make AX, so that either all bits are set or non
    // AX = 0 or 1
    NEGQ    AX

    // Fill xmm15. After this step first half of XMM15 is
    // just zeros and second half is whatever in AX
    MOVQ    AX, X15

    // Copy lower double word everywhere else. So that
    // XMM15=AL|AL|AL|AL. As AX has either all bits set
    // or non result will be that XMM15 has also either
    // all bits set or non of them.
    PSHUFD $0, X15, X15

#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx)       \
    MOVOU   (idx*16)(DI), X0 \
    MOVOU   (idx*16)(SI), X1 \
    \ // X2 = mask & (X0 ^ X1)
    MOVO     X1, X2 \
    PXOR     X0, X2 \
    PAND    X15, X2 \
    \
    PXOR     X2, X0 \
    PXOR     X2, X1 \
    \
    MOVOU    X0, (idx*16)(DI) \
    MOVOU    X1, (idx*16)(SI)
#endif

    CSWAP_BLOCK(0)
    CSWAP_BLOCK(1)
    CSWAP_BLOCK(2)
    CSWAP_BLOCK(3)

    RET

// mulAsm implements montgomery multiplication interleaved with
// montgomery reduction. It uses MULX and ADCX/ADOX instructions.
// Implementation specific to 511-bit prime 'p'
//
// func mulBmiAsm(res, x, y *fp)
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24

    MOVQ x+8(FP), DI // multiplicand
    MOVQ y+16(FP), SI // multiplier

    XORQ  R8,  R8
    XORQ  R9,  R9
    XORQ R10, R10
    XORQ R11, R11
    XORQ R12, R12
    XORQ R13, R13
    XORQ R14, R14
    XORQ  CX,  CX

    MOVQ BP, 0(SP) // push: BP is Callee-save.
    XORQ BP, BP

// Uses BMI2 (MULX)
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
    \ // Reduction step
    MOVQ  ( 0)(SI), DX      \
    MULXQ ( 8*idx)(DI), DX, AX  \
    ADDQ  r0, DX            \
    MOVQ ·pNegInv(SB), AX \
    MULXQ AX, DX, AX  \
    \
    XORQ  AX, AX; \
    MOVQ ·p+ 0(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r0; ADCXQ BX, r1 \
    MOVQ ·p+ 8(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r1; ADCXQ BX, r2 \
    MOVQ ·p+16(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r2; ADCXQ BX, r3 \
    MOVQ ·p+24(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r3; ADCXQ BX, r4 \
    MOVQ ·p+32(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r4; ADCXQ BX, r5 \
    MOVQ ·p+40(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r5; ADCXQ BX, r6 \
    MOVQ ·p+48(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r6; ADCXQ BX, r7 \
    MOVQ ·p+56(SB), AX; MULXQ AX, AX, BX;  ADOXQ AX, r7; ADCXQ BX, r8 \
    MOVQ  $0, AX; ;;;;;;;;;;;;;;;;;;;;;;;  ADOXQ AX, r8; \
    \ // Multiplication step
    MOVQ (8*idx)(DI), DX \
    \
    XORQ  AX, AX \
    MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
    MULXQ ( 8)(SI), AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
    MULXQ (16)(SI), AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
    MULXQ (24)(SI), AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
    MULXQ (32)(SI), AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
    MULXQ (40)(SI), AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
    MULXQ (48)(SI), AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
    MULXQ (56)(SI), AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
    MOVQ  $0, AX          ; ADOXQ AX, r8;

    MULS_MULX_512(0,  R8,  R9, R10, R11, R12, R13, R14,  CX,  BP)
    MULS_MULX_512(1,  R9, R10, R11, R12, R13, R14,  CX,  BP,  R8)
    MULS_MULX_512(2, R10, R11, R12, R13, R14,  CX,  BP,  R8,  R9)
    MULS_MULX_512(3, R11, R12, R13, R14,  CX,  BP,  R8,  R9, R10)
    MULS_MULX_512(4, R12, R13, R14,  CX,  BP,  R8,  R9, R10, R11)
    MULS_MULX_512(5, R13, R14,  CX,  BP,  R8,  R9, R10, R11, R12)
    MULS_MULX_512(6, R14,  CX,  BP,  R8,  R9, R10, R11, R12, R13)
    MULS_MULX_512(7,  CX,  BP,  R8,  R9, R10, R11, R12, R13, R14)
#undef MULS_MULX_512

    MOVQ res+0(FP), DI
    MOVQ  BP, ( 0)(DI)
    MOVQ  R8, ( 8)(DI)
    MOVQ  R9, (16)(DI)
    MOVQ R10, (24)(DI)
    MOVQ R11, (32)(DI)
    MOVQ R12, (40)(DI)
    MOVQ R13, (48)(DI)
    MOVQ R14, (56)(DI)
    MOVQ 0(SP), BP // pop: BP is Callee-save.

    // NOW DI needs to be reduced if > p
    RET