1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
// +build amd64
#include "textflag.h"
// Depends on circl/math/fp448 package
#include "../../math/fp448/fp_amd64.h"
#include "curve_amd64.h"
// CTE_A24 is (A+2)/4 from Curve448
#define CTE_A24 39082
#define Size 56
// multiplyA24Leg multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R15, FLAGS
// Instr: x86_64, cmov, adx
#define multiplyA24Leg(z,x) \
MOVQ $CTE_A24, R15; \
MOVQ 0+x, AX; MULQ R15; MOVQ AX, R8; ;;;;;;;;;;;; MOVQ DX, R9; \
MOVQ 8+x, AX; MULQ R15; ADDQ AX, R9; ADCQ $0, DX; MOVQ DX, R10; \
MOVQ 16+x, AX; MULQ R15; ADDQ AX, R10; ADCQ $0, DX; MOVQ DX, R11; \
MOVQ 24+x, AX; MULQ R15; ADDQ AX, R11; ADCQ $0, DX; MOVQ DX, R12; \
MOVQ 32+x, AX; MULQ R15; ADDQ AX, R12; ADCQ $0, DX; MOVQ DX, R13; \
MOVQ 40+x, AX; MULQ R15; ADDQ AX, R13; ADCQ $0, DX; MOVQ DX, R14; \
MOVQ 48+x, AX; MULQ R15; ADDQ AX, R14; ADCQ $0, DX; \
MOVQ DX, AX; \
SHLQ $32, AX; \
ADDQ DX, R8; MOVQ $0, DX; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ AX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
ADCQ $0, DX; \
MOVQ DX, AX; \
SHLQ $32, AX; \
ADDQ DX, R8; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ AX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
MOVQ R8, 0+z; \
MOVQ R9, 8+z; \
MOVQ R10, 16+z; \
MOVQ R11, 24+z; \
MOVQ R12, 32+z; \
MOVQ R13, 40+z; \
MOVQ R14, 48+z;
// multiplyA24Adx multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R14, FLAGS
// Instr: x86_64, bmi2
#define multiplyA24Adx(z,x) \
MOVQ $CTE_A24, DX; \
MULXQ 0+x, R8, R9; \
MULXQ 8+x, AX, R10; ADDQ AX, R9; \
MULXQ 16+x, AX, R11; ADCQ AX, R10; \
MULXQ 24+x, AX, R12; ADCQ AX, R11; \
MULXQ 32+x, AX, R13; ADCQ AX, R12; \
MULXQ 40+x, AX, R14; ADCQ AX, R13; \
MULXQ 48+x, AX, DX; ADCQ AX, R14; \
;;;;;;;;;;;;;;;;;;;; ADCQ $0, DX; \
MOVQ DX, AX; \
SHLQ $32, AX; \
ADDQ DX, R8; MOVQ $0, DX; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ AX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
ADCQ $0, DX; \
MOVQ DX, AX; \
SHLQ $32, AX; \
ADDQ DX, R8; \
ADCQ $0, R9; \
ADCQ $0, R10; \
ADCQ AX, R11; \
ADCQ $0, R12; \
ADCQ $0, R13; \
ADCQ $0, R14; \
MOVQ R8, 0+z; \
MOVQ R9, 8+z; \
MOVQ R10, 16+z; \
MOVQ R11, 24+z; \
MOVQ R12, 32+z; \
MOVQ R13, 40+z; \
MOVQ R14, 48+z;
#define mulA24Legacy \
multiplyA24Leg(0(DI),0(SI))
#define mulA24Bmi2Adx \
multiplyA24Adx(0(DI),0(SI))
// func mulA24Amd64(z, x *fp448.Elt)
TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
// func ladderStepAmd64(w *[5]fp448.Elt, b uint)
// ladderStepAmd64 calculates a point addition and doubling as follows:
// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
// w = {x1,x2,z2,x3,z4} are five fp255.Elt of 56 bytes.
// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·ladderStepAmd64(SB),NOSPLIT,$336-16
// Parameters
#define regWork DI
#define regMove SI
#define x1 0*Size(regWork)
#define x2 1*Size(regWork)
#define z2 2*Size(regWork)
#define x3 3*Size(regWork)
#define z3 4*Size(regWork)
// Local variables
#define t0 0*Size(SP)
#define t1 1*Size(SP)
#define b0 2*Size(SP)
#define b1 4*Size(SP)
MOVQ w+0(FP), regWork
MOVQ b+8(FP), regMove
CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
#undef regWork
#undef regMove
#undef x1
#undef x2
#undef z2
#undef x3
#undef z3
#undef t0
#undef t1
#undef b0
#undef b1
// func diffAddAmd64(work *[5]fp.Elt, swap uint)
// diffAddAmd64 calculates a differential point addition using a precomputed point.
// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
// work = {mu,x1,z1,x2,z2} are five fp448.Elt of 56 bytes, and
// stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
// This is Equation 7 at https://eprint.iacr.org/2017/264.
TEXT ·diffAddAmd64(SB),NOSPLIT,$224-16
// Parameters
#define regWork DI
#define regSwap SI
#define ui 0*Size(regWork)
#define x1 1*Size(regWork)
#define z1 2*Size(regWork)
#define x2 3*Size(regWork)
#define z2 4*Size(regWork)
// Local variables
#define b0 0*Size(SP)
#define b1 2*Size(SP)
MOVQ w+0(FP), regWork
MOVQ b+8(FP), regSwap
cswap(x1,x2,regSwap)
cswap(z1,z2,regSwap)
CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
#undef regWork
#undef regSwap
#undef ui
#undef x1
#undef z1
#undef x2
#undef z2
#undef b0
#undef b1
// func doubleAmd64(x, z *fp448.Elt)
// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·doubleAmd64(SB),NOSPLIT,$336-16
// Parameters
#define x1 0(DI)
#define z1 0(SI)
// Local variables
#define t0 0*Size(SP)
#define t1 1*Size(SP)
#define b0 2*Size(SP)
#define b1 4*Size(SP)
MOVQ x+0(FP), DI
MOVQ z+8(FP), SI
CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
#undef x1
#undef z1
#undef t0
#undef t1
#undef b0
#undef b1
|