1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
|
//go:build amd64 && !purego
// +build amd64,!purego
#include "textflag.h"
// Depends on circl/math/fp25519 package
#include "../../math/fp25519/fp_amd64.h"
#include "curve_amd64.h"
// CTE_A24 is (A+2)/4 from Curve25519
#define CTE_A24 121666
#define Size 32
// multiplyA24Leg multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R13, FLAGS
// Instr: x86_64, cmov
#define multiplyA24Leg(z,x) \
MOVL $CTE_A24, AX; MULQ 0+x; MOVQ AX, R8; MOVQ DX, R9; \
MOVL $CTE_A24, AX; MULQ 8+x; MOVQ AX, R12; MOVQ DX, R10; \
MOVL $CTE_A24, AX; MULQ 16+x; MOVQ AX, R13; MOVQ DX, R11; \
MOVL $CTE_A24, AX; MULQ 24+x; \
ADDQ R12, R9; \
ADCQ R13, R10; \
ADCQ AX, R11; \
ADCQ $0, DX; \
MOVL $38, AX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
IMULQ AX, DX; \
ADDQ DX, R8; \
ADCQ $0, R9; MOVQ R9, 8+z; \
ADCQ $0, R10; MOVQ R10, 16+z; \
ADCQ $0, R11; MOVQ R11, 24+z; \
MOVQ $0, DX; \
CMOVQCS AX, DX; \
ADDQ DX, R8; MOVQ R8, 0+z;
// multiplyA24Adx multiplies x times CTE_A24 and stores in z
// Uses: AX, DX, R8-R12, FLAGS
// Instr: x86_64, cmov, bmi2
#define multiplyA24Adx(z,x) \
MOVQ $CTE_A24, DX; \
MULXQ 0+x, R8, R10; \
MULXQ 8+x, R9, R11; ADDQ R10, R9; \
MULXQ 16+x, R10, AX; ADCQ R11, R10; \
MULXQ 24+x, R11, R12; ADCQ AX, R11; \
;;;;;;;;;;;;;;;;;;;;; ADCQ $0, R12; \
MOVL $38, DX; /* 2*C = 38 = 2^256 MOD 2^255-19*/ \
IMULQ DX, R12; \
ADDQ R12, R8; \
ADCQ $0, R9; MOVQ R9, 8+z; \
ADCQ $0, R10; MOVQ R10, 16+z; \
ADCQ $0, R11; MOVQ R11, 24+z; \
MOVQ $0, R12; \
CMOVQCS DX, R12; \
ADDQ R12, R8; MOVQ R8, 0+z;
#define mulA24Legacy \
multiplyA24Leg(0(DI),0(SI))
#define mulA24Bmi2Adx \
multiplyA24Adx(0(DI),0(SI))
// func mulA24Amd64(z, x *fp255.Elt)
TEXT ·mulA24Amd64(SB),NOSPLIT,$0-16
MOVQ z+0(FP), DI
MOVQ x+8(FP), SI
CHECK_BMI2ADX(LMA24, mulA24Legacy, mulA24Bmi2Adx)
// func ladderStepAmd64(w *[5]fp255.Elt, b uint)
// ladderStepAmd64 calculates a point addition and doubling as follows:
// (x2,z2) = 2*(x2,z2) and (x3,z3) = (x2,z2)+(x3,z3) using as a difference (x1,-).
// work = (x1,x2,z2,x3,z3) are five fp255.Elt of 32 bytes.
// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·ladderStepAmd64(SB),NOSPLIT,$192-16
// Parameters
#define regWork DI
#define regMove SI
#define x1 0*Size(regWork)
#define x2 1*Size(regWork)
#define z2 2*Size(regWork)
#define x3 3*Size(regWork)
#define z3 4*Size(regWork)
// Local variables
#define t0 0*Size(SP)
#define t1 1*Size(SP)
#define b0 2*Size(SP)
#define b1 4*Size(SP)
MOVQ w+0(FP), regWork
MOVQ b+8(FP), regMove
CHECK_BMI2ADX(LLADSTEP, ladderStepLeg, ladderStepBmi2Adx)
#undef regWork
#undef regMove
#undef x1
#undef x2
#undef z2
#undef x3
#undef z3
#undef t0
#undef t1
#undef b0
#undef b1
// func diffAddAmd64(w *[5]fp255.Elt, b uint)
// diffAddAmd64 calculates a differential point addition using a precomputed point.
// (x1,z1) = (x1,z1)+(mu) using a difference point (x2,z2)
// w = (mu,x1,z1,x2,z2) are five fp.Elt, and
// stack = (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·diffAddAmd64(SB),NOSPLIT,$128-16
// Parameters
#define regWork DI
#define regSwap SI
#define ui 0*Size(regWork)
#define x1 1*Size(regWork)
#define z1 2*Size(regWork)
#define x2 3*Size(regWork)
#define z2 4*Size(regWork)
// Local variables
#define b0 0*Size(SP)
#define b1 2*Size(SP)
MOVQ w+0(FP), regWork
MOVQ b+8(FP), regSwap
cswap(x1,x2,regSwap)
cswap(z1,z2,regSwap)
CHECK_BMI2ADX(LDIFADD, difAddLeg, difAddBmi2Adx)
#undef regWork
#undef regSwap
#undef ui
#undef x1
#undef z1
#undef x2
#undef z2
#undef b0
#undef b1
// func doubleAmd64(x, z *fp255.Elt)
// doubleAmd64 calculates a point doubling (x1,z1) = 2*(x1,z1).
// stack = (t0,t1) are two fp.Elt of fp.Size bytes, and
// (b0,b1) are two-double precision fp.Elt of 2*fp.Size bytes.
TEXT ·doubleAmd64(SB),NOSPLIT,$192-16
// Parameters
#define x1 0(DI)
#define z1 0(SI)
// Local variables
#define t0 0*Size(SP)
#define t1 1*Size(SP)
#define b0 2*Size(SP)
#define b1 4*Size(SP)
MOVQ x+0(FP), DI
MOVQ z+8(FP), SI
CHECK_BMI2ADX(LDOUB,doubleLeg,doubleBmi2Adx)
#undef x1
#undef z1
#undef t0
#undef t1
#undef b0
#undef b1
|