//go:build amd64 && !purego
// +build amd64,!purego

#include "textflag.h"
// Multiplies a 512-bit value by a 64-bit value. Uses the MULQ instruction
// to multiply two 64-bit values, or MULX when BMI2 is available.
//
// Result: a = (b * c) mod 2^512
//
// Registers used: AX, DX, SI, DI, R10, R11
//
// func mul512Amd64(a, b *Fp, c uint64)
TEXT ·mul512Amd64(SB), NOSPLIT, $0-24
MOVQ a+0(FP), DI // result
MOVQ b+8(FP), SI // multiplicand
// Use the MULX-based implementation if the CPU supports BMI2
CMPB ·hasBMI2(SB), $1
JE mul512_mulx
MOVQ c+16(FP), R10 // 64 bit multiplier, used by MULQ
MOVQ R10, AX; MULQ 0(SI); MOVQ DX, R11; MOVQ AX, 0(DI) //x[0]
MOVQ R10, AX; MULQ 8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 8(DI) //x[1]
MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; MOVQ AX, 56(DI) //x[7]
RET
// Optimized for CPUs with BMI2
mul512_mulx:
MOVQ c+16(FP), DX // 64 bit multiplier, used by MULX
MULXQ 0(SI), AX, R10; MOVQ AX, 0(DI) // x[0]
MULXQ 8(SI), AX, R11; ADDQ R10, AX; MOVQ AX, 8(DI) // x[1]
MULXQ 16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // x[2]
MULXQ 24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // x[3]
MULXQ 32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // x[4]
MULXQ 40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // x[5]
MULXQ 48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // x[6]
MULXQ 56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]
RET
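// For reference, a rough portable Go sketch of what mul512Amd64 computes
// (illustrative only, not part of this package; it uses math/bits and
// assumes Fp is an 8-limb [8]uint64 in little-endian limb order, as the
// 64-byte accesses above suggest):
//
//	func mul512Sketch(a, b *[8]uint64, c uint64) {
//		var carry uint64
//		for i := 0; i < 8; i++ {
//			hi, lo := bits.Mul64(b[i], c)      // 64x64 -> 128-bit product
//			lo, cc := bits.Add64(lo, carry, 0) // add carry from the previous limb
//			a[i] = lo
//			carry = hi + cc // cannot overflow: hi <= 2^64-2
//		}
//		// The final carry is discarded: a = (b * c) mod 2^512.
//	}
//
// cswap512Amd64 conditionally swaps the 512-bit values at x and y in
// constant time: they are swapped if choice is 1 and left unchanged if
// choice is 0.
//
// func cswap512Amd64(x, y *Fp, choice uint8)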
TEXT ·cswap512Amd64(SB),NOSPLIT,$0-17
MOVQ x+0(FP), DI
MOVQ y+8(FP), SI
MOVBLZX choice+16(FP), AX // AL = 0 or 1
// Turn AX into a mask: after NEGQ, AX is all zeros if choice == 0
// and all ones if choice == 1.
NEGQ AX
// Move the mask into the low 64 bits of X15 (the high 64 bits of
// X15 are zeroed).
MOVQ AX, X15
// Broadcast the low doubleword across X15. Because AX is either all
// zeros or all ones, X15 ends up with either all bits clear or all
// bits set.
PSHUFD $0, X15, X15
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
MOVOU (idx*16)(DI), X0 \
MOVOU (idx*16)(SI), X1 \
\ // X2 = mask & (X0 ^ X1)
MOVO X1, X2 \
PXOR X0, X2 \
PAND X15, X2 \
\
PXOR X2, X0 \
PXOR X2, X1 \
\
MOVOU X0, (idx*16)(DI) \
MOVOU X1, (idx*16)(SI)
#endif
CSWAP_BLOCK(0)
CSWAP_BLOCK(1)
CSWAP_BLOCK(2)
CSWAP_BLOCK(3)
RET
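// For reference, a rough Go sketch of the same constant-time swap using a
// 64-bit mask in place of the XMM mask built above (illustrative only,
// again assuming Fp is [8]uint64):
//
//	func cswap512Sketch(x, y *[8]uint64, choice uint8) {
//		mask := -uint64(choice) // all zeros if choice == 0, all ones if choice == 1
//		for i := range x {
//			t := mask & (x[i] ^ y[i])
//			x[i] ^= t
//			y[i] ^= t
//		}
//	}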
// mulBmiAsm implements Montgomery multiplication interleaved with
// Montgomery reduction, using the MULX and ADCX/ADOX instructions.
// The implementation is specific to the 511-bit prime 'p'.
//
// func mulBmiAsm(res, x, y *fp)
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24
MOVQ x+8(FP), DI // multiplicand
MOVQ y+16(FP), SI // multiplier
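// Zero the nine-limb accumulator: R8-R14 and CX here, BP below once it
// has been saved.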
XORQ R8, R8
XORQ R9, R9
XORQ R10, R10
XORQ R11, R11
XORQ R12, R12
XORQ R13, R13
XORQ R14, R14
XORQ CX, CX
MOVQ BP, 0(SP) // push: BP is callee-save and is used below as an accumulator limb
XORQ BP, BP
// Uses BMI2 (MULX)
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
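// MULS_MULX_512 performs one limb iteration of the interleaved Montgomery
// multiplication. r0..r8 form a nine-limb accumulator; the register
// arguments are rotated by one position between invocations, so the
// (now zero) low limb is dropped and reused as the fresh top limb,
// implementing the shift right by 64 bits. The XORQ AX, AX before each
// ADCX/ADOX sequence clears CF and OF so the two carry chains start clean.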
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
\ // Reduction step
MOVQ ( 0)(SI), DX \
MULXQ ( 8*idx)(DI), DX, AX \
ADDQ r0, DX \
MOVQ ·pNegInv(SB), AX \
MULXQ AX, DX, AX \
\
XORQ AX, AX; \
MOVQ ·p+ 0(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
MOVQ ·p+ 8(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
MOVQ ·p+16(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
MOVQ ·p+24(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
MOVQ ·p+32(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
MOVQ ·p+40(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
MOVQ ·p+48(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
MOVQ ·p+56(SB), AX; MULXQ AX, AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
MOVQ $0, AX; ADOXQ AX, r8 \
\ // Multiplication step
MOVQ (8*idx)(DI), DX \
\
XORQ AX, AX \
MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0; ADCXQ BX, r1 \
MULXQ ( 8)(SI), AX, BX; ADOXQ AX, r1; ADCXQ BX, r2 \
MULXQ (16)(SI), AX, BX; ADOXQ AX, r2; ADCXQ BX, r3 \
MULXQ (24)(SI), AX, BX; ADOXQ AX, r3; ADCXQ BX, r4 \
MULXQ (32)(SI), AX, BX; ADOXQ AX, r4; ADCXQ BX, r5 \
MULXQ (40)(SI), AX, BX; ADOXQ AX, r5; ADCXQ BX, r6 \
MULXQ (48)(SI), AX, BX; ADOXQ AX, r6; ADCXQ BX, r7 \
MULXQ (56)(SI), AX, BX; ADOXQ AX, r7; ADCXQ BX, r8 \
MOVQ $0, AX; ADOXQ AX, r8
MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, CX, BP)
MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, CX, BP, R8)
MULS_MULX_512(2, R10, R11, R12, R13, R14, CX, BP, R8, R9)
MULS_MULX_512(3, R11, R12, R13, R14, CX, BP, R8, R9, R10)
MULS_MULX_512(4, R12, R13, R14, CX, BP, R8, R9, R10, R11)
MULS_MULX_512(5, R13, R14, CX, BP, R8, R9, R10, R11, R12)
MULS_MULX_512(6, R14, CX, BP, R8, R9, R10, R11, R12, R13)
MULS_MULX_512(7, CX, BP, R8, R9, R10, R11, R12, R13, R14)
#undef MULS_MULX_512
MOVQ res+0(FP), DI
MOVQ BP, ( 0)(DI)
MOVQ R8, ( 8)(DI)
MOVQ R9, (16)(DI)
MOVQ R10, (24)(DI)
MOVQ R11, (32)(DI)
MOVQ R12, (40)(DI)
MOVQ R13, (48)(DI)
MOVQ R14, (56)(DI)
MOVQ 0(SP), BP // pop: BP is callee-save
// The result stored at (DI) may still be >= p; the caller performs the
// final reduction.
RET
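// For reference, a rough Go sketch of the per-limb flow implemented by
// mulBmiAsm above (illustrative only; it uses math/bits, assumes Fp is
// [8]uint64 in little-endian limb order, and assumes p and pNegInv are the
// Go-side values referenced by the assembly, with pNegInv = -p^-1 mod 2^64):
//
//	func montMulSketch(res, x, y, p *[8]uint64, pNegInv uint64) {
//		// addMul adds v*m into the nine-limb accumulator t.
//		addMul := func(t *[9]uint64, v *[8]uint64, m uint64) {
//			var carry uint64
//			for j := 0; j < 8; j++ {
//				hi, lo := bits.Mul64(v[j], m)
//				s, c1 := bits.Add64(t[j], lo, 0)
//				s, c2 := bits.Add64(s, carry, 0)
//				t[j] = s
//				carry = hi + c1 + c2 // cannot overflow 64 bits
//			}
//			t[8] += carry
//		}
//		var t [9]uint64
//		for i := 0; i < 8; i++ {
//			// Quotient digit, as in the macro's reduction step.
//			m := (t[0] + x[i]*y[0]) * pNegInv
//			addMul(&t, p, m)    // reduction step:      t += m * p
//			addMul(&t, y, x[i]) // multiplication step: t += x[i] * y
//			// t[0] is now zero; divide by 2^64 by shifting the limbs down.
//			copy(t[:8], t[1:])
//			t[8] = 0
//		}
//		copy(res[:], t[:8])
//		// As noted above, the result may still need one final subtraction of p.
//	}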