1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
|
//go:generate go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600
// AVX2 four-way parallelized Keccak-f[1600].
package main
import (
. "github.com/mmcloughlin/avo/build" // nolint:stylecheck,golint
. "github.com/mmcloughlin/avo/operand" // nolint:stylecheck,golint
)
// nolint:funlen
func main() {
ConstraintExpr("amd64")
// Must be called on 32 byte aligned memory.
TEXT("f1600x4AVX2", NOSPLIT, "func(state *uint64, rc *[24]uint64, turbo bool)")
Pragma("noescape")
statePtr := Load(Param("state"), GP64())
state := func(offset int) Op {
return Mem{Base: statePtr, Disp: 32 * offset}
}
rcPtr := Load(Param("rc"), GP64())
// We use the same approach as the normal Keccak-f[1600] implementation
// (in the internal/sha3 package): we group four rounds together into a
// super round. Thus we have six super rounds.
superRound := GP64()
MOVQ(U64(6), superRound) // count down.
turbo := Load(Param("turbo"), GP64())
TESTQ(turbo, turbo)
JZ(LabelRef("loop"))
MOVQ(U64(3), superRound) // Skip 3 * 4 = 12 rounds
ADDQ(Imm(8*12), rcPtr)
// XXX Because our AVX2 is significantly larger, it might better not
// to group four rounds together, but simply loop over the rounds
// themselves.
Label("loop")
for r := 0; r < 4; r++ {
// Compute parities: p[i] = a[i] ^ a[i + 5] ^ ... ^ a[i + 20].
p := []Op{YMM(), YMM(), YMM(), YMM(), YMM()}
for i := 0; i < 5; i++ {
VMOVDQA(state(i), p[i])
}
for j := 1; j < 5; j++ {
for i := 0; i < 5; i++ {
VPXOR(state(5*j+i), p[i], p[i])
}
}
// Rotate and xor parities: d[i] = rotate_left(p[i+1], 1) ^ p[i-1]
t := []Op{YMM(), YMM(), YMM(), YMM(), YMM()}
d := []Op{YMM(), YMM(), YMM(), YMM(), YMM()}
for i := 0; i < 5; i++ {
VPSLLQ(U8(1), p[(i+1)%5], t[i])
}
for i := 0; i < 5; i++ {
VPSRLQ(U8(63), p[(i+1)%5], d[i])
}
for i := 0; i < 5; i++ {
VPOR(t[i], d[i], d[i])
}
for i := 0; i < 5; i++ {
VPXOR(d[i], p[(i+4)%5], d[i])
}
// Rotation to use
rot := func(i, g int) int {
table := [][]int{
{0, 24, 18, 6, 12},
{7, 23, 2, 9, 22},
{1, 3, 17, 16, 20},
{13, 8, 4, 5, 15},
{19, 10, 21, 14, 11},
}
t := table[g][i]
return ((t + 1) * t / 2) % 64 // t'th triangle number
}
// Index into d to use
di := func(i, g int) int {
return (3*g + i) % 5
}
// Index into state to use
si := func(i, g, r int) int {
n := []int{6, 16, 11, 1}[r]
m := []int{10, 20, 15, 5}[r]
return (i*n + m*g) % 25
}
for g := 0; g < 5; g++ {
s := []Op{YMM(), YMM(), YMM(), YMM(), YMM()}
// Load the right five words from the state and XOR d into them.
for i := 0; i < 5; i++ {
VPXOR(state(si(di(i, g), g, r)), d[di(i, g)], s[i])
}
// Rotate each s[i] by the appropriate amount
for i := 0; i < 5; i++ {
if rot(i, g) != 0 {
VPSLLQ(U8(rot(i, g)), s[i], t[i])
}
}
for i := 0; i < 5; i++ {
if rot(i, g) != 0 {
VPSRLQ(U8(64-rot(i, g)), s[i], s[i])
}
}
for i := 0; i < 5; i++ {
if rot(i, g) != 0 {
VPOR(t[i], s[i], s[i])
}
}
// Compute the new words s[i] ^ (s[i+2] & ~s[i+1])
for i := 0; i < 5; i++ {
VPANDN(s[(i+2)%5], s[(i+1)%5], t[i])
}
for i := 0; i < 5; i++ {
VPXOR(s[i], t[i], t[i])
}
// Round constant
if g == 0 {
// Note that we move rcPtr by 8*4 bytes after each superround.
rc := YMM()
VPBROADCASTQ(Mem{Base: rcPtr, Disp: r * 8}, rc)
VPXOR(rc, t[0], t[0])
}
// Store back into state
for i := 0; i < 5; i++ {
VMOVDQA(t[i], state(si(i, g, r)))
}
}
}
ADDQ(Imm(8*4), rcPtr)
SUBQ(U32(1), superRound)
JNZ(LabelRef("loop"))
RET()
Generate()
}
|