File: src.go

package info (click to toggle)
golang-github-cloudflare-circl 1.3.7-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 15,352 kB
  • sloc: asm: 20,491; ansic: 1,292; makefile: 68
file content (155 lines) | stat: -rw-r--r-- 3,692 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
//go:generate go run src.go -out ../../f1600x4_amd64.s -stubs ../../f1600x4stubs_amd64.go -pkg keccakf1600

// AVX2 four-way parallelized Keccak-f[1600].

package main

import (
	. "github.com/mmcloughlin/avo/build"   // nolint:stylecheck,golint
	. "github.com/mmcloughlin/avo/operand" // nolint:stylecheck,golint
)

// main builds the avo program for f1600x4AVX2 and writes it out via
// Generate.  The routine works on a state of 25 slots of 32 bytes each
// (one YMM register per slot, processed as packed 64-bit words) and runs
// the rounds in "super rounds" of four rounds apiece: six super rounds
// normally, or only the final three when the turbo argument is non-zero.
//
// nolint:funlen
func main() {
	ConstraintExpr("amd64")

	// The emitted code uses aligned vector moves (VMOVDQA) on the state,
	// so it must be called on 32 byte aligned memory.
	TEXT("f1600x4AVX2", NOSPLIT, "func(state *uint64, rc *[24]uint64, turbo bool)")

	Pragma("noescape")

	stateBase := Load(Param("state"), GP64())
	rcBase := Load(Param("rc"), GP64())

	// lane addresses the 32 byte slot with the given index in the state.
	lane := func(idx int) Op {
		return Mem{Base: stateBase, Disp: 32 * idx}
	}

	// ymm5 hands out five fresh virtual YMM registers, allocated in order.
	ymm5 := func() []Op {
		regs := make([]Op, 5)
		for k := range regs {
			regs[k] = YMM()
		}
		return regs
	}

	// rotation gives the left-rotate amount for word i of group g: the
	// triangle number of the table entry, reduced modulo 64.
	steps := [5][5]int{
		{0, 24, 18, 6, 12},
		{7, 23, 2, 9, 22},
		{1, 3, 17, 16, 20},
		{13, 8, 4, 5, 15},
		{19, 10, 21, 14, 11},
	}
	rotation := func(i, g int) int {
		n := steps[g][i]
		return (n * (n + 1) / 2) % 64
	}

	// thetaIdx selects which d value is mixed into word i of group g.
	thetaIdx := func(i, g int) int {
		return (i + 3*g) % 5
	}

	// laneIdx selects the state slot for word i of group g during round r
	// of a super round; the strides differ per round.
	wordStride := [4]int{6, 16, 11, 1}
	groupStride := [4]int{10, 20, 15, 5}
	laneIdx := func(i, g, r int) int {
		return (wordStride[r]*i + groupStride[r]*g) % 25
	}

	// As in the regular Keccak-f[1600] implementation (internal/sha3
	// package), four rounds are fused into one super round, which leaves
	// six super rounds.  The counter counts down to zero.
	remaining := GP64()
	MOVQ(U64(6), remaining)

	turbo := Load(Param("turbo"), GP64())
	TESTQ(turbo, turbo)
	JZ(LabelRef("loop"))

	// Turbo: skip 3 * 4 = 12 rounds and their 8 byte round constants.
	MOVQ(U64(3), remaining)
	ADDQ(Imm(8*12), rcBase)

	// XXX Because the AVX2 code is significantly larger, it might be
	//     better not to fuse four rounds, but to loop over single rounds.

	Label("loop")

	for round := 0; round < 4; round++ {
		// Column parities: parity[i] = a[i] ^ a[i+5] ^ ... ^ a[i+20].
		parity := ymm5()
		for i, reg := range parity {
			VMOVDQA(lane(i), reg)
		}
		for row := 1; row < 5; row++ {
			for i, reg := range parity {
				VPXOR(lane(5*row+i), reg, reg)
			}
		}

		// theta[i] = rotate_left(parity[i+1], 1) ^ parity[i-1], with the
		// rotate assembled from a left shift, a right shift and an OR.
		tmp := ymm5()
		theta := ymm5()
		for i := range tmp {
			VPSLLQ(U8(1), parity[(i+1)%5], tmp[i])
		}
		for i := range theta {
			VPSRLQ(U8(63), parity[(i+1)%5], theta[i])
		}
		for i := range theta {
			VPOR(tmp[i], theta[i], theta[i])
		}
		for i := range theta {
			VPXOR(theta[i], parity[(i+4)%5], theta[i])
		}

		for g := 0; g < 5; g++ {
			words := ymm5()

			// Fetch this group's five state words, XORing theta in.
			for i := range words {
				j := thetaIdx(i, g)
				VPXOR(lane(laneIdx(j, g, round)), theta[j], words[i])
			}

			// Rotate every word left by its amount; a zero amount needs
			// no instructions at all.
			var amount [5]int
			for i := range amount {
				amount[i] = rotation(i, g)
			}
			for i, n := range amount {
				if n == 0 {
					continue
				}
				VPSLLQ(U8(n), words[i], tmp[i])
			}
			for i, n := range amount {
				if n == 0 {
					continue
				}
				VPSRLQ(U8(64-n), words[i], words[i])
			}
			for i, n := range amount {
				if n == 0 {
					continue
				}
				VPOR(tmp[i], words[i], words[i])
			}

			// New word: words[i] ^ (words[i+2] & ~words[i+1]).
			for i := range words {
				VPANDN(words[(i+2)%5], words[(i+1)%5], tmp[i])
			}
			for i := range words {
				VPXOR(words[i], tmp[i], tmp[i])
			}

			// The round constant goes into the first word of group 0 only.
			// rcBase itself advances by 8*4 bytes after each super round,
			// so the displacement only covers the round within it.
			if g == 0 {
				rc := YMM()
				VPBROADCASTQ(Mem{Base: rcBase, Disp: round * 8}, rc)
				VPXOR(rc, tmp[0], tmp[0])
			}

			// Write the results back into the state.
			for i := range words {
				VMOVDQA(tmp[i], lane(laneIdx(i, g, round)))
			}
		}
	}

	ADDQ(Imm(8*4), rcBase)
	SUBQ(U32(1), remaining)
	JNZ(LabelRef("loop"))

	RET()

	Generate()
}