//go:build ignore
// +build ignore

// This code is a go assembly implementation of:
//
// Muła, Wojciech, & Lemire, Daniel (Thu, 14 Jun 2018).
//   Faster Base64 Encoding and Decoding Using AVX2 Instructions.
//   [arXiv:1704.00605](https://arxiv.org/abs/1704.00605)
//
// ...with changes to support multiple encodings.
package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/segmentio/asm/build/internal/asm"
	. "github.com/segmentio/asm/build/internal/x86"
)

// init registers a build constraint with avo so that the generated
// assembly file carries a "!purego" tag, excluding it from builds that
// opt out of assembly implementations via the purego build tag.
func init() {
	ConstraintExpr("!purego")
}

// main emits the AVX2 base64 encoder via avo. The generated function has
// the signature
//
//	func encodeAVX2(dst, src []byte, lut *int8) (int, int)
//
// and encodes 24 input bytes into 32 output characters per iteration,
// translating 6-bit indices through the 16-byte offset table at lut
// (which is what allows multiple base64 alphabets). It returns the
// number of bytes written to dst and consumed from src; the caller
// handles whatever tail remains.
func main() {
	TEXT("encodeAVX2", NOSPLIT, "func(dst, src []byte, lut *int8) (int, int)")

	// dst and src are addressed as base+index*1; the index registers
	// double as the processed-byte counters that are returned at "done".
	dst := Mem{Base: Load(Param("dst").Base(), GP64()), Index: GP64(), Scale: 1}
	src := Mem{Base: Load(Param("src").Base(), GP64()), Index: GP64(), Scale: 1}
	lut := Mem{Base: Load(Param("lut"), GP64())}
	rem := Load(Param("src").Len(), GP64())

	// Scratch vector registers for the per-iteration pipeline.
	rsrc := YMM() // raw source bytes
	rdst := YMM() // final ASCII output
	msrc := YMM() // masked view of the shuffled source
	shl4 := YMM()
	shl8 := YMM()
	blnd := YMM()
	mult := YMM()
	shfl := YMM() // one 6-bit index per byte
	subs := YMM()
	cmps := YMM()
	xlat := YMM()
	xtab := YMM() // lut duplicated into both 128-bit lanes
	// Constants 51 and 25 classify each 6-bit index into a row of the
	// offset table, per the paper's translation scheme (see below).
	xsub := VecBroadcast(U8(51), YMM())
	xcmp := VecBroadcast(U8(25), YMM())

	XORQ(dst.Index, dst.Index)
	XORQ(src.Index, src.Index)

	Comment("Load the 16-byte LUT into both lanes of the register")
	// Immediate 1<<6|1<<2 = 0b01_00_01_00 selects qwords {0,1,0,1}, so
	// the low 16 bytes of lut appear in each lane (VPSHUFB is per-lane).
	VPERMQ(Imm(1<<6|1<<2), lut, xtab)

	Comment("Load the first block using a mask to avoid potential fault")
	// The first dword of the mask is zero, so the load at src-4 touches
	// only src[0:28]: it cannot fault on memory before the slice, yet
	// the bytes land 4 positions high in the register — exactly where
	// the shuffle below expects them.
	VMOVDQU(ConstLoadMask32("b64_enc_load",
		0, 1, 1, 1,
		1, 1, 1, 1,
	), rsrc)
	VPMASKMOVD(src.Offset(-4), rsrc, rsrc)

	Label("loop")

	// Expand each 12-byte group to 16 bytes so every 16-bit word holds
	// the pair of source bytes straddling one 6-bit field. The low-lane
	// indices are skewed by +4 to match the offset load layout above.
	VPSHUFB(ConstBytes("b64_enc_shuf", []byte{
		5, 4, 6, 5, 8, 7, 9, 8, 11, 10, 12, 11, 14, 13, 15, 14,
		1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
	}), rsrc, rsrc)

	// Extract two of the four 6-bit fields per 32-bit group, then shift
	// each into the low bits of its byte; blend immediate 170
	// (0b10101010) interleaves alternating words from the two shifts.
	VPAND(ConstArray16("b64_enc_mask1",
		0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f,
		0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f, 0x03f0, 0x003f,
	), rsrc, msrc)
	VPSLLW(Imm(8), msrc, shl8)
	VPSLLW(Imm(4), msrc, shl4)
	VPBLENDW(Imm(170), shl8, shl4, blnd)

	// Extract the other two fields and shift them right by multiplying
	// and keeping the high 16 bits (VPMULHUW computes (a*b)>>16).
	VPAND(ConstArray16("b64_enc_mask2",
		0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0,
		0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0, 0xfc00, 0x0fc0,
	), rsrc, msrc)
	VPMULHUW(ConstArray16("b64_enc_mult",
		0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400,
		0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400, 0x0040, 0x0400,
	), msrc, mult)

	// Combine: each byte of shfl is now a 6-bit index in 0..63.
	VPOR(mult, blnd, shfl)

	// Translate indices to ASCII: saturating-subtract 51 collapses
	// indices <= 51 to 0; comparing > 25 yields 0xFF (-1), which the
	// VPSUBB turns into +1. The result picks a row of the 16-entry
	// offset table; VPSHUFB fetches the offset and VPADDB applies it.
	VPSUBUSB(xsub, shfl, subs)
	VPCMPGTB(xcmp, shfl, cmps)
	VPSUBB(cmps, subs, subs)
	VPSHUFB(subs, xtab, xlat)
	VPADDB(shfl, xlat, rdst)
	VMOVDQU(rdst, dst)

	// 24 input bytes produced 32 output bytes.
	ADDQ(Imm(32), dst.Index)
	ADDQ(Imm(24), src.Index)
	SUBQ(Imm(24), rem)

	// Keep looping only while at least 32 bytes remain, so the full
	// unmasked 32-byte load below cannot read past the end of src.
	CMPQ(rem, Imm(32))
	JB(LabelRef("done"))

	// Past the first block the -4 offset stays inside the buffer, so a
	// plain load suffices while preserving the shuffle's byte layout.
	VMOVDQU(src.Offset(-4), rsrc)
	JMP(LabelRef("loop"))

	Label("done")
	// Return (bytes written to dst, bytes consumed from src).
	Store(dst.Index, ReturnIndex(0))
	Store(src.Index, ReturnIndex(1))
	VZEROUPPER()
	RET()

	Generate()
}