File: swap64_asm.go

package info (click to toggle)
golang-github-segmentio-asm 1.2.0%2Bgit20231107.1cfacc8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 932 kB
  • sloc: asm: 6,093; makefile: 32
file content (116 lines) | stat: -rw-r--r-- 2,563 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
//go:build ignore
// +build ignore

package main

import (
	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
	. "github.com/segmentio/asm/build/internal/asm"
	. "github.com/segmentio/asm/build/internal/x86"

	"github.com/mmcloughlin/avo/reg"
	"github.com/segmentio/asm/cpu"
)

// unroll is the number of chunks processed per iteration of each unrolled
// loop below: unroll*32 bytes per AVX2 iteration, unroll*8 bytes per
// scalar iteration.
const unroll = 4

func init() {
	// Exclude the generated assembly when building with the "purego" tag,
	// which selects the pure-Go fallback implementation instead.
	ConstraintExpr("!purego")
}

func main() {
	TEXT("swap64", NOSPLIT, "func(b []byte)")
	Doc("swap64 performs an in-place byte swap on each qword of the input buffer.")

	// Load the slice base pointer and length, and compute the one-past-the-end
	// pointer that every loop below compares against. (Renamed from `len` to
	// avoid shadowing the predeclared builtin.)
	ptr := Load(Param("b").Base(), GP64())
	length := Load(Param("b").Len(), GP64())
	end := GP64()
	MOVQ(ptr, end)
	ADDQ(length, end)

	// CPUs without AVX2 skip straight to the scalar unrolled loop.
	JumpUnlessFeature("x86_loop", cpu.AVX2)

	// Shuffle mask that reverses the bytes of each qword within each
	// 128-bit lane (VPSHUFB operates per lane, so the 16-byte pattern is
	// repeated for both halves of the YMM register).
	shuffleMaskData := ConstBytes("shuffle_mask", []byte{
		7, 6, 5, 4, 3, 2, 1, 0,
		15, 14, 13, 12, 11, 10, 9, 8,
		7, 6, 5, 4, 3, 2, 1, 0,
		15, 14, 13, 12, 11, 10, 9, 8,
	})
	shuffleMask := YMM()
	VMOVDQU(shuffleMaskData, shuffleMask)

	// Loop while we have at least unroll*32 bytes remaining.
	Label("avx2_loop")
	next := GP64()
	MOVQ(ptr, next)
	ADDQ(Imm(unroll*32), next)
	CMPQ(next, end)
	// JA rather than JAE: when next == end the buffer holds exactly
	// unroll*32 more bytes, which the fast path can still process.
	JA(LabelRef("x86_loop"))

	// Load multiple chunks => byte swap => store. The loads, shuffles and
	// stores are grouped by kind to expose instruction-level parallelism.
	var vectors [unroll]reg.VecVirtual
	for i := 0; i < unroll; i++ {
		vectors[i] = YMM()
	}
	for i := 0; i < unroll; i++ {
		VMOVDQU(Mem{Base: ptr}.Offset(i*32), vectors[i])
	}
	for i := 0; i < unroll; i++ {
		VPSHUFB(shuffleMask, vectors[i], vectors[i])
	}
	for i := 0; i < unroll; i++ {
		VMOVDQU(vectors[i], Mem{Base: ptr}.Offset(i*32))
	}

	// Increment ptr and loop.
	MOVQ(next, ptr)
	JMP(LabelRef("avx2_loop"))

	// Loop while we have at least unroll*8 bytes remaining.
	Label("x86_loop")
	next = GP64()
	MOVQ(ptr, next)
	ADDQ(Imm(unroll*8), next)
	CMPQ(next, end)
	// Same reasoning as above: exact multiples of unroll*8 stay here.
	JA(LabelRef("slow_loop"))

	// Load qwords => byte swap => store.
	var chunks [unroll]reg.GPVirtual
	for i := 0; i < unroll; i++ {
		chunks[i] = GP64()
	}
	for i := 0; i < unroll; i++ {
		MOVQ(Mem{Base: ptr}.Offset(i*8), chunks[i])
	}
	for i := 0; i < unroll; i++ {
		BSWAPQ(chunks[i])
	}
	for i := 0; i < unroll; i++ {
		MOVQ(chunks[i], Mem{Base: ptr}.Offset(i*8))
	}

	// Increment ptr and loop.
	MOVQ(next, ptr)
	JMP(LabelRef("x86_loop"))

	// Tail loop: one qword at a time until ptr reaches the end. JAE is
	// required here since ptr == end means the buffer is fully processed.
	Label("slow_loop")
	CMPQ(ptr, end)
	JAE(LabelRef("done"))

	// Load a qword => byte swap => store.
	qword := GP64()
	MOVQ(Mem{Base: ptr}, qword)
	BSWAPQ(qword)
	MOVQ(qword, Mem{Base: ptr})

	// Increment ptr and loop.
	ADDQ(Imm(8), ptr)
	JMP(LabelRef("slow_loop"))

	Label("done")
	RET()
	Generate()
}