File: keyset_arm64.s

package info (click to toggle)
golang-github-segmentio-asm 1.2.0%2Bgit20231107.1cfacc8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 932 kB
  • sloc: asm: 6,093; makefile: 32
file content (143 lines) | stat: -rw-r--r-- 4,716 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
//go:build !purego
// +build !purego

#include "textflag.h"

// func Lookup(keyset []byte, key []byte) int
TEXT ·Lookup(SB), NOSPLIT, $0-56
	MOVD keyset+0(FP), R0
	MOVD keyset_len+8(FP), R1
	MOVD key+24(FP), R2
	MOVD key_len+32(FP), R3
	MOVD key_cap+40(FP), R4

	// None of the keys in the set are greater than 16 bytes, so if the input
	// key is we can jump straight to not found.
	CMP $16, R3
	BHI notfound

	// We'll be moving the keyset pointer (R0) forward as we compare keys, so
	// make a copy of the starting point (R6). Also add the byte length (R1) to
	// obtain a pointer to the end of the keyset (R5).
	MOVD R0, R6
	ADD  R0, R1, R5

	// Prepare a 64-bit mask of all ones.
	MOVD $-1, R7

	// Prepare a vector of all zeroes.
	VMOV ZR, V1.B16

	// Check that it's safe to load 16 bytes of input. If cap(input)<16, jump
	// to a check that determines whether a tail load is necessary (to avoid a
	// page fault).
	CMP $16, R4
	BLO safeload

load:
	// Load the input key (V0) and pad with zero bytes (V1). To blend the two
	// vectors, we load a mask for the particular key length and then use TBL
	// to select bytes from either V0 or V1.
	VLD1 (R2), [V0.B16]
	MOVD $blend_masks<>(SB), R10
	ADD  R3<<4, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16

loop:
	// Loop through each 16 byte key in the keyset.
	CMP R0, R5
	BEQ notfound

	// Load and compare the next key.
	VLD1.P 16(R0), [V4.B16]
	VCMEQ  V3.B16, V4.B16, V5.B16
	VMOV   V5.D[0], R8
	VMOV   V5.D[1], R9
	AND    R8, R9, R9

	// If the masks match, we found the key.
	CMP R9, R7
	BEQ found
	JMP loop

found:
	// If the key was found, take the position in the keyset and convert it
	// to an index. The keyset pointer (R0) will be 1 key past the match, so
	// subtract the starting pointer (R6), divide by 16 to convert from byte
	// length to an index, and then subtract one.
	SUB  R6, R0, R0
	ADD  R0>>4, ZR, R0
	SUB  $1, R0, R0
	MOVD R0, ret+48(FP)
	RET

notfound:
	// Return the number of keys in the keyset, which is the byte length (R1)
	// divided by 16.
	ADD  R1>>4, ZR, R1
	MOVD R1, ret+48(FP)
	RET

safeload:
	// Check if the input crosses a page boundary. If not, jump back.
	AND $4095, R2, R12
	CMP $4080, R12
	BLS load

	// If it does cross a page boundary, we must assume that loading 16 bytes
	// will cause a fault. Instead, we load the 16 bytes up to and including the
	// key and then shuffle the key forward in the register. We can shuffle and
	// pad with zeroes at the same time to avoid having to also blend (as load
	// does).
	MOVD $16, R12
	SUB  R3, R12, R12
	SUB  R12, R2, R2
	VLD1 (R2), [V0.B16]
	MOVD $shuffle_masks<>(SB), R10
	ADD  R12, R10, R10
	VLD1 (R10), [V2.B16]
	VTBL V2.B16, [V0.B16, V1.B16], V3.B16
	JMP  loop

DATA blend_masks<>+0(SB)/8, $0x1010101010101010
DATA blend_masks<>+8(SB)/8, $0x1010101010101010
DATA blend_masks<>+16(SB)/8, $0x1010101010101000
DATA blend_masks<>+24(SB)/8, $0x1010101010101010
DATA blend_masks<>+32(SB)/8, $0x1010101010100100
DATA blend_masks<>+40(SB)/8, $0x1010101010101010
DATA blend_masks<>+48(SB)/8, $0x1010101010020100
DATA blend_masks<>+56(SB)/8, $0x1010101010101010
DATA blend_masks<>+64(SB)/8, $0x1010101003020100
DATA blend_masks<>+72(SB)/8, $0x1010101010101010
DATA blend_masks<>+80(SB)/8, $0x1010100403020100
DATA blend_masks<>+88(SB)/8, $0x1010101010101010
DATA blend_masks<>+96(SB)/8, $0x1010050403020100
DATA blend_masks<>+104(SB)/8, $0x1010101010101010
DATA blend_masks<>+112(SB)/8, $0x1006050403020100
DATA blend_masks<>+120(SB)/8, $0x1010101010101010
DATA blend_masks<>+128(SB)/8, $0x0706050403020100
DATA blend_masks<>+136(SB)/8, $0x1010101010101010
DATA blend_masks<>+144(SB)/8, $0x0706050403020100
DATA blend_masks<>+152(SB)/8, $0x1010101010101008
DATA blend_masks<>+160(SB)/8, $0x0706050403020100
DATA blend_masks<>+168(SB)/8, $0x1010101010100908
DATA blend_masks<>+176(SB)/8, $0x0706050403020100
DATA blend_masks<>+184(SB)/8, $0x10101010100A0908
DATA blend_masks<>+192(SB)/8, $0x0706050403020100
DATA blend_masks<>+200(SB)/8, $0x101010100B0A0908
DATA blend_masks<>+208(SB)/8, $0x0706050403020100
DATA blend_masks<>+216(SB)/8, $0x1010100C0B0A0908
DATA blend_masks<>+224(SB)/8, $0x0706050403020100
DATA blend_masks<>+232(SB)/8, $0x10100D0C0B0A0908
DATA blend_masks<>+240(SB)/8, $0x0706050403020100
DATA blend_masks<>+248(SB)/8, $0x100E0D0C0B0A0908
DATA blend_masks<>+256(SB)/8, $0x0706050403020100
DATA blend_masks<>+264(SB)/8, $0x0F0E0D0C0B0A0908
GLOBL blend_masks<>(SB), RODATA|NOPTR, $272

DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100
DATA shuffle_masks<>+8(SB)/8, $0x0F0E0D0C0B0A0908
DATA shuffle_masks<>+16(SB)/8, $0x1010101010101010
DATA shuffle_masks<>+24(SB)/8, $0x1010101010101010
GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32