File: contains_amd64.s

package info (click to toggle)
golang-github-segmentio-asm 1.2.0%2Bgit20231107.1cfacc8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 932 kB
  • sloc: asm: 6,093; makefile: 32
file content (270 lines) | stat: -rw-r--r-- 4,629 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
// Code generated by command: go run contains_asm.go -pkg mem -out ../mem/contains_amd64.s -stubs ../mem/contains_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func ContainsByte(haystack []byte, needle byte) bool
// Requires: AVX, AVX2, SSE2, SSE4.1
TEXT ·ContainsByte(SB), NOSPLIT, $0-33
	MOVQ haystack_base+0(FP), AX
	MOVQ haystack_len+8(FP), CX
	XORQ DX, DX
	MOVB needle+24(FP), DL
	MOVQ DX, BX
	SHLQ $0x08, BX
	ORQ  BX, DX
	MOVQ DX, BX
	SHLQ $0x10, BX
	ORQ  BX, DX
	MOVQ DX, BX
	SHLQ $0x20, BX
	ORQ  BX, DX
	MOVQ $0x0101010101010101, BX
	MOVQ $0x8080808080808080, SI
	MOVB $0x00, ret+32(FP)
	JMP  start

found:
	MOVB $0x01, ret+32(FP)
	JMP  done

avx2_found:
	MOVB $0x01, ret+32(FP)
	JMP  avx2_done

start:
	CMPQ   CX, $0x10
	JBE    tail
	PXOR   X1, X1
	PINSRQ $0x00, DX, X0
	PINSRQ $0x01, DX, X0

tail:
	CMPQ         CX, $0x00
	JE           done
	CMPQ         CX, $0x01
	JE           handle1
	CMPQ         CX, $0x03
	JBE          handle2to3
	CMPQ         CX, $0x04
	JE           handle4
	CMPQ         CX, $0x08
	JB           handle5to7
	JE           handle8
	CMPQ         CX, $0x10
	JBE          handle9to16
	CMPQ         CX, $0x20
	JBE          handle17to32
	CMPQ         CX, $0x40
	JBE          handle33to64
	BTL          $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC          generic
	VZEROUPPER
	VPBROADCASTQ X0, Y0
	CMPQ         CX, $0x00000100
	JB           avx2_tail
	JMP          avx2

generic:
	MOVOU   (AX), X2
	MOVOU   16(AX), X3
	MOVOU   32(AX), X4
	MOVOU   48(AX), X5
	PCMPEQB X0, X2
	PCMPEQB X0, X3
	PCMPEQB X0, X4
	PCMPEQB X0, X5
	POR     X2, X3
	POR     X4, X5
	POR     X3, X5
	PTEST   X5, X1
	JCC     found
	ADDQ    $0x40, AX
	SUBQ    $0x40, CX
	CMPQ    CX, $0x40
	JBE     tail
	JMP     generic

done:
	RET

handle1:
	MOVB (AX), AL
	CMPB AL, DL
	JE   found
	RET

handle2to3:
	MOVW (AX), DI
	MOVW -2(AX)(CX*1), AX
	XORW DX, DI
	MOVW DI, CX
	XORW DX, AX
	MOVW AX, DX
	SUBW BX, CX
	NOTW DI
	ANDW DI, CX
	SUBW BX, DX
	NOTW AX
	ANDW AX, DX
	ORW  CX, DX
	ANDW SI, DX
	JNZ  found
	RET

handle4:
	MOVL (AX), AX
	XORL DX, AX
	MOVL AX, CX
	SUBL BX, CX
	NOTL AX
	ANDL AX, CX
	ANDL SI, CX
	JNZ  found
	RET

handle5to7:
	MOVL (AX), DI
	MOVL -4(AX)(CX*1), AX
	XORL DX, DI
	MOVL DI, CX
	XORL DX, AX
	MOVL AX, DX
	SUBL BX, CX
	NOTL DI
	ANDL DI, CX
	SUBL BX, DX
	NOTL AX
	ANDL AX, DX
	ORL  CX, DX
	ANDL SI, DX
	JNZ  found
	RET

handle8:
	MOVQ (AX), AX
	XORQ DX, AX
	MOVQ AX, CX
	SUBQ BX, CX
	NOTQ AX
	ANDQ AX, CX
	ANDQ SI, CX
	JNZ  found
	RET

handle9to16:
	MOVQ (AX), DI
	MOVQ -8(AX)(CX*1), AX
	XORQ DX, DI
	MOVQ DI, CX
	XORQ DX, AX
	MOVQ AX, DX
	SUBQ BX, CX
	NOTQ DI
	ANDQ DI, CX
	SUBQ BX, DX
	NOTQ AX
	ANDQ AX, DX
	ORQ  CX, DX
	ANDQ SI, DX
	JNZ  found
	RET

handle17to32:
	MOVOU   (AX), X2
	MOVOU   -16(AX)(CX*1), X3
	PCMPEQB X0, X2
	PCMPEQB X0, X3
	POR     X2, X3
	PTEST   X3, X1
	JCC     found
	RET

handle33to64:
	MOVOU   (AX), X2
	MOVOU   16(AX), X3
	MOVOU   -32(AX)(CX*1), X4
	MOVOU   -16(AX)(CX*1), X5
	PCMPEQB X0, X2
	PCMPEQB X0, X3
	PCMPEQB X0, X4
	PCMPEQB X0, X5
	POR     X2, X3
	POR     X4, X5
	POR     X3, X5
	PTEST   X5, X1
	JCC     found
	RET

	// AVX optimized version for medium to large size inputs.
avx2:
	VPCMPEQB (AX), Y0, Y2
	VPCMPEQB 32(AX), Y0, Y3
	VPCMPEQB 64(AX), Y0, Y4
	VPCMPEQB 96(AX), Y0, Y5
	VPCMPEQB 128(AX), Y0, Y6
	VPCMPEQB 160(AX), Y0, Y7
	VPCMPEQB 192(AX), Y0, Y8
	VPCMPEQB 224(AX), Y0, Y9
	VPOR     Y2, Y3, Y3
	VPOR     Y4, Y5, Y5
	VPOR     Y6, Y7, Y7
	VPOR     Y8, Y9, Y9
	VPOR     Y3, Y5, Y5
	VPOR     Y7, Y9, Y9
	VPOR     Y5, Y9, Y9
	VPTEST   Y9, Y1
	JCC      avx2_found
	ADDQ     $0x00000100, AX
	SUBQ     $0x00000100, CX
	JZ       avx2_done
	CMPQ     CX, $0x00000100
	JAE      avx2

avx2_tail:
	CMPQ     CX, $0x40
	JBE      avx2_tail_1to64
	CMPQ     CX, $0x80
	JBE      avx2_tail_65to128
	VPCMPEQB (AX), Y0, Y2
	VPCMPEQB 32(AX), Y0, Y3
	VPCMPEQB 64(AX), Y0, Y4
	VPCMPEQB 96(AX), Y0, Y5
	VPCMPEQB -128(AX)(CX*1), Y0, Y6
	VPCMPEQB -96(AX)(CX*1), Y0, Y7
	VPCMPEQB -64(AX)(CX*1), Y0, Y8
	VPCMPEQB -32(AX)(CX*1), Y0, Y0
	VPOR     Y2, Y3, Y3
	VPOR     Y4, Y5, Y5
	VPOR     Y6, Y7, Y7
	VPOR     Y8, Y0, Y0
	VPOR     Y3, Y5, Y5
	VPOR     Y7, Y0, Y0
	VPOR     Y5, Y0, Y0
	VPTEST   Y0, Y1
	JCC      avx2_found
	JMP      avx2_done

avx2_tail_65to128:
	VPCMPEQB (AX), Y0, Y2
	VPCMPEQB 32(AX), Y0, Y3
	VPCMPEQB -64(AX)(CX*1), Y0, Y4
	VPCMPEQB -32(AX)(CX*1), Y0, Y0
	VPOR     Y2, Y3, Y3
	VPOR     Y4, Y0, Y0
	VPOR     Y3, Y0, Y0
	VPTEST   Y0, Y1
	JCC      avx2_found
	JMP      avx2_done

avx2_tail_1to64:
	VPCMPEQB -64(AX)(CX*1), Y0, Y2
	VPCMPEQB -32(AX)(CX*1), Y0, Y0
	VPOR     Y2, Y0, Y0
	VPTEST   Y0, Y1
	JCC      avx2_found

avx2_done:
	VZEROUPPER
	RET