File: keyset_amd64.s

package info (click to toggle)
golang-github-segmentio-asm 1.2.0%2Bgit20231107.1cfacc8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 932 kB
  • sloc: asm: 6,093; makefile: 32
file content (107 lines) | stat: -rw-r--r-- 2,120 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// Code generated by command: go run keyset_asm.go -pkg keyset -out ../keyset/keyset_amd64.s -stubs ../keyset/keyset_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func Lookup(keyset []byte, key []byte) int
// Requires: AVX
//
// Lookup scans keyset as a sequence of 16-byte entries and returns the index
// of the first entry that is byte-for-byte equal to key padded with zeros to
// 16 bytes. When no entry matches, or when len(key) > 16, it returns the
// number of complete 16-byte entries in keyset (len(keyset) / 16).
// Presumably the keyset entries are themselves zero-padded to 16 bytes by
// whoever builds the keyset; verify against the Go side of the package.
//
// Register roles:
//   AX  pointer to the current keyset entry
//   CX  number of 16-byte entries (len(keyset) >> 4)
//   DX  key pointer, then scratch mask pointer, then the running entry index
//   BX  len(key), then the entry count rounded down to a multiple of 4
//   SI  cap(key), then scratch in safe_load
//   X0  the zero-padded key
//   X1  all ones (VPTEST operand); briefly the shuffle control in safe_load
//
// NOTE(review): this file is generated ("DO NOT EDIT"); any change made here
// should also be applied to the generator (keyset_asm.go).
TEXT ·Lookup(SB), NOSPLIT, $0-56
	MOVQ keyset_base+0(FP), AX
	MOVQ keyset_len+8(FP), CX
	SHRQ $0x04, CX                 // CX = number of whole 16-byte entries
	MOVQ key_base+24(FP), DX
	MOVQ key_len+32(FP), BX
	MOVQ key_cap+40(FP), SI
	CMPQ BX, $0x10
	JA   not_found                 // keys longer than 16 bytes never match
	CMPQ SI, $0x10
	JB   safe_load                 // fewer than 16 bytes of capacity: a direct load may overrun

load:
	// Load 16 bytes starting at the key pointer. Bytes past len(key) are
	// cleared in prepare below.
	VMOVUPS (DX), X0

prepare:
	VPXOR     X2, X2, X2           // X2 = 0
	VPCMPEQB  X1, X1, X1           // X1 = all ones
	// DX = &blend_masks[16 - len(key)]: the 16 bytes loaded from there are
	// 0xff for byte indexes < len(key) and 0x00 for the rest.
	LEAQ      blend_masks<>+16(SB), DX
	SUBQ      BX, DX
	VMOVUPS   (DX), X3
	// Keep the first len(key) bytes of X0 and take zero (X2) elsewhere.
	VPBLENDVB X3, X0, X2, X0
	XORQ      DX, DX               // DX = entry index, starting at 0
	// BX = CX rounded down to a multiple of 4: the number of entries that
	// the 4-way unrolled loop handles.
	MOVQ      CX, BX
	SHRQ      $0x02, BX
	SHLQ      $0x02, BX

bigloop:
	// Compare four consecutive entries per iteration. VPCMPEQB yields 0xff in
	// every byte lane that matches; VPTEST against all-ones X1 sets CF only
	// when all 16 lanes are 0xff, i.e. when the whole entry equals the key.
	CMPQ     DX, BX
	JE       loop
	VPCMPEQB (AX), X0, X8
	VPTEST   X1, X8
	JCS      done                  // match at index DX
	VPCMPEQB 16(AX), X0, X9
	VPTEST   X1, X9
	JCS      found1                // match at index DX+1
	VPCMPEQB 32(AX), X0, X10
	VPTEST   X1, X10
	JCS      found2                // match at index DX+2
	VPCMPEQB 48(AX), X0, X11
	VPTEST   X1, X11
	JCS      found3                // match at index DX+3
	ADDQ     $0x04, DX
	ADDQ     $0x40, AX
	JMP      bigloop

loop:
	// Tail: handle the remaining (at most 3) entries one at a time. Reaching
	// DX == CX means no entry matched, and DX already holds the entry count.
	CMPQ     DX, CX
	JE       done
	VPCMPEQB (AX), X0, X2
	VPTEST   X1, X2
	JCS      done
	INCQ     DX
	ADDQ     $0x10, AX
	JMP      loop

	// found3/found2/found1 fall through into each other so that DX is
	// advanced by 3, 2 or 1 before being returned.
found3:
	INCQ DX

found2:
	INCQ DX

found1:
	INCQ DX

done:
	MOVQ DX, ret+48(FP)
	RET

not_found:
	MOVQ CX, ret+48(FP)            // return the entry count as the "no match" value
	RET

safe_load:
	// cap(key) < 16. If the key pointer's offset within a 4 KiB page
	// (low 12 bits) is at most 0xff0, a 16-byte load from it stays inside
	// that page, so the plain load is used.
	MOVQ    DX, SI
	ANDQ    $0x00000fff, SI
	CMPQ    SI, $0x00000ff0
	JBE     load
	// Otherwise load the 16 bytes that END at key_base+len(key)
	// (SI = len(key) - 16), which places the key in the top len(key) bytes
	// of X0.
	MOVQ    $0xfffffffffffffff0, SI
	ADDQ    BX, SI
	VMOVUPS (DX)(SI*1), X0
	// Rotate the key bytes down to the start of X0 using the shuffle control
	// at &shuffle_masks[16 - len(key)]. X1 is only a temporary here; prepare
	// resets it to all ones.
	LEAQ    shuffle_masks<>+16(SB), DX
	SUBQ    BX, DX
	VMOVUPS (DX), X1
	VPSHUFB X1, X0, X0
	JMP     prepare

// blend_masks is a 32-byte read-only table: 16 bytes of 0xff followed by
// 16 bytes of 0x00. Lookup loads 16 bytes from offset 16-len(key), which
// yields a VPBLENDVB mask that is 0xff for the first len(key) bytes and 0x00
// for the rest. The layout must not change without updating that arithmetic.
DATA blend_masks<>+0(SB)/8, $0xffffffffffffffff
DATA blend_masks<>+8(SB)/8, $0xffffffffffffffff
DATA blend_masks<>+16(SB)/8, $0x0000000000000000
DATA blend_masks<>+24(SB)/8, $0x0000000000000000
GLOBL blend_masks<>(SB), RODATA|NOPTR, $32

// shuffle_masks is a 32-byte read-only table holding the byte indexes
// 0x00..0x0f twice in a row (each 8-byte word is stored little-endian).
// Lookup's safe_load path loads 16 bytes from offset 16-len(key) and uses
// them as a VPSHUFB control, which rotates a register so that the byte at
// index 16-len(key) moves to index 0.
DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100
DATA shuffle_masks<>+8(SB)/8, $0x0f0e0d0c0b0a0908
DATA shuffle_masks<>+16(SB)/8, $0x0706050403020100
DATA shuffle_masks<>+24(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32