1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
//go:build !purego
// +build !purego
#include "textflag.h"
// func Lookup(keyset []byte, key []byte) int
TEXT ·Lookup(SB), NOSPLIT, $0-56
MOVD keyset+0(FP), R0
MOVD keyset_len+8(FP), R1
MOVD key+24(FP), R2
MOVD key_len+32(FP), R3
MOVD key_cap+40(FP), R4
// None of the keys in the set are greater than 16 bytes, so if the input
// key is we can jump straight to not found.
CMP $16, R3
BHI notfound
// We'll be moving the keyset pointer (R0) forward as we compare keys, so
// make a copy of the starting point (R6). Also add the byte length (R1) to
// obtain a pointer to the end of the keyset (R5).
MOVD R0, R6
ADD R0, R1, R5
// Prepare a 64-bit mask of all ones.
MOVD $-1, R7
// Prepare a vector of all zeroes.
VMOV ZR, V1.B16
// Check that it's safe to load 16 bytes of input. If cap(input)<16, jump
// to a check that determines whether a tail load is necessary (to avoid a
// page fault).
CMP $16, R4
BLO safeload
load:
// Load the input key (V0) and pad with zero bytes (V1). To blend the two
// vectors, we load a mask for the particular key length and then use TBL
// to select bytes from either V0 or V1.
VLD1 (R2), [V0.B16]
MOVD $blend_masks<>(SB), R10
ADD R3<<4, R10, R10
VLD1 (R10), [V2.B16]
VTBL V2.B16, [V0.B16, V1.B16], V3.B16
loop:
// Loop through each 16 byte key in the keyset.
CMP R0, R5
BEQ notfound
// Load and compare the next key.
VLD1.P 16(R0), [V4.B16]
VCMEQ V3.B16, V4.B16, V5.B16
VMOV V5.D[0], R8
VMOV V5.D[1], R9
AND R8, R9, R9
// If the masks match, we found the key.
CMP R9, R7
BEQ found
JMP loop
found:
// If the key was found, take the position in the keyset and convert it
// to an index. The keyset pointer (R0) will be 1 key past the match, so
// subtract the starting pointer (R6), divide by 16 to convert from byte
// length to an index, and then subtract one.
SUB R6, R0, R0
ADD R0>>4, ZR, R0
SUB $1, R0, R0
MOVD R0, ret+48(FP)
RET
notfound:
// Return the number of keys in the keyset, which is the byte length (R1)
// divided by 16.
ADD R1>>4, ZR, R1
MOVD R1, ret+48(FP)
RET
safeload:
// Check if the input crosses a page boundary. If not, jump back.
AND $4095, R2, R12
CMP $4080, R12
BLS load
// If it does cross a page boundary, we must assume that loading 16 bytes
// will cause a fault. Instead, we load the 16 bytes up to and including the
// key and then shuffle the key forward in the register. We can shuffle and
// pad with zeroes at the same time to avoid having to also blend (as load
// does).
MOVD $16, R12
SUB R3, R12, R12
SUB R12, R2, R2
VLD1 (R2), [V0.B16]
MOVD $shuffle_masks<>(SB), R10
ADD R12, R10, R10
VLD1 (R10), [V2.B16]
VTBL V2.B16, [V0.B16, V1.B16], V3.B16
JMP loop
DATA blend_masks<>+0(SB)/8, $0x1010101010101010
DATA blend_masks<>+8(SB)/8, $0x1010101010101010
DATA blend_masks<>+16(SB)/8, $0x1010101010101000
DATA blend_masks<>+24(SB)/8, $0x1010101010101010
DATA blend_masks<>+32(SB)/8, $0x1010101010100100
DATA blend_masks<>+40(SB)/8, $0x1010101010101010
DATA blend_masks<>+48(SB)/8, $0x1010101010020100
DATA blend_masks<>+56(SB)/8, $0x1010101010101010
DATA blend_masks<>+64(SB)/8, $0x1010101003020100
DATA blend_masks<>+72(SB)/8, $0x1010101010101010
DATA blend_masks<>+80(SB)/8, $0x1010100403020100
DATA blend_masks<>+88(SB)/8, $0x1010101010101010
DATA blend_masks<>+96(SB)/8, $0x1010050403020100
DATA blend_masks<>+104(SB)/8, $0x1010101010101010
DATA blend_masks<>+112(SB)/8, $0x1006050403020100
DATA blend_masks<>+120(SB)/8, $0x1010101010101010
DATA blend_masks<>+128(SB)/8, $0x0706050403020100
DATA blend_masks<>+136(SB)/8, $0x1010101010101010
DATA blend_masks<>+144(SB)/8, $0x0706050403020100
DATA blend_masks<>+152(SB)/8, $0x1010101010101008
DATA blend_masks<>+160(SB)/8, $0x0706050403020100
DATA blend_masks<>+168(SB)/8, $0x1010101010100908
DATA blend_masks<>+176(SB)/8, $0x0706050403020100
DATA blend_masks<>+184(SB)/8, $0x10101010100A0908
DATA blend_masks<>+192(SB)/8, $0x0706050403020100
DATA blend_masks<>+200(SB)/8, $0x101010100B0A0908
DATA blend_masks<>+208(SB)/8, $0x0706050403020100
DATA blend_masks<>+216(SB)/8, $0x1010100C0B0A0908
DATA blend_masks<>+224(SB)/8, $0x0706050403020100
DATA blend_masks<>+232(SB)/8, $0x10100D0C0B0A0908
DATA blend_masks<>+240(SB)/8, $0x0706050403020100
DATA blend_masks<>+248(SB)/8, $0x100E0D0C0B0A0908
DATA blend_masks<>+256(SB)/8, $0x0706050403020100
DATA blend_masks<>+264(SB)/8, $0x0F0E0D0C0B0A0908
GLOBL blend_masks<>(SB), RODATA|NOPTR, $272
DATA shuffle_masks<>+0(SB)/8, $0x0706050403020100
DATA shuffle_masks<>+8(SB)/8, $0x0F0E0D0C0B0A0908
DATA shuffle_masks<>+16(SB)/8, $0x1010101010101010
DATA shuffle_masks<>+24(SB)/8, $0x1010101010101010
GLOBL shuffle_masks<>(SB), RODATA|NOPTR, $32
|