// Code generated by command: go run equal_fold_asm.go -pkg ascii -out ../ascii/equal_fold_amd64.s -stubs ../ascii/equal_fold_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func EqualFoldString(a string, b string) bool
// Requires: AVX, AVX2, SSE4.1
TEXT ·EqualFoldString(SB), NOSPLIT, $0-33
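// Load a's base/length into CX/DX and b's base into BX; AX is the running
// byte offset into both strings. Strings of different lengths are never
// equal, so the length comparison below falls straight through to done.
// Inputs shorter than 16 bytes take the scalar table-lookup path; otherwise
// a feature bit (bit 8) in github.com/segmentio/asm/cpu.X86 gates the AVX2
// path (see the Requires note above).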
MOVQ a_base+0(FP), CX
MOVQ a_len+8(FP), DX
MOVQ b_base+16(FP), BX
CMPQ DX, b_len+24(FP)
JNE done
XORQ AX, AX
CMPQ DX, $0x10
JB init_x86
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCS init_avx
init_x86:
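// Scalar path: R9 points at the package's 256-byte lowerCase table, which
// maps ASCII upper-case letters to lower case and leaves other bytes as-is.
// SI accumulates the OR of all per-byte differences after lowercasing, so it
// stays zero only while the strings match case-insensitively.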
LEAQ github·com∕segmentio∕asm∕ascii·lowerCase+0(SB), R9
XORL SI, SI
cmp8:
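// Unrolled loop: compare 8 byte pairs per iteration. Each byte of a and b is
// lowercased through the table, the results are XORed, and the difference is
// ORed into SI. The JNE below tests the flags of the last OR, which reflect
// the accumulated SI, so a single branch checks the whole 8-byte group.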
CMPQ DX, $0x08
JB cmp7
MOVBLZX (CX)(AX*1), DI
MOVBLZX (BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 1(CX)(AX*1), DI
MOVBLZX 1(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 2(CX)(AX*1), DI
MOVBLZX 2(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 3(CX)(AX*1), DI
MOVBLZX 3(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 4(CX)(AX*1), DI
MOVBLZX 4(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 5(CX)(AX*1), DI
MOVBLZX 5(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 6(CX)(AX*1), DI
MOVBLZX 6(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
MOVBLZX 7(CX)(AX*1), DI
MOVBLZX 7(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
JNE done
ADDQ $0x08, AX
SUBQ $0x08, DX
JMP cmp8
cmp7:
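// Tail of the scalar path: handle the remaining 0-7 bytes. Block cmpN runs
// when at least N bytes remain and folds the byte pair at offset AX+N-1 into
// SI; the flags left by the last OR are what SETEQ reads at done.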
CMPQ DX, $0x07
JB cmp6
MOVBLZX 6(CX)(AX*1), DI
MOVBLZX 6(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp6:
CMPQ DX, $0x06
JB cmp5
MOVBLZX 5(CX)(AX*1), DI
MOVBLZX 5(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp5:
CMPQ DX, $0x05
JB cmp4
MOVBLZX 4(CX)(AX*1), DI
MOVBLZX 4(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp4:
CMPQ DX, $0x04
JB cmp3
MOVBLZX 3(CX)(AX*1), DI
MOVBLZX 3(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp3:
CMPQ DX, $0x03
JB cmp2
MOVBLZX 2(CX)(AX*1), DI
MOVBLZX 2(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp2:
CMPQ DX, $0x02
JB cmp1
MOVBLZX 1(CX)(AX*1), DI
MOVBLZX 1(BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
cmp1:
CMPQ DX, $0x01
JB success
MOVBLZX (CX)(AX*1), DI
MOVBLZX (BX)(AX*1), R8
MOVB (R9)(DI*1), DI
XORB (R9)(R8*1), DI
ORB DI, SI
done:
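// Convert the zero flag of the most recent check (length compare, accumulated
// scalar differences, or vector mismatch mask) into the returned bool.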
SETEQ ret+32(FP)
RET
success:
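// The scalar path consumed the whole string without recording a difference.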
MOVB $0x01, ret+32(FP)
RET
init_avx:
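// Broadcast the per-lane constants: 0x20 is the ASCII case bit, and
// 0x1f/0x9a implement the signed range check that detects ASCII letters
// ((x|0x20)+0x1f compared against 0x9a, signed); 0x01 isolates the low bit of
// that mask so a word-wise shift left by 5 turns it into 0x20.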
MOVB $0x20, SI
PINSRB $0x00, SI, X12
VPBROADCASTB X12, Y12
MOVB $0x1f, SI
PINSRB $0x00, SI, X13
VPBROADCASTB X13, Y13
MOVB $0x9a, SI
PINSRB $0x00, SI, X14
VPBROADCASTB X14, Y14
MOVB $0x01, SI
PINSRB $0x00, SI, X15
VPBROADCASTB X15, Y15
cmp128:
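// Main AVX2 loop: compare 128 bytes (4 x 32) per iteration. For each 32-byte
// lane: XOR the two inputs, mark bytes whose XOR is exactly 0x20 (they differ
// only in the case bit), mark bytes of a that are ASCII letters via the
// signed range check, and build the allowed difference (0x20 for letters,
// 0x00 otherwise). A byte matches when its XOR equals that allowed
// difference. VPMOVMSKB plus the XOR with 0xffffffff goes non-zero, and
// branches to done, as soon as any byte mismatches.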
CMPQ DX, $0x80
JB cmp64
VMOVDQU (CX)(AX*1), Y0
VMOVDQU 32(CX)(AX*1), Y1
VMOVDQU 64(CX)(AX*1), Y2
VMOVDQU 96(CX)(AX*1), Y3
VMOVDQU (BX)(AX*1), Y4
VMOVDQU 32(BX)(AX*1), Y5
VMOVDQU 64(BX)(AX*1), Y6
VMOVDQU 96(BX)(AX*1), Y7
VXORPD Y0, Y4, Y4
VPCMPEQB Y12, Y4, Y8
VORPD Y12, Y0, Y0
VPADDB Y13, Y0, Y0
VPCMPGTB Y0, Y14, Y0
VPAND Y8, Y0, Y0
VPAND Y15, Y0, Y0
VPSLLW $0x05, Y0, Y0
VPCMPEQB Y4, Y0, Y0
VXORPD Y1, Y5, Y5
VPCMPEQB Y12, Y5, Y9
VORPD Y12, Y1, Y1
VPADDB Y13, Y1, Y1
VPCMPGTB Y1, Y14, Y1
VPAND Y9, Y1, Y1
VPAND Y15, Y1, Y1
VPSLLW $0x05, Y1, Y1
VPCMPEQB Y5, Y1, Y1
VXORPD Y2, Y6, Y6
VPCMPEQB Y12, Y6, Y10
VORPD Y12, Y2, Y2
VPADDB Y13, Y2, Y2
VPCMPGTB Y2, Y14, Y2
VPAND Y10, Y2, Y2
VPAND Y15, Y2, Y2
VPSLLW $0x05, Y2, Y2
VPCMPEQB Y6, Y2, Y2
VXORPD Y3, Y7, Y7
VPCMPEQB Y12, Y7, Y11
VORPD Y12, Y3, Y3
VPADDB Y13, Y3, Y3
VPCMPGTB Y3, Y14, Y3
VPAND Y11, Y3, Y3
VPAND Y15, Y3, Y3
VPSLLW $0x05, Y3, Y3
VPCMPEQB Y7, Y3, Y3
VPAND Y1, Y0, Y0
VPAND Y3, Y2, Y2
VPAND Y2, Y0, Y0
ADDQ $0x80, AX
SUBQ $0x80, DX
VPMOVMSKB Y0, SI
XORL $0xffffffff, SI
JNE done
JMP cmp128
cmp64:
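// Same per-lane check on a 64-byte block (2 x 32 bytes).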
CMPQ DX, $0x40
JB cmp32
VMOVDQU (CX)(AX*1), Y0
VMOVDQU 32(CX)(AX*1), Y1
VMOVDQU (BX)(AX*1), Y2
VMOVDQU 32(BX)(AX*1), Y3
VXORPD Y0, Y2, Y2
VPCMPEQB Y12, Y2, Y4
VORPD Y12, Y0, Y0
VPADDB Y13, Y0, Y0
VPCMPGTB Y0, Y14, Y0
VPAND Y4, Y0, Y0
VPAND Y15, Y0, Y0
VPSLLW $0x05, Y0, Y0
VPCMPEQB Y2, Y0, Y0
VXORPD Y1, Y3, Y3
VPCMPEQB Y12, Y3, Y5
VORPD Y12, Y1, Y1
VPADDB Y13, Y1, Y1
VPCMPGTB Y1, Y14, Y1
VPAND Y5, Y1, Y1
VPAND Y15, Y1, Y1
VPSLLW $0x05, Y1, Y1
VPCMPEQB Y3, Y1, Y1
VPAND Y1, Y0, Y0
ADDQ $0x40, AX
SUBQ $0x40, DX
VPMOVMSKB Y0, SI
XORL $0xffffffff, SI
JNE done
cmp32:
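// Same per-lane check on a single 32-byte block.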
CMPQ DX, $0x20
JB cmp16
VMOVDQU (CX)(AX*1), Y0
VMOVDQU (BX)(AX*1), Y1
VXORPD Y0, Y1, Y1
VPCMPEQB Y12, Y1, Y2
VORPD Y12, Y0, Y0
VPADDB Y13, Y0, Y0
VPCMPGTB Y0, Y14, Y0
VPAND Y2, Y0, Y0
VPAND Y15, Y0, Y0
VPSLLW $0x05, Y0, Y0
VPCMPEQB Y1, Y0, Y0
ADDQ $0x20, AX
SUBQ $0x20, DX
VPMOVMSKB Y0, SI
XORL $0xffffffff, SI
JNE done
cmp16:
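// Same check on one 16-byte block using XMM registers, taken only when more
// than 16 bytes remain; remainders of 16 or fewer go straight to cmp_tail
// (note the JLE).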
CMPQ DX, $0x10
JLE cmp_tail
VMOVDQU (CX)(AX*1), X0
VMOVDQU (BX)(AX*1), X1
VXORPD X0, X1, X1
VPCMPEQB X12, X1, X2
VORPD X12, X0, X0
VPADDB X13, X0, X0
VPCMPGTB X0, X14, X0
VPAND X2, X0, X0
VPAND X15, X0, X0
VPSLLW $0x05, X0, X0
VPCMPEQB X1, X0, X0
ADDQ $0x10, AX
SUBQ $0x10, DX
VPMOVMSKB X0, SI
XORL $0x0000ffff, SI
JNE done
cmp_tail:
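// Handle the remaining 0-16 bytes by backing AX up so the last 16 bytes of
// both strings are re-read. The overlap with already-verified bytes is
// harmless, and the load stays in bounds because this path is only taken for
// strings of at least 16 bytes. The XOR of the mask leaves the flags that
// done's SETEQ converts into the result.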
SUBQ $0x10, DX
ADDQ DX, AX
VMOVDQU (CX)(AX*1), X0
VMOVDQU (BX)(AX*1), X1
VXORPD X0, X1, X1
VPCMPEQB X12, X1, X2
VORPD X12, X0, X0
VPADDB X13, X0, X0
VPCMPGTB X0, X14, X0
VPAND X2, X0, X0
VPAND X15, X0, X0
VPSLLW $0x05, X0, X0
VPCMPEQB X1, X0, X0
VPMOVMSKB X0, AX
XORL $0x0000ffff, AX
JMP done