
|
// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// func sumUint64(x []uint64, y []uint64)
// Requires: AVX, AVX2, CMOV
//
// sumUint64 adds y[i] into x[i] in place for every i < min(len(x), len(y)).
// Additions wrap modulo 2^64 (ADDQ / VPADDQ).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination and first addend)
//   BX = base address of y (second addend)
//   SI = number of elements to process, min(len(x), len(y))
//   AX = scratch: index past the next vector block / scalar temporary
TEXT ·sumUint64(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI // SI = min(len(x), len(y))

	// Bit 8 of cpu·X86 gates the YMM path (presumably the AVX2 feature flag,
	// see the cpu package). When it is clear, jump straight to the scalar loop
	// so that no VEX-encoded instruction is ever executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 4 YMM registers = 128 bytes = 16 uint64 lanes.
	// JA (not JAE) keeps a final block that fits exactly on the vector path.
	MOVQ    CX, AX
	ADDQ    $0x10, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*8), Y0
	VMOVDQU (BX)(CX*8), Y1
	VMOVDQU 32(DX)(CX*8), Y2
	VMOVDQU 32(BX)(CX*8), Y3
	VMOVDQU 64(DX)(CX*8), Y4
	VMOVDQU 64(BX)(CX*8), Y5
	VMOVDQU 96(DX)(CX*8), Y6
	VMOVDQU 96(BX)(CX*8), Y7
	VPADDQ  Y0, Y1, Y0
	VPADDQ  Y2, Y3, Y2
	VPADDQ  Y4, Y5, Y4
	VPADDQ  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*8)
	VMOVDQU Y2, 32(DX)(CX*8)
	VMOVDQU Y4, 64(DX)(CX*8)
	VMOVDQU Y6, 96(DX)(CX*8)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reached on the vector path: clear the upper YMM halves so that
	// legacy-SSE code executed after returning does not pay AVX/SSE
	// transition penalties.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ    CX, SI
	JAE     return
	MOVQ    (BX)(CX*8), AX
	ADDQ    AX, (DX)(CX*8)
	ADDQ    $0x01, CX
	JMP     x86_loop

return:
	RET

// func sumUint32(x []uint32, y []uint32)
// Requires: AVX, AVX2, CMOV
//
// sumUint32 adds y[i] into x[i] in place for every i < min(len(x), len(y)).
// Additions wrap modulo 2^32 (ADDL / VPADDD).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination and first addend)
//   BX = base address of y (second addend)
//   SI = number of elements to process, min(len(x), len(y))
//   AX = scratch: index past the next vector block / scalar temporary
TEXT ·sumUint32(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI // SI = min(len(x), len(y))

	// Bit 8 of cpu·X86 gates the YMM path (presumably the AVX2 feature flag,
	// see the cpu package). When it is clear, jump straight to the scalar loop
	// so that no VEX-encoded instruction is ever executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 4 YMM registers = 128 bytes = 32 uint32 lanes.
	// JA (not JAE) keeps a final block that fits exactly on the vector path.
	MOVQ    CX, AX
	ADDQ    $0x20, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*4), Y0
	VMOVDQU (BX)(CX*4), Y1
	VMOVDQU 32(DX)(CX*4), Y2
	VMOVDQU 32(BX)(CX*4), Y3
	VMOVDQU 64(DX)(CX*4), Y4
	VMOVDQU 64(BX)(CX*4), Y5
	VMOVDQU 96(DX)(CX*4), Y6
	VMOVDQU 96(BX)(CX*4), Y7
	VPADDD  Y0, Y1, Y0
	VPADDD  Y2, Y3, Y2
	VPADDD  Y4, Y5, Y4
	VPADDD  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*4)
	VMOVDQU Y2, 32(DX)(CX*4)
	VMOVDQU Y4, 64(DX)(CX*4)
	VMOVDQU Y6, 96(DX)(CX*4)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reached on the vector path: clear the upper YMM halves so that
	// legacy-SSE code executed after returning does not pay AVX/SSE
	// transition penalties.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ    CX, SI
	JAE     return
	MOVL    (BX)(CX*4), AX
	ADDL    AX, (DX)(CX*4)
	ADDQ    $0x01, CX
	JMP     x86_loop

return:
	RET

// func sumUint16(x []uint16, y []uint16)
// Requires: AVX, AVX2, CMOV
//
// sumUint16 adds y[i] into x[i] in place for every i < min(len(x), len(y)).
// Additions wrap modulo 2^16 (ADDW / VPADDW).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination and first addend)
//   BX = base address of y (second addend)
//   SI = number of elements to process, min(len(x), len(y))
//   AX = scratch: index past the next vector block / scalar temporary
TEXT ·sumUint16(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI // SI = min(len(x), len(y))

	// Bit 8 of cpu·X86 gates the YMM path (presumably the AVX2 feature flag,
	// see the cpu package). When it is clear, jump straight to the scalar loop
	// so that no VEX-encoded instruction is ever executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 4 YMM registers = 128 bytes = 64 uint16 lanes.
	// JA (not JAE) keeps a final block that fits exactly on the vector path.
	MOVQ    CX, AX
	ADDQ    $0x40, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*2), Y0
	VMOVDQU (BX)(CX*2), Y1
	VMOVDQU 32(DX)(CX*2), Y2
	VMOVDQU 32(BX)(CX*2), Y3
	VMOVDQU 64(DX)(CX*2), Y4
	VMOVDQU 64(BX)(CX*2), Y5
	VMOVDQU 96(DX)(CX*2), Y6
	VMOVDQU 96(BX)(CX*2), Y7
	VPADDW  Y0, Y1, Y0
	VPADDW  Y2, Y3, Y2
	VPADDW  Y4, Y5, Y4
	VPADDW  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*2)
	VMOVDQU Y2, 32(DX)(CX*2)
	VMOVDQU Y4, 64(DX)(CX*2)
	VMOVDQU Y6, 96(DX)(CX*2)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reached on the vector path: clear the upper YMM halves so that
	// legacy-SSE code executed after returning does not pay AVX/SSE
	// transition penalties.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ    CX, SI
	JAE     return
	MOVW    (BX)(CX*2), AX
	ADDW    AX, (DX)(CX*2)
	ADDQ    $0x01, CX
	JMP     x86_loop

return:
	RET

// func sumUint8(x []uint8, y []uint8)
// Requires: AVX, AVX2, CMOV
//
// sumUint8 adds y[i] into x[i] in place for every i < min(len(x), len(y)).
// Additions wrap modulo 2^8 (ADDB / VPADDB).
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination and first addend)
//   BX = base address of y (second addend)
//   SI = number of elements to process, min(len(x), len(y))
//   AX = scratch: index past the next vector block / scalar temporary (AL)
TEXT ·sumUint8(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX
	CMPQ    AX, SI
	CMOVQLT AX, SI // SI = min(len(x), len(y))

	// Bit 8 of cpu·X86 gates the YMM path (presumably the AVX2 feature flag,
	// see the cpu package). When it is clear, jump straight to the scalar loop
	// so that no VEX-encoded instruction is ever executed.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration handles 4 YMM registers = 128 bytes = 128 uint8 lanes.
	// JA (not JAE) keeps a final block that fits exactly on the vector path.
	MOVQ    CX, AX
	ADDQ    $0x80, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*1), Y0
	VMOVDQU (BX)(CX*1), Y1
	VMOVDQU 32(DX)(CX*1), Y2
	VMOVDQU 32(BX)(CX*1), Y3
	VMOVDQU 64(DX)(CX*1), Y4
	VMOVDQU 64(BX)(CX*1), Y5
	VMOVDQU 96(DX)(CX*1), Y6
	VMOVDQU 96(BX)(CX*1), Y7
	VPADDB  Y0, Y1, Y0
	VPADDB  Y2, Y3, Y2
	VPADDB  Y4, Y5, Y4
	VPADDB  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*1)
	VMOVDQU Y2, 32(DX)(CX*1)
	VMOVDQU Y4, 64(DX)(CX*1)
	VMOVDQU Y6, 96(DX)(CX*1)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reached on the vector path: clear the upper YMM halves so that
	// legacy-SSE code executed after returning does not pay AVX/SSE
	// transition penalties.
	VZEROUPPER

x86_loop:
	// Scalar tail: one element per iteration until CX reaches SI.
	CMPQ    CX, SI
	JAE     return
	MOVB    (BX)(CX*1), AL
	ADDB    AL, (DX)(CX*1)
	ADDQ    $0x01, CX
	JMP     x86_loop

return:
	RET
|