// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.
//go:build !purego
#include "textflag.h"
// func sumUint64(x []uint64, y []uint64)
// Requires: AVX, AVX2, CMOV
//
// In-place element-wise sum: x[i] += y[i] for i in [0, min(len(x), len(y))).
// If the CPU feature bit tested below is set, a vectorized loop handles 16
// uint64s (4 x 32-byte YMM loads, 128 bytes) per iteration; a scalar loop
// handles the remaining tail (and the whole input on older CPUs).
TEXT ·sumUint64(SB), NOSPLIT, $0-48
XORQ CX, CX // CX = element index i, starts at 0
MOVQ x_base+0(FP), DX // DX = &x[0]
MOVQ y_base+24(FP), BX // BX = &y[0]
MOVQ x_len+8(FP), SI // SI = len(x)
MOVQ y_len+32(FP), AX // AX = len(y)
CMPQ AX, SI
CMOVQLT AX, SI // SI = min(len(x), len(y)) — loop bound
// Runtime CPU dispatch: test bit 8 of the package's cached feature word.
// Presumably bit 8 is the AVX2 flag — TODO confirm against
// github.com/segmentio/asm/cpu. Carry clear => no AVX2 => scalar only.
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCC x86_loop
avx2_loop:
// AX = i + 16; only run a vector iteration when a full 16-element block
// fits strictly below SI (an exactly-16 remainder falls to the scalar loop).
MOVQ CX, AX
ADDQ $0x10, AX
CMPQ AX, SI
JAE x86_loop
// Load 4 x 4 uint64 from x and y (unaligned loads; no alignment assumed).
VMOVDQU (DX)(CX*8), Y0
VMOVDQU (BX)(CX*8), Y1
VMOVDQU 32(DX)(CX*8), Y2
VMOVDQU 32(BX)(CX*8), Y3
VMOVDQU 64(DX)(CX*8), Y4
VMOVDQU 64(BX)(CX*8), Y5
VMOVDQU 96(DX)(CX*8), Y6
VMOVDQU 96(BX)(CX*8), Y7
// Go asm operand order: VPADDQ src1, src2, dst => Y0 = Y1 + Y0, etc.
VPADDQ Y0, Y1, Y0
VPADDQ Y2, Y3, Y2
VPADDQ Y4, Y5, Y4
VPADDQ Y6, Y7, Y6
// Store the sums back into x.
VMOVDQU Y0, (DX)(CX*8)
VMOVDQU Y2, 32(DX)(CX*8)
VMOVDQU Y4, 64(DX)(CX*8)
VMOVDQU Y6, 96(DX)(CX*8)
MOVQ AX, CX // i += 16
JMP avx2_loop
x86_loop:
// Scalar tail: one uint64 per iteration until i >= min length.
CMPQ CX, SI
JAE return
MOVQ (BX)(CX*8), AX
ADDQ AX, (DX)(CX*8) // x[i] += y[i]
ADDQ $0x01, CX
JMP x86_loop
return:
// NOTE(review): no VZEROUPPER is emitted after YMM use before RET; this
// matches the avo-generated original — confirm it is intentional for the
// Go runtime's ABI (possible SSE transition penalty in callers).
RET
// func sumUint32(x []uint32, y []uint32)
// Requires: AVX, AVX2, CMOV
//
// In-place element-wise sum: x[i] += y[i] for i in [0, min(len(x), len(y))).
// Vector path handles 32 uint32s (4 x 32-byte YMM loads, 128 bytes) per
// iteration; the scalar loop handles the tail and non-AVX2 CPUs.
TEXT ·sumUint32(SB), NOSPLIT, $0-48
XORQ CX, CX // CX = element index i, starts at 0
MOVQ x_base+0(FP), DX // DX = &x[0]
MOVQ y_base+24(FP), BX // BX = &y[0]
MOVQ x_len+8(FP), SI // SI = len(x)
MOVQ y_len+32(FP), AX // AX = len(y)
CMPQ AX, SI
CMOVQLT AX, SI // SI = min(len(x), len(y)) — loop bound
// Runtime CPU dispatch: bit 8 of the cached feature word (presumably the
// AVX2 flag — TODO confirm against github.com/segmentio/asm/cpu).
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCC x86_loop
avx2_loop:
// AX = i + 32; vector iteration only when a full 32-element block fits
// strictly below SI (an exactly-32 remainder falls to the scalar loop).
MOVQ CX, AX
ADDQ $0x20, AX
CMPQ AX, SI
JAE x86_loop
// Load 4 x 8 uint32 from x and y (unaligned loads).
VMOVDQU (DX)(CX*4), Y0
VMOVDQU (BX)(CX*4), Y1
VMOVDQU 32(DX)(CX*4), Y2
VMOVDQU 32(BX)(CX*4), Y3
VMOVDQU 64(DX)(CX*4), Y4
VMOVDQU 64(BX)(CX*4), Y5
VMOVDQU 96(DX)(CX*4), Y6
VMOVDQU 96(BX)(CX*4), Y7
// Packed 32-bit adds: Y0 = Y1 + Y0, etc. (Go asm src1, src2, dst order).
VPADDD Y0, Y1, Y0
VPADDD Y2, Y3, Y2
VPADDD Y4, Y5, Y4
VPADDD Y6, Y7, Y6
// Store the sums back into x.
VMOVDQU Y0, (DX)(CX*4)
VMOVDQU Y2, 32(DX)(CX*4)
VMOVDQU Y4, 64(DX)(CX*4)
VMOVDQU Y6, 96(DX)(CX*4)
MOVQ AX, CX // i += 32
JMP avx2_loop
x86_loop:
// Scalar tail: one uint32 per iteration until i >= min length.
CMPQ CX, SI
JAE return
MOVL (BX)(CX*4), AX
ADDL AX, (DX)(CX*4) // x[i] += y[i]
ADDQ $0x01, CX
JMP x86_loop
return:
RET
// func sumUint16(x []uint16, y []uint16)
// Requires: AVX, AVX2, CMOV
//
// In-place element-wise sum: x[i] += y[i] for i in [0, min(len(x), len(y))).
// Vector path handles 64 uint16s (4 x 32-byte YMM loads, 128 bytes) per
// iteration; the scalar loop handles the tail and non-AVX2 CPUs.
TEXT ·sumUint16(SB), NOSPLIT, $0-48
XORQ CX, CX // CX = element index i, starts at 0
MOVQ x_base+0(FP), DX // DX = &x[0]
MOVQ y_base+24(FP), BX // BX = &y[0]
MOVQ x_len+8(FP), SI // SI = len(x)
MOVQ y_len+32(FP), AX // AX = len(y)
CMPQ AX, SI
CMOVQLT AX, SI // SI = min(len(x), len(y)) — loop bound
// Runtime CPU dispatch: bit 8 of the cached feature word (presumably the
// AVX2 flag — TODO confirm against github.com/segmentio/asm/cpu).
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCC x86_loop
avx2_loop:
// AX = i + 64; vector iteration only when a full 64-element block fits
// strictly below SI (an exactly-64 remainder falls to the scalar loop).
MOVQ CX, AX
ADDQ $0x40, AX
CMPQ AX, SI
JAE x86_loop
// Load 4 x 16 uint16 from x and y (unaligned loads).
VMOVDQU (DX)(CX*2), Y0
VMOVDQU (BX)(CX*2), Y1
VMOVDQU 32(DX)(CX*2), Y2
VMOVDQU 32(BX)(CX*2), Y3
VMOVDQU 64(DX)(CX*2), Y4
VMOVDQU 64(BX)(CX*2), Y5
VMOVDQU 96(DX)(CX*2), Y6
VMOVDQU 96(BX)(CX*2), Y7
// Packed 16-bit adds: Y0 = Y1 + Y0, etc. (Go asm src1, src2, dst order).
VPADDW Y0, Y1, Y0
VPADDW Y2, Y3, Y2
VPADDW Y4, Y5, Y4
VPADDW Y6, Y7, Y6
// Store the sums back into x.
VMOVDQU Y0, (DX)(CX*2)
VMOVDQU Y2, 32(DX)(CX*2)
VMOVDQU Y4, 64(DX)(CX*2)
VMOVDQU Y6, 96(DX)(CX*2)
MOVQ AX, CX // i += 64
JMP avx2_loop
x86_loop:
// Scalar tail: one uint16 per iteration until i >= min length.
CMPQ CX, SI
JAE return
MOVW (BX)(CX*2), AX
ADDW AX, (DX)(CX*2) // x[i] += y[i]
ADDQ $0x01, CX
JMP x86_loop
return:
RET
// func sumUint8(x []uint8, y []uint8)
// Requires: AVX, AVX2, CMOV
//
// In-place element-wise sum: x[i] += y[i] for i in [0, min(len(x), len(y))).
// Vector path handles 128 uint8s (4 x 32-byte YMM loads, 128 bytes) per
// iteration; the scalar loop handles the tail and non-AVX2 CPUs.
TEXT ·sumUint8(SB), NOSPLIT, $0-48
XORQ CX, CX // CX = element index i, starts at 0
MOVQ x_base+0(FP), DX // DX = &x[0]
MOVQ y_base+24(FP), BX // BX = &y[0]
MOVQ x_len+8(FP), SI // SI = len(x)
MOVQ y_len+32(FP), AX // AX = len(y)
CMPQ AX, SI
CMOVQLT AX, SI // SI = min(len(x), len(y)) — loop bound
// Runtime CPU dispatch: bit 8 of the cached feature word (presumably the
// AVX2 flag — TODO confirm against github.com/segmentio/asm/cpu).
BTL $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
JCC x86_loop
avx2_loop:
// AX = i + 128; vector iteration only when a full 128-element block fits
// strictly below SI (an exactly-128 remainder falls to the scalar loop).
MOVQ CX, AX
ADDQ $0x80, AX
CMPQ AX, SI
JAE x86_loop
// Load 4 x 32 uint8 from x and y (unaligned loads).
VMOVDQU (DX)(CX*1), Y0
VMOVDQU (BX)(CX*1), Y1
VMOVDQU 32(DX)(CX*1), Y2
VMOVDQU 32(BX)(CX*1), Y3
VMOVDQU 64(DX)(CX*1), Y4
VMOVDQU 64(BX)(CX*1), Y5
VMOVDQU 96(DX)(CX*1), Y6
VMOVDQU 96(BX)(CX*1), Y7
// Packed 8-bit adds: Y0 = Y1 + Y0, etc. (Go asm src1, src2, dst order).
VPADDB Y0, Y1, Y0
VPADDB Y2, Y3, Y2
VPADDB Y4, Y5, Y4
VPADDB Y6, Y7, Y6
// Store the sums back into x.
VMOVDQU Y0, (DX)(CX*1)
VMOVDQU Y2, 32(DX)(CX*1)
VMOVDQU Y4, 64(DX)(CX*1)
VMOVDQU Y6, 96(DX)(CX*1)
MOVQ AX, CX // i += 128
JMP avx2_loop
x86_loop:
// Scalar tail: one uint8 per iteration until i >= min length.
CMPQ CX, SI
JAE return
MOVB (BX)(CX*1), AL
ADDB AL, (DX)(CX*1) // x[i] += y[i]
ADDQ $0x01, CX
JMP x86_loop
return:
RET