File: sums_amd64.s

package info (click to toggle)
golang-github-segmentio-asm 1.2.0%2Bgit20231107.1cfacc8-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 932 kB
  • sloc: asm: 6,093; makefile: 32
file content (197 lines) | stat: -rw-r--r-- 4,045 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
// Code generated by command: go run sums_asm.go -pkg slices -out ../slices/sums_amd64.s -stubs ../slices/sums_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func sumUint64(x []uint64, y []uint64)
// Requires: AVX, AVX2, CMOV
//
// Adds y into x element-wise and in place: x[i] += y[i] for every i below
// min(len(x), len(y)). Sums wrap around on overflow. y is only read.
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination)
//   BX = base address of y (source)
//   SI = element count to process, min(len(x), len(y))
//   AX = scratch (index after the next vector block / scalar operand)
//
// The vector path runs only when bit 8 of the cpu·X86 feature word is set
// (presumably the AVX2 flag -- confirm against the cpu package).
//
// NOTE(review): the file header says this code is generated; the same change
// should be made in the generator so it is not lost on regeneration.
TEXT ·sumUint64(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX

	// SI = min(len(x), len(y))
	CMPQ    AX, SI
	CMOVQLT AX, SI

	// Feature bit clear: skip every VEX-encoded instruction below.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration covers 16 elements (4 vectors of 32 bytes each).
	// Leave the loop only when the block would run past SI; a block that
	// ends exactly at SI is still processed with vectors.
	MOVQ    CX, AX
	ADDQ    $0x10, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*8), Y0
	VMOVDQU (BX)(CX*8), Y1
	VMOVDQU 32(DX)(CX*8), Y2
	VMOVDQU 32(BX)(CX*8), Y3
	VMOVDQU 64(DX)(CX*8), Y4
	VMOVDQU 64(BX)(CX*8), Y5
	VMOVDQU 96(DX)(CX*8), Y6
	VMOVDQU 96(BX)(CX*8), Y7
	VPADDQ  Y0, Y1, Y0
	VPADDQ  Y2, Y3, Y2
	VPADDQ  Y4, Y5, Y4
	VPADDQ  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*8)
	VMOVDQU Y2, 32(DX)(CX*8)
	VMOVDQU Y4, 64(DX)(CX*8)
	VMOVDQU Y6, 96(DX)(CX*8)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable from the AVX2 path, so this never executes when the
	// feature bit is clear. Clearing the upper YMM halves avoids AVX/SSE
	// transition penalties in the code that runs after we return.
	VZEROUPPER

x86_loop:
	// Scalar tail: the remaining elements, one at a time.
	CMPQ CX, SI
	JAE  return
	MOVQ (BX)(CX*8), AX
	ADDQ AX, (DX)(CX*8)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET

// func sumUint32(x []uint32, y []uint32)
// Requires: AVX, AVX2, CMOV
//
// Adds y into x element-wise and in place: x[i] += y[i] for every i below
// min(len(x), len(y)). Sums wrap around on overflow. y is only read.
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination)
//   BX = base address of y (source)
//   SI = element count to process, min(len(x), len(y))
//   AX = scratch (index after the next vector block / scalar operand)
//
// The vector path runs only when bit 8 of the cpu·X86 feature word is set
// (presumably the AVX2 flag -- confirm against the cpu package).
//
// NOTE(review): the file header says this code is generated; the same change
// should be made in the generator so it is not lost on regeneration.
TEXT ·sumUint32(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX

	// SI = min(len(x), len(y))
	CMPQ    AX, SI
	CMOVQLT AX, SI

	// Feature bit clear: skip every VEX-encoded instruction below.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration covers 32 elements (4 vectors of 32 bytes each).
	// Leave the loop only when the block would run past SI; a block that
	// ends exactly at SI is still processed with vectors.
	MOVQ    CX, AX
	ADDQ    $0x20, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*4), Y0
	VMOVDQU (BX)(CX*4), Y1
	VMOVDQU 32(DX)(CX*4), Y2
	VMOVDQU 32(BX)(CX*4), Y3
	VMOVDQU 64(DX)(CX*4), Y4
	VMOVDQU 64(BX)(CX*4), Y5
	VMOVDQU 96(DX)(CX*4), Y6
	VMOVDQU 96(BX)(CX*4), Y7
	VPADDD  Y0, Y1, Y0
	VPADDD  Y2, Y3, Y2
	VPADDD  Y4, Y5, Y4
	VPADDD  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*4)
	VMOVDQU Y2, 32(DX)(CX*4)
	VMOVDQU Y4, 64(DX)(CX*4)
	VMOVDQU Y6, 96(DX)(CX*4)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable from the AVX2 path, so this never executes when the
	// feature bit is clear. Clearing the upper YMM halves avoids AVX/SSE
	// transition penalties in the code that runs after we return.
	VZEROUPPER

x86_loop:
	// Scalar tail: the remaining elements, one at a time.
	CMPQ CX, SI
	JAE  return
	MOVL (BX)(CX*4), AX
	ADDL AX, (DX)(CX*4)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET

// func sumUint16(x []uint16, y []uint16)
// Requires: AVX, AVX2, CMOV
//
// Adds y into x element-wise and in place: x[i] += y[i] for every i below
// min(len(x), len(y)). Sums wrap around on overflow. y is only read.
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination)
//   BX = base address of y (source)
//   SI = element count to process, min(len(x), len(y))
//   AX = scratch (index after the next vector block / scalar operand)
//
// The vector path runs only when bit 8 of the cpu·X86 feature word is set
// (presumably the AVX2 flag -- confirm against the cpu package).
//
// NOTE(review): the file header says this code is generated; the same change
// should be made in the generator so it is not lost on regeneration.
TEXT ·sumUint16(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX

	// SI = min(len(x), len(y))
	CMPQ    AX, SI
	CMOVQLT AX, SI

	// Feature bit clear: skip every VEX-encoded instruction below.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration covers 64 elements (4 vectors of 32 bytes each).
	// Leave the loop only when the block would run past SI; a block that
	// ends exactly at SI is still processed with vectors.
	MOVQ    CX, AX
	ADDQ    $0x40, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*2), Y0
	VMOVDQU (BX)(CX*2), Y1
	VMOVDQU 32(DX)(CX*2), Y2
	VMOVDQU 32(BX)(CX*2), Y3
	VMOVDQU 64(DX)(CX*2), Y4
	VMOVDQU 64(BX)(CX*2), Y5
	VMOVDQU 96(DX)(CX*2), Y6
	VMOVDQU 96(BX)(CX*2), Y7
	VPADDW  Y0, Y1, Y0
	VPADDW  Y2, Y3, Y2
	VPADDW  Y4, Y5, Y4
	VPADDW  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*2)
	VMOVDQU Y2, 32(DX)(CX*2)
	VMOVDQU Y4, 64(DX)(CX*2)
	VMOVDQU Y6, 96(DX)(CX*2)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable from the AVX2 path, so this never executes when the
	// feature bit is clear. Clearing the upper YMM halves avoids AVX/SSE
	// transition penalties in the code that runs after we return.
	VZEROUPPER

x86_loop:
	// Scalar tail: the remaining elements, one at a time.
	CMPQ CX, SI
	JAE  return
	MOVW (BX)(CX*2), AX
	ADDW AX, (DX)(CX*2)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET

// func sumUint8(x []uint8, y []uint8)
// Requires: AVX, AVX2, CMOV
//
// Adds y into x element-wise and in place: x[i] += y[i] for every i below
// min(len(x), len(y)). Sums wrap around on overflow. y is only read.
//
// Register roles:
//   CX = index of the next element to process
//   DX = base address of x (destination)
//   BX = base address of y (source)
//   SI = element count to process, min(len(x), len(y))
//   AX = scratch (index after the next vector block / scalar operand in AL)
//
// The vector path runs only when bit 8 of the cpu·X86 feature word is set
// (presumably the AVX2 flag -- confirm against the cpu package).
//
// NOTE(review): the file header says this code is generated; the same change
// should be made in the generator so it is not lost on regeneration.
TEXT ·sumUint8(SB), NOSPLIT, $0-48
	XORQ    CX, CX
	MOVQ    x_base+0(FP), DX
	MOVQ    y_base+24(FP), BX
	MOVQ    x_len+8(FP), SI
	MOVQ    y_len+32(FP), AX

	// SI = min(len(x), len(y))
	CMPQ    AX, SI
	CMOVQLT AX, SI

	// Feature bit clear: skip every VEX-encoded instruction below.
	BTL     $0x08, github·com∕segmentio∕asm∕cpu·X86+0(SB)
	JCC     x86_loop

avx2_loop:
	// One iteration covers 128 elements (4 vectors of 32 bytes each).
	// Leave the loop only when the block would run past SI; a block that
	// ends exactly at SI is still processed with vectors.
	MOVQ    CX, AX
	ADDQ    $0x80, AX
	CMPQ    AX, SI
	JA      avx2_done
	VMOVDQU (DX)(CX*1), Y0
	VMOVDQU (BX)(CX*1), Y1
	VMOVDQU 32(DX)(CX*1), Y2
	VMOVDQU 32(BX)(CX*1), Y3
	VMOVDQU 64(DX)(CX*1), Y4
	VMOVDQU 64(BX)(CX*1), Y5
	VMOVDQU 96(DX)(CX*1), Y6
	VMOVDQU 96(BX)(CX*1), Y7
	VPADDB  Y0, Y1, Y0
	VPADDB  Y2, Y3, Y2
	VPADDB  Y4, Y5, Y4
	VPADDB  Y6, Y7, Y6
	VMOVDQU Y0, (DX)(CX*1)
	VMOVDQU Y2, 32(DX)(CX*1)
	VMOVDQU Y4, 64(DX)(CX*1)
	VMOVDQU Y6, 96(DX)(CX*1)
	MOVQ    AX, CX
	JMP     avx2_loop

avx2_done:
	// Only reachable from the AVX2 path, so this never executes when the
	// feature bit is clear. Clearing the upper YMM halves avoids AVX/SSE
	// transition penalties in the code that runs after we return.
	VZEROUPPER

x86_loop:
	// Scalar tail: the remaining elements, one at a time.
	CMPQ CX, SI
	JAE  return
	MOVB (BX)(CX*1), AL
	ADDB AL, (DX)(CX*1)
	ADDQ $0x01, CX
	JMP  x86_loop

return:
	RET