File: float64_avx2_amd64.s

package info (click to toggle)
golang-github-apache-arrow-go 18.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,200 kB
  • sloc: asm: 477,547; ansic: 5,369; cpp: 759; sh: 585; makefile: 319; python: 190; sed: 5
file content (167 lines) | stat: -rw-r--r-- 9,629 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// _sum_float64_avx2 adds up a contiguous run of float64 values and stores the
// total through a result pointer. Syntax: Go (Plan 9) assembler, amd64; the
// instruction bodies are raw encodings emitted by c2goasm, with the intended
// Intel-syntax disassembly in the trailing comments.
//
// Frame: $0-24 (no locals, 24 bytes of arguments, nothing returned on the
// stack). The Go prototype lives outside this file; presumably
//   func _sum_float64_avx2(buf unsafe.Pointer, len uintptr, res unsafe.Pointer)
// -- TODO confirm against the companion .go stub.
//
// Arguments (as loaded below):
//   buf+0(FP)  -> DI  address of the first float64
//   len+8(FP)  -> SI  number of elements; compared as an unsigned quantity
//   res+16(FP) -> DX  address that receives the 8-byte sum
//
// Register roles:
//   R9        len rounded down to a multiple of 32 (elements summed by vector code)
//   R8        R9 - 32
//   AX        peel-loop counter, then main-loop element countdown, then tail pointer
//   CX        element index in the peel loop, then a byte pointer in the main loop
//   Y0..Y7    eight independent 4-lane accumulators (32 doubles per pass)
//
// Shape of the computation:
//   len == 0  : 0.0 is stored to *res.
//   len <  32 : scalar loop only (LBB0_4).
//   otherwise : LBB0_9 peels ((R9/32) mod 8) blocks of 32 doubles, LBB0_12 then
//               consumes 256 doubles per iteration, LBB0_13 folds Y0..Y7 into
//               the low lane of X0, and LBB0_3/LBB0_4 add the len-R9 leftovers.
//
// Because the vector path keeps 32 partial sums, the rounding of the result can
// differ from a strict left-to-right summation of the same data.
//
// NOTE(review): the symbol name must use the real middle dot (U+00B7). A
// mis-decoded copy of this file spelled it with two Thai characters, which
// would not bind to the Go declaration.
// NOTE(review): the file-level tag `//+build !noasm !appengine` is an OR of the
// two terms under Go's build-constraint rules -- confirm that is intended.
TEXT ·_sum_float64_avx2(SB), $0-24

	MOVQ buf+0(FP), DI
	MOVQ len+8(FP), SI
	MOVQ res+16(FP), DX

	// X0 = 0.0 so that the empty-input path can store it directly.
	LONG $0xc057f9c5         // vxorpd    xmm0, xmm0, xmm0
	WORD $0x8548; BYTE $0xf6 // test    rsi, rsi
	JE   LBB0_14
	// Fewer than 32 elements: skip the vector code entirely.
	LONG $0x1ffe8348         // cmp    rsi, 31
	JBE  LBB0_2
	// R9 = len & -32: how many elements the vector code will consume.
	WORD $0x8949; BYTE $0xf1 // mov    r9, rsi
	LONG $0xe0e18349         // and    r9, -32
	JE   LBB0_2
	// AX = ((R9-32)/32 + 1) mod 8 = number of 32-element blocks to peel so the
	// main loop sees a multiple of 8 blocks. Only bits 5..7 of R8 influence the
	// result, so going through the 32-bit EAX loses nothing.
	LONG $0xe0418d4d         // lea    r8, [r9 - 32]
	WORD $0x8944; BYTE $0xc0 // mov    eax, r8d
	WORD $0xe8c1; BYTE $0x05 // shr    eax, 5
	WORD $0xc0ff             // inc    eax
	LONG $0x07e08348         // and    rax, 7
	JE   LBB0_7
	// Count upward from -AX to zero; clear the index and all eight accumulators.
	WORD $0xf748; BYTE $0xd8 // neg    rax
	LONG $0xc057fdc5         // vxorpd    ymm0, ymm0, ymm0
	WORD $0xc931             // xor    ecx, ecx
	LONG $0xc957f5c5         // vxorpd    ymm1, ymm1, ymm1
	LONG $0xd257edc5         // vxorpd    ymm2, ymm2, ymm2
	LONG $0xdb57e5c5         // vxorpd    ymm3, ymm3, ymm3
	LONG $0xe457ddc5         // vxorpd    ymm4, ymm4, ymm4
	LONG $0xed57d5c5         // vxorpd    ymm5, ymm5, ymm5
	LONG $0xf657cdc5         // vxorpd    ymm6, ymm6, ymm6
	LONG $0xff57c5c5         // vxorpd    ymm7, ymm7, ymm7

// Peel loop: one 32-double block (256 bytes) per pass. CX is an element index,
// AX is a negative counter that reaches zero when the peel is finished.
LBB0_9:
	LONG $0x0458fdc5; BYTE $0xcf         // vaddpd    ymm0, ymm0, yword [rdi + 8*rcx]
	LONG $0x4c58f5c5; WORD $0x20cf       // vaddpd    ymm1, ymm1, yword [rdi + 8*rcx + 32]
	LONG $0x5458edc5; WORD $0x40cf       // vaddpd    ymm2, ymm2, yword [rdi + 8*rcx + 64]
	LONG $0x5c58e5c5; WORD $0x60cf       // vaddpd    ymm3, ymm3, yword [rdi + 8*rcx + 96]
	QUAD $0x000080cfa458ddc5; BYTE $0x00 // vaddpd    ymm4, ymm4, yword [rdi + 8*rcx + 128]
	QUAD $0x0000a0cfac58d5c5; BYTE $0x00 // vaddpd    ymm5, ymm5, yword [rdi + 8*rcx + 160]
	QUAD $0x0000c0cfb458cdc5; BYTE $0x00 // vaddpd    ymm6, ymm6, yword [rdi + 8*rcx + 192]
	QUAD $0x0000e0cfbc58c5c5; BYTE $0x00 // vaddpd    ymm7, ymm7, yword [rdi + 8*rcx + 224]
	LONG $0x20c18348                     // add    rcx, 32
	WORD $0xff48; BYTE $0xc0             // inc    rax
	JNE  LBB0_9
	JMP  LBB0_10

// Scalar-only entry: nothing was consumed by vector code, so the tail starts at
// element 0.
LBB0_2:
	WORD $0x3145; BYTE $0xc9 // xor    r9d, r9d

// Tail setup: AX = &buf[R9], SI = number of elements still to add (len - R9).
LBB0_3:
	LONG $0xcf048d4a         // lea    rax, [rdi + 8*r9]
	WORD $0x294c; BYTE $0xce // sub    rsi, r9

// Scalar tail loop: add one double per pass into the low lane of X0.
// SI is non-zero on entry on every path that reaches here.
LBB0_4:
	LONG $0x0058fbc5         // vaddsd    xmm0, xmm0, qword [rax]
	LONG $0x08c08348         // add    rax, 8
	WORD $0xff48; BYTE $0xce // dec    rsi
	JNE  LBB0_4

// Common exit: write the low lane of X0 to *res, then clear the upper halves of
// the YMM registers before returning to non-AVX code.
LBB0_14:
	LONG $0x0211fbc5 // vmovsd    qword [rdx], xmm0
	VZEROUPPER
	RET

// No blocks to peel (block count is a multiple of 8): start the main loop at
// element 0 with cleared accumulators.
LBB0_7:
	WORD $0xc931     // xor    ecx, ecx
	LONG $0xc057fdc5 // vxorpd    ymm0, ymm0, ymm0
	LONG $0xc957f5c5 // vxorpd    ymm1, ymm1, ymm1
	LONG $0xd257edc5 // vxorpd    ymm2, ymm2, ymm2
	LONG $0xdb57e5c5 // vxorpd    ymm3, ymm3, ymm3
	LONG $0xe457ddc5 // vxorpd    ymm4, ymm4, ymm4
	LONG $0xed57d5c5 // vxorpd    ymm5, ymm5, ymm5
	LONG $0xf657cdc5 // vxorpd    ymm6, ymm6, ymm6
	LONG $0xff57c5c5 // vxorpd    ymm7, ymm7, ymm7

// Main-loop setup. R8 < 224 means there were fewer than 8 blocks in total, all
// of which the peel loop already handled. Otherwise AX = elements left for the
// main loop (R9 - CX, a multiple of 256) and CX becomes a byte pointer biased
// by +1792 so the 64 loads below span displacements -1792 .. +224.
LBB0_10:
	LONG $0xe0f88149; WORD $0x0000; BYTE $0x00 // cmp    r8, 224
	JB   LBB0_13
	WORD $0x894c; BYTE $0xc8                   // mov    rax, r9
	WORD $0x2948; BYTE $0xc8                   // sub    rax, rcx
	QUAD $0x00000700cf8c8d48                   // lea    rcx, [rdi + 8*rcx + 1792]

// Main loop: 64 loads of 32 bytes = 2048 bytes = 256 doubles per iteration,
// eight loads into each of the eight accumulators.
LBB0_12:
	QUAD $0xfffff9e0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 1568]
	QUAD $0xfffff9c0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 1600]
	QUAD $0xfffff9a0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 1632]
	QUAD $0xfffff980a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 1664]
	QUAD $0xfffff9609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 1696]
	QUAD $0xfffff9409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 1728]
	QUAD $0xfffff9208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 1760]
	QUAD $0xfffff9008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 1792]
	QUAD $0xfffffa008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 1536]
	QUAD $0xfffffa208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 1504]
	QUAD $0xfffffa409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 1472]
	QUAD $0xfffffa609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 1440]
	QUAD $0xfffffa80a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 1408]
	QUAD $0xfffffaa0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 1376]
	QUAD $0xfffffac0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 1344]
	QUAD $0xfffffae0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 1312]
	QUAD $0xfffffbe0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 1056]
	QUAD $0xfffffbc0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 1088]
	QUAD $0xfffffba0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 1120]
	QUAD $0xfffffb80a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 1152]
	QUAD $0xfffffb609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 1184]
	QUAD $0xfffffb409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 1216]
	QUAD $0xfffffb208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 1248]
	QUAD $0xfffffb008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 1280]
	QUAD $0xfffffc008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 1024]
	QUAD $0xfffffc208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 992]
	QUAD $0xfffffc409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 960]
	QUAD $0xfffffc609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 928]
	QUAD $0xfffffc80a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 896]
	QUAD $0xfffffca0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 864]
	QUAD $0xfffffcc0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 832]
	QUAD $0xfffffce0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 800]
	QUAD $0xfffffde0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 544]
	QUAD $0xfffffdc0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 576]
	QUAD $0xfffffda0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 608]
	QUAD $0xfffffd80a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 640]
	QUAD $0xfffffd609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 672]
	QUAD $0xfffffd409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 704]
	QUAD $0xfffffd208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 736]
	QUAD $0xfffffd008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 768]
	QUAD $0xfffffe008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 512]
	QUAD $0xfffffe208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 480]
	QUAD $0xfffffe409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 448]
	QUAD $0xfffffe609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 416]
	QUAD $0xfffffe80a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx - 384]
	QUAD $0xfffffea0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx - 352]
	QUAD $0xfffffec0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx - 320]
	QUAD $0xfffffee0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx - 288]
	LONG $0x7958c5c5; BYTE $0xe0               // vaddpd    ymm7, ymm7, yword [rcx - 32]
	LONG $0x7158cdc5; BYTE $0xc0               // vaddpd    ymm6, ymm6, yword [rcx - 64]
	LONG $0x6958d5c5; BYTE $0xa0               // vaddpd    ymm5, ymm5, yword [rcx - 96]
	LONG $0x6158ddc5; BYTE $0x80               // vaddpd    ymm4, ymm4, yword [rcx - 128]
	QUAD $0xffffff609958e5c5                   // vaddpd    ymm3, ymm3, yword [rcx - 160]
	QUAD $0xffffff409158edc5                   // vaddpd    ymm2, ymm2, yword [rcx - 192]
	QUAD $0xffffff208958f5c5                   // vaddpd    ymm1, ymm1, yword [rcx - 224]
	QUAD $0xffffff008158fdc5                   // vaddpd    ymm0, ymm0, yword [rcx - 256]
	LONG $0x0158fdc5                           // vaddpd    ymm0, ymm0, yword [rcx]
	LONG $0x4958f5c5; BYTE $0x20               // vaddpd    ymm1, ymm1, yword [rcx + 32]
	LONG $0x5158edc5; BYTE $0x40               // vaddpd    ymm2, ymm2, yword [rcx + 64]
	LONG $0x5958e5c5; BYTE $0x60               // vaddpd    ymm3, ymm3, yword [rcx + 96]
	QUAD $0x00000080a158ddc5                   // vaddpd    ymm4, ymm4, yword [rcx + 128]
	QUAD $0x000000a0a958d5c5                   // vaddpd    ymm5, ymm5, yword [rcx + 160]
	QUAD $0x000000c0b158cdc5                   // vaddpd    ymm6, ymm6, yword [rcx + 192]
	QUAD $0x000000e0b958c5c5                   // vaddpd    ymm7, ymm7, yword [rcx + 224]
	LONG $0x00c18148; WORD $0x0008; BYTE $0x00 // add    rcx, 2048
	LONG $0xff000548; WORD $0xffff             // add    rax, -256
	JNE  LBB0_12

// Reduction: fold the eight accumulators pairwise into Y0, add its upper
// 128-bit half onto the lower half, then horizontally add so the low lane of X0
// holds the vector total. If len was not a multiple of 32 (R9 != SI) the
// leftover elements are added by the scalar tail.
LBB0_13:
	LONG $0xcd58f5c5               // vaddpd    ymm1, ymm1, ymm5
	LONG $0xdf58e5c5               // vaddpd    ymm3, ymm3, ymm7
	LONG $0xc458fdc5               // vaddpd    ymm0, ymm0, ymm4
	LONG $0xd658edc5               // vaddpd    ymm2, ymm2, ymm6
	LONG $0xc258fdc5               // vaddpd    ymm0, ymm0, ymm2
	LONG $0xcb58f5c5               // vaddpd    ymm1, ymm1, ymm3
	LONG $0xc158fdc5               // vaddpd    ymm0, ymm0, ymm1
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128    xmm1, ymm0, 1
	LONG $0xc158fdc5               // vaddpd    ymm0, ymm0, ymm1
	LONG $0xc07cfdc5               // vhaddpd    ymm0, ymm0, ymm0
	WORD $0x3949; BYTE $0xf1       // cmp    r9, rsi
	JNE  LBB0_3
	JMP  LBB0_14