File: min_max_neon.s

package info (click to toggle)
golang-github-apache-arrow-go 18.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,200 kB
  • sloc: asm: 477,547; ansic: 5,369; cpp: 759; sh: 585; makefile: 319; python: 190; sed: 5
file content (318 lines) | stat: -rw-r--r-- 7,850 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
	.text
	.file	"min_max.c"
	.globl	int32_max_min_neon
	.p2align	2
	.type	int32_max_min_neon,@function
// int32_max_min_neon: signed 32-bit minimum and maximum of an array.
//
// The C prototype is not visible in this file; presumably
//   void int32_max_min_neon(int32_t *values, int length,
//                           int32_t *minout, int32_t *maxout)
// (TODO confirm against min_max.c).
//
// In:    x0 = address of the first 4-byte element
//        w1 = element count, compared as a signed 32-bit value
//        x2 = address that receives the minimum
//        x3 = address that receives the maximum
// Out:   [x3] is written first, then [x2].  When w1 < 1 no element is
//        read and the stored values are 0x80000000 (max) and
//        0x7fffffff (min).
// Uses:  x8-x12, v0-v5, condition flags
int32_max_min_neon:
	stp	x29, x30, [sp, #-16]!           // frame record
	mov	x29, sp
	mov	w10, #2147483647                // w10 = running min, starts at INT32_MAX
	mov	w11, #-2147483648               // w11 = running max, starts at INT32_MIN
	cmp	w1, #1
	b.lt	.Lint32_store                   // count <= 0: store the defaults

	mov	w8, w1                          // x8 = elements still to process
	mov	x12, x0                         // x12 = read cursor
	cmp	w1, #3
	b.ls	.Lint32_tail_loop               // 1..3 elements: scalar loop only

	and	x9, x8, #0xfffffffc             // x9 = elements consumed four at a time
	sub	x8, x8, x9                      // x8 = 0..3 leftovers for the scalar loop
	dup	v0.2s, w10                      // v0/v1 = two independent min accumulators
	dup	v1.2s, w10
	dup	v2.2s, w11                      // v2/v3 = two independent max accumulators
	dup	v3.2s, w11

.Lint32_vec_loop:
	ldp	d4, d5, [x12], #16              // four elements, advance cursor
	smin	v0.2s, v0.2s, v4.2s
	smin	v1.2s, v1.2s, v5.2s
	smax	v2.2s, v2.2s, v4.2s
	smax	v3.2s, v3.2s, v5.2s
	subs	x9, x9, #4
	b.ne	.Lint32_vec_loop

	smin	v0.2s, v0.2s, v1.2s             // fold accumulator pairs together
	smax	v2.2s, v2.2s, v3.2s
	dup	v4.2s, v0.s[1]                  // fold lane 1 into lane 0
	dup	v5.2s, v2.s[1]
	smin	v0.2s, v0.2s, v4.2s
	smax	v2.2s, v2.2s, v5.2s
	fmov	w10, s0
	fmov	w11, s2
	cbz	x8, .Lint32_store               // count was a multiple of four

.Lint32_tail_loop:
	ldr	w9, [x12], #4
	cmp	w10, w9
	csel	w10, w10, w9, lt                // min = (min < v) ? min : v
	cmp	w11, w9
	csel	w11, w11, w9, gt                // max = (max > v) ? max : v
	subs	x8, x8, #1
	b.ne	.Lint32_tail_loop

.Lint32_store:
	str	w11, [x3]                       // maximum
	str	w10, [x2]                       // minimum
	ldp	x29, x30, [sp], #16
	ret
.Lint32_max_min_neon_end:
	.size	int32_max_min_neon, .Lint32_max_min_neon_end-int32_max_min_neon
                                        // -- End function
	.globl	uint32_max_min_neon
	.p2align	2
	.type	uint32_max_min_neon,@function
// uint32_max_min_neon: unsigned 32-bit minimum and maximum of an array.
//
// The C prototype is not visible in this file; presumably
//   void uint32_max_min_neon(uint32_t *values, int length,
//                            uint32_t *minout, uint32_t *maxout)
// (TODO confirm against min_max.c).
//
// In:    x0 = address of the first 4-byte element
//        w1 = element count, compared as a signed 32-bit value
//        x2 = address that receives the minimum
//        x3 = address that receives the maximum
// Out:   [x3] is written first, then [x2].  When w1 < 1 no element is
//        read and the stored values are 0 (max) and 0xffffffff (min).
// Uses:  x8-x12, v0-v5, condition flags
uint32_max_min_neon:
	stp	x29, x30, [sp, #-16]!           // frame record
	mov	x29, sp
	mov	w10, #-1                        // w10 = running min, starts at all ones
	mov	w11, wzr                        // w11 = running max, starts at zero
	cmp	w1, #1
	b.lt	.Luint32_store                  // count <= 0: store the defaults

	mov	w8, w1                          // x8 = elements still to process
	mov	x12, x0                         // x12 = read cursor
	cmp	w1, #3
	b.ls	.Luint32_tail_loop              // 1..3 elements: scalar loop only

	and	x9, x8, #0xfffffffc             // x9 = elements consumed four at a time
	sub	x8, x8, x9                      // x8 = 0..3 leftovers for the scalar loop
	dup	v0.2s, w10                      // v0/v1 = two independent min accumulators
	dup	v1.2s, w10
	dup	v2.2s, w11                      // v2/v3 = two independent max accumulators
	dup	v3.2s, w11

.Luint32_vec_loop:
	ldp	d4, d5, [x12], #16              // four elements, advance cursor
	umin	v0.2s, v0.2s, v4.2s
	umin	v1.2s, v1.2s, v5.2s
	umax	v2.2s, v2.2s, v4.2s
	umax	v3.2s, v3.2s, v5.2s
	subs	x9, x9, #4
	b.ne	.Luint32_vec_loop

	umin	v0.2s, v0.2s, v1.2s             // fold accumulator pairs together
	umax	v2.2s, v2.2s, v3.2s
	dup	v4.2s, v0.s[1]                  // fold lane 1 into lane 0
	dup	v5.2s, v2.s[1]
	umin	v0.2s, v0.2s, v4.2s
	umax	v2.2s, v2.2s, v5.2s
	fmov	w10, s0
	fmov	w11, s2
	cbz	x8, .Luint32_store              // count was a multiple of four

.Luint32_tail_loop:
	ldr	w9, [x12], #4
	cmp	w10, w9
	csel	w10, w10, w9, lo                // min = (min < v) ? min : v, unsigned
	cmp	w11, w9
	csel	w11, w11, w9, hi                // max = (max > v) ? max : v, unsigned
	subs	x8, x8, #1
	b.ne	.Luint32_tail_loop

.Luint32_store:
	str	w11, [x3]                       // maximum
	str	w10, [x2]                       // minimum
	ldp	x29, x30, [sp], #16
	ret
.Luint32_max_min_neon_end:
	.size	uint32_max_min_neon, .Luint32_max_min_neon_end-uint32_max_min_neon
                                        // -- End function
	.globl	int64_max_min_neon
	.p2align	2
	.type	int64_max_min_neon,@function
// int64_max_min_neon: signed 64-bit minimum and maximum of an array.
//
// The C prototype is not visible in this file; presumably
//   void int64_max_min_neon(int64_t *values, int length,
//                           int64_t *minout, int64_t *maxout)
// (TODO confirm against min_max.c).
//
// In:    x0 = address of the first 8-byte element
//        w1 = element count, compared as a signed 32-bit value
//        x2 = address that receives the minimum
//        x3 = address that receives the maximum
// Out:   [x3] is written first, then [x2].  When w1 < 1 no element is
//        read and the stored values are 0x8000000000000000 (max) and
//        0x7fffffffffffffff (min).
// Uses:  x8-x12, v0-v7, condition flags
//
// The vector code below has no 64-bit smin/smax, so each step builds
// a compare mask and keeps or replaces lanes with BIF (destination lane is
// overwritten by the source wherever the mask is zero).
int64_max_min_neon:
	stp	x29, x30, [sp, #-16]!           // frame record
	mov	x29, sp
	mov	x10, #9223372036854775807       // x10 = running min, starts at INT64_MAX
	mov	x11, #-9223372036854775808      // x11 = running max, starts at INT64_MIN
	cmp	w1, #1
	b.lt	.Lint64_store                   // count <= 0: store the defaults

	mov	w8, w1                          // x8 = elements still to process
	mov	x12, x0                         // x12 = read cursor
	cmp	w1, #3
	b.ls	.Lint64_tail_loop               // 1..3 elements: scalar loop only

	and	x9, x8, #0xfffffffc             // x9 = elements consumed four at a time
	sub	x8, x8, x9                      // x8 = 0..3 leftovers for the scalar loop
	dup	v0.2d, x10                      // v0/v1 = two independent min accumulators
	dup	v1.2d, x10
	dup	v2.2d, x11                      // v2/v3 = two independent max accumulators
	dup	v3.2d, x11

.Lint64_vec_loop:
	ldp	q4, q5, [x12], #32              // four elements, advance cursor
	cmgt	v6.2d, v4.2d, v0.2d             // lanes where data > current min
	cmgt	v7.2d, v5.2d, v1.2d
	bif	v0.16b, v4.16b, v6.16b          // min = mask ? min : data
	bif	v1.16b, v5.16b, v7.16b
	cmgt	v6.2d, v2.2d, v4.2d             // lanes where current max > data
	cmgt	v7.2d, v3.2d, v5.2d
	bif	v2.16b, v4.16b, v6.16b          // max = mask ? max : data
	bif	v3.16b, v5.16b, v7.16b
	subs	x9, x9, #4
	b.ne	.Lint64_vec_loop

	cmgt	v6.2d, v1.2d, v0.2d             // fold accumulator pairs together
	cmgt	v7.2d, v2.2d, v3.2d
	bif	v0.16b, v1.16b, v6.16b          // v0 = lane-wise min of v0, v1
	bif	v2.16b, v3.16b, v7.16b          // v2 = lane-wise max of v2, v3
	dup	v4.2d, v0.d[1]                  // fold lane 1 into lane 0
	dup	v5.2d, v2.d[1]
	cmgt	v6.2d, v4.2d, v0.2d
	cmgt	v7.2d, v2.2d, v5.2d
	bif	v0.16b, v4.16b, v6.16b
	bif	v2.16b, v5.16b, v7.16b
	fmov	x10, d0
	fmov	x11, d2
	cbz	x8, .Lint64_store               // count was a multiple of four

.Lint64_tail_loop:
	ldr	x9, [x12], #8
	cmp	x10, x9
	csel	x10, x10, x9, lt                // min = (min < v) ? min : v
	cmp	x11, x9
	csel	x11, x11, x9, gt                // max = (max > v) ? max : v
	subs	x8, x8, #1
	b.ne	.Lint64_tail_loop

.Lint64_store:
	str	x11, [x3]                       // maximum
	str	x10, [x2]                       // minimum
	ldp	x29, x30, [sp], #16
	ret
.Lint64_max_min_neon_end:
	.size	int64_max_min_neon, .Lint64_max_min_neon_end-int64_max_min_neon
                                        // -- End function
	.globl	uint64_max_min_neon
	.p2align	2
	.type	uint64_max_min_neon,@function
// uint64_max_min_neon: unsigned 64-bit minimum and maximum of an array.
//
// The C prototype is not visible in this file; presumably
//   void uint64_max_min_neon(uint64_t *values, int length,
//                            uint64_t *minout, uint64_t *maxout)
// (TODO confirm against min_max.c).
//
// In:    x0 = address of the first 8-byte element
//        w1 = element count, compared as a signed 32-bit value
//        x2 = address that receives the minimum
//        x3 = address that receives the maximum
// Out:   [x3] is written first, then [x2].  When w1 < 1 no element is
//        read and the stored values are 0 (max) and
//        0xffffffffffffffff (min).
// Uses:  x8-x12, v0-v7, condition flags
//
// The vector code below has no 64-bit umin/umax, so each step builds
// an unsigned compare mask and keeps or replaces lanes with BIF (destination
// lane is overwritten by the source wherever the mask is zero).
uint64_max_min_neon:
	stp	x29, x30, [sp, #-16]!           // frame record
	mov	x29, sp
	mov	x10, #-1                        // x10 = running min, starts at all ones
	mov	x11, xzr                        // x11 = running max, starts at zero
	cmp	w1, #1
	b.lt	.Luint64_store                  // count <= 0: store the defaults

	mov	w8, w1                          // x8 = elements still to process
	mov	x12, x0                         // x12 = read cursor
	cmp	w1, #3
	b.ls	.Luint64_tail_loop              // 1..3 elements: scalar loop only

	and	x9, x8, #0xfffffffc             // x9 = elements consumed four at a time
	sub	x8, x8, x9                      // x8 = 0..3 leftovers for the scalar loop
	dup	v0.2d, x10                      // v0/v1 = two independent min accumulators
	dup	v1.2d, x10
	dup	v2.2d, x11                      // v2/v3 = two independent max accumulators
	dup	v3.2d, x11

.Luint64_vec_loop:
	ldp	q4, q5, [x12], #32              // four elements, advance cursor
	cmhi	v6.2d, v4.2d, v0.2d             // lanes where data > current min
	cmhi	v7.2d, v5.2d, v1.2d
	bif	v0.16b, v4.16b, v6.16b          // min = mask ? min : data
	bif	v1.16b, v5.16b, v7.16b
	cmhi	v6.2d, v2.2d, v4.2d             // lanes where current max > data
	cmhi	v7.2d, v3.2d, v5.2d
	bif	v2.16b, v4.16b, v6.16b          // max = mask ? max : data
	bif	v3.16b, v5.16b, v7.16b
	subs	x9, x9, #4
	b.ne	.Luint64_vec_loop

	cmhi	v6.2d, v1.2d, v0.2d             // fold accumulator pairs together
	cmhi	v7.2d, v2.2d, v3.2d
	bif	v0.16b, v1.16b, v6.16b          // v0 = lane-wise min of v0, v1
	bif	v2.16b, v3.16b, v7.16b          // v2 = lane-wise max of v2, v3
	dup	v4.2d, v0.d[1]                  // fold lane 1 into lane 0
	dup	v5.2d, v2.d[1]
	cmhi	v6.2d, v4.2d, v0.2d
	cmhi	v7.2d, v2.2d, v5.2d
	bif	v0.16b, v4.16b, v6.16b
	bif	v2.16b, v5.16b, v7.16b
	fmov	x10, d0
	fmov	x11, d2
	cbz	x8, .Luint64_store              // count was a multiple of four

.Luint64_tail_loop:
	ldr	x9, [x12], #8
	cmp	x10, x9
	csel	x10, x10, x9, lo                // min = (min < v) ? min : v, unsigned
	cmp	x11, x9
	csel	x11, x11, x9, hi                // max = (max > v) ? max : v, unsigned
	subs	x8, x8, #1
	b.ne	.Luint64_tail_loop

.Luint64_store:
	str	x11, [x3]                       // maximum
	str	x10, [x2]                       // minimum
	ldp	x29, x30, [sp], #16
	ret
.Luint64_max_min_neon_end:
	.size	uint64_max_min_neon, .Luint64_max_min_neon_end-uint64_max_min_neon
                                        // -- End function

	.ident	"clang version 9.0.1-12 "
	.section	".note.GNU-stack","",@progbits
	.addrsig