File: min_max_avx2_amd64.s

package info (click to toggle)
golang-github-apache-arrow-go 18.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,200 kB
  • sloc: asm: 477,547; ansic: 5,369; cpp: 759; sh: 585; makefile: 319; python: 190; sed: 5
file content (1009 lines) | stat: -rw-r--r-- 26,478 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
	.text
	.intel_syntax noprefix
	.file	"min_max.c"
	.section	.rodata.cst32,"aM",@progbits,32
	.p2align	5                               # -- Begin function int8_max_min_avx2
.LCPI0_0:
	.zero	32,128
.LCPI0_1:
	.zero	32,127
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4
.LCPI0_2:
	.zero	16,127
.LCPI0_3:
	.zero	16,128
	.text
	.globl	int8_max_min_avx2
	.p2align	4, 0x90
	.type	int8_max_min_avx2,@function
int8_max_min_avx2:                      # @int8_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB0_1
# %bb.2:
	mov	r9d, esi
	cmp	esi, 63
	ja	.LBB0_4
# %bb.3:
	mov	r8b, -128
	mov	sil, 127
	xor	r10d, r10d
	jmp	.LBB0_11
.LBB0_1:
	mov	sil, 127
	mov	r8b, -128
	jmp	.LBB0_12
.LBB0_4:
	mov	r10d, r9d
	and	r10d, -64
	lea	rax, [r10 - 64]
	mov	r8, rax
	shr	r8, 6
	add	r8, 1
	test	rax, rax
	je	.LBB0_5
# %bb.6:
	mov	rsi, r8
	and	rsi, -2
	neg	rsi
	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
	xor	eax, eax
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm3, ymm1
	.p2align	4, 0x90
.LBB0_7:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm4, ymmword ptr [rdi + rax]
	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
	vmovdqu	ymm6, ymmword ptr [rdi + rax + 64]
	vmovdqu	ymm7, ymmword ptr [rdi + rax + 96]
	vpminsb	ymm0, ymm0, ymm4
	vpminsb	ymm2, ymm2, ymm5
	vpmaxsb	ymm1, ymm1, ymm4
	vpmaxsb	ymm3, ymm3, ymm5
	vpminsb	ymm0, ymm0, ymm6
	vpminsb	ymm2, ymm2, ymm7
	vpmaxsb	ymm1, ymm1, ymm6
	vpmaxsb	ymm3, ymm3, ymm7
	sub	rax, -128
	add	rsi, 2
	jne	.LBB0_7
# %bb.8:
	test	r8b, 1
	je	.LBB0_10
.LBB0_9:
	vmovdqu	ymm4, ymmword ptr [rdi + rax]
	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
	vpmaxsb	ymm3, ymm3, ymm5
	vpmaxsb	ymm1, ymm1, ymm4
	vpminsb	ymm2, ymm2, ymm5
	vpminsb	ymm0, ymm0, ymm4
.LBB0_10:
	vpmaxsb	ymm1, ymm1, ymm3
	vextracti128	xmm3, ymm1, 1
	vpmaxsb	xmm1, xmm1, xmm3
	vpxor	xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]
	vpminsb	ymm0, ymm0, ymm2
	vpsrlw	xmm2, xmm1, 8
	vpminub	xmm1, xmm1, xmm2
	vphminposuw	xmm1, xmm1
	vmovd	r8d, xmm1
	xor	r8b, 127
	vextracti128	xmm1, ymm0, 1
	vpminsb	xmm0, xmm0, xmm1
	vpxor	xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]
	vpsrlw	xmm1, xmm0, 8
	vpminub	xmm0, xmm0, xmm1
	vphminposuw	xmm0, xmm0
	vmovd	esi, xmm0
	xor	sil, -128
	cmp	r10, r9
	je	.LBB0_12
	.p2align	4, 0x90
.LBB0_11:                               # =>This Inner Loop Header: Depth=1
	movzx	eax, byte ptr [rdi + r10]
	cmp	sil, al
	movzx	esi, sil
	cmovg	esi, eax
	cmp	r8b, al
	movzx	r8d, r8b
	cmovl	r8d, eax
	add	r10, 1
	cmp	r9, r10
	jne	.LBB0_11
.LBB0_12:
	mov	byte ptr [rcx], r8b
	mov	byte ptr [rdx], sil
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.LBB0_5:
	vmovdqa	ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
	vmovdqa	ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
	xor	eax, eax
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm3, ymm1
	test	r8b, 1
	jne	.LBB0_9
	jmp	.LBB0_10
.Lfunc_end0:
	.size	int8_max_min_avx2, .Lfunc_end0-int8_max_min_avx2
                                        # -- End function
	.globl	uint8_max_min_avx2              # -- Begin function uint8_max_min_avx2
	.p2align	4, 0x90
	.type	uint8_max_min_avx2,@function
uint8_max_min_avx2:                     # @uint8_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB1_1
# %bb.2:
	mov	r9d, esi
	cmp	esi, 63
	ja	.LBB1_4
# %bb.3:
	mov	sil, -1
	xor	r10d, r10d
	xor	eax, eax
	jmp	.LBB1_11
.LBB1_1:
	mov	sil, -1
	xor	eax, eax
	jmp	.LBB1_12
.LBB1_4:
	mov	r10d, r9d
	and	r10d, -64
	lea	rax, [r10 - 64]
	mov	r8, rax
	shr	r8, 6
	add	r8, 1
	test	rax, rax
	je	.LBB1_5
# %bb.6:
	mov	rsi, r8
	and	rsi, -2
	neg	rsi
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	ymm1, ymm1, ymm1
	xor	eax, eax
	vpcmpeqd	ymm2, ymm2, ymm2
	vpxor	xmm3, xmm3, xmm3
	.p2align	4, 0x90
.LBB1_7:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm4, ymmword ptr [rdi + rax]
	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
	vmovdqu	ymm6, ymmword ptr [rdi + rax + 64]
	vmovdqu	ymm7, ymmword ptr [rdi + rax + 96]
	vpminub	ymm1, ymm1, ymm4
	vpminub	ymm2, ymm2, ymm5
	vpmaxub	ymm0, ymm0, ymm4
	vpmaxub	ymm3, ymm3, ymm5
	vpminub	ymm1, ymm1, ymm6
	vpminub	ymm2, ymm2, ymm7
	vpmaxub	ymm0, ymm0, ymm6
	vpmaxub	ymm3, ymm3, ymm7
	sub	rax, -128
	add	rsi, 2
	jne	.LBB1_7
# %bb.8:
	test	r8b, 1
	je	.LBB1_10
.LBB1_9:
	vmovdqu	ymm4, ymmword ptr [rdi + rax]
	vmovdqu	ymm5, ymmword ptr [rdi + rax + 32]
	vpmaxub	ymm3, ymm3, ymm5
	vpmaxub	ymm0, ymm0, ymm4
	vpminub	ymm2, ymm2, ymm5
	vpminub	ymm1, ymm1, ymm4
.LBB1_10:
	vpminub	ymm1, ymm1, ymm2
	vpmaxub	ymm0, ymm0, ymm3
	vextracti128	xmm2, ymm0, 1
	vpmaxub	xmm0, xmm0, xmm2
	vpcmpeqd	xmm2, xmm2, xmm2
	vpxor	xmm0, xmm0, xmm2
	vpsrlw	xmm2, xmm0, 8
	vpminub	xmm0, xmm0, xmm2
	vphminposuw	xmm0, xmm0
	vmovd	eax, xmm0
	not	al
	vextracti128	xmm0, ymm1, 1
	vpminub	xmm0, xmm1, xmm0
	vpsrlw	xmm1, xmm0, 8
	vpminub	xmm0, xmm0, xmm1
	vphminposuw	xmm0, xmm0
	vmovd	esi, xmm0
	cmp	r10, r9
	je	.LBB1_12
	.p2align	4, 0x90
.LBB1_11:                               # =>This Inner Loop Header: Depth=1
	movzx	r8d, byte ptr [rdi + r10]
	cmp	sil, r8b
	movzx	esi, sil
	cmovae	esi, r8d
	cmp	al, r8b
	movzx	eax, al
	cmovbe	eax, r8d
	add	r10, 1
	cmp	r9, r10
	jne	.LBB1_11
.LBB1_12:
	mov	byte ptr [rcx], al
	mov	byte ptr [rdx], sil
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.LBB1_5:
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	ymm1, ymm1, ymm1
	xor	eax, eax
	vpcmpeqd	ymm2, ymm2, ymm2
	vpxor	xmm3, xmm3, xmm3
	test	r8b, 1
	jne	.LBB1_9
	jmp	.LBB1_10
.Lfunc_end1:
	.size	uint8_max_min_avx2, .Lfunc_end1-uint8_max_min_avx2
                                        # -- End function
	.section	.rodata.cst32,"aM",@progbits,32
	.p2align	5                               # -- Begin function int16_max_min_avx2
.LCPI2_0:
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
.LCPI2_1:
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.section	.rodata.cst16,"aM",@progbits,16
	.p2align	4
.LCPI2_2:
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
	.short	32767                           # 0x7fff
.LCPI2_3:
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.short	32768                           # 0x8000
	.text
	.globl	int16_max_min_avx2
	.p2align	4, 0x90
	.type	int16_max_min_avx2,@function
int16_max_min_avx2:                     # @int16_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB2_1
# %bb.2:
	mov	r9d, esi
	cmp	esi, 31
	ja	.LBB2_4
# %bb.3:
	mov	r8w, -32768
	mov	si, 32767
	xor	r10d, r10d
	jmp	.LBB2_11
.LBB2_1:
	mov	si, 32767
	mov	r8w, -32768
	jmp	.LBB2_12
.LBB2_4:
	mov	r10d, r9d
	and	r10d, -32
	lea	rax, [r10 - 32]
	mov	r8, rax
	shr	r8, 5
	add	r8, 1
	test	rax, rax
	je	.LBB2_5
# %bb.6:
	mov	rsi, r8
	and	rsi, -2
	neg	rsi
	vmovdqa	ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
	vmovdqa	ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
	xor	eax, eax
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm3, ymm1
	.p2align	4, 0x90
.LBB2_7:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
	vmovdqu	ymm6, ymmword ptr [rdi + 2*rax + 64]
	vmovdqu	ymm7, ymmword ptr [rdi + 2*rax + 96]
	vpminsw	ymm0, ymm0, ymm4
	vpminsw	ymm2, ymm2, ymm5
	vpmaxsw	ymm1, ymm1, ymm4
	vpmaxsw	ymm3, ymm3, ymm5
	vpminsw	ymm0, ymm0, ymm6
	vpminsw	ymm2, ymm2, ymm7
	vpmaxsw	ymm1, ymm1, ymm6
	vpmaxsw	ymm3, ymm3, ymm7
	add	rax, 64
	add	rsi, 2
	jne	.LBB2_7
# %bb.8:
	test	r8b, 1
	je	.LBB2_10
.LBB2_9:
	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
	vpmaxsw	ymm3, ymm3, ymm5
	vpmaxsw	ymm1, ymm1, ymm4
	vpminsw	ymm2, ymm2, ymm5
	vpminsw	ymm0, ymm0, ymm4
.LBB2_10:
	vpmaxsw	ymm1, ymm1, ymm3
	vextracti128	xmm3, ymm1, 1
	vpmaxsw	xmm1, xmm1, xmm3
	vpxor	xmm1, xmm1, xmmword ptr [rip + .LCPI2_2]
	vpminsw	ymm0, ymm0, ymm2
	vphminposuw	xmm1, xmm1
	vmovd	r8d, xmm1
	xor	r8d, 32767
	vextracti128	xmm1, ymm0, 1
	vpminsw	xmm0, xmm0, xmm1
	vpxor	xmm0, xmm0, xmmword ptr [rip + .LCPI2_3]
	vphminposuw	xmm0, xmm0
	vmovd	esi, xmm0
	xor	esi, 32768
	cmp	r10, r9
	je	.LBB2_12
	.p2align	4, 0x90
.LBB2_11:                               # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdi + 2*r10]
	cmp	si, ax
	cmovg	esi, eax
	cmp	r8w, ax
	cmovl	r8d, eax
	add	r10, 1
	cmp	r9, r10
	jne	.LBB2_11
.LBB2_12:
	mov	word ptr [rcx], r8w
	mov	word ptr [rdx], si
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.LBB2_5:
	vmovdqa	ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
	vmovdqa	ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
	xor	eax, eax
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm3, ymm1
	test	r8b, 1
	jne	.LBB2_9
	jmp	.LBB2_10
.Lfunc_end2:
	.size	int16_max_min_avx2, .Lfunc_end2-int16_max_min_avx2
                                        # -- End function
	.globl	uint16_max_min_avx2             # -- Begin function uint16_max_min_avx2
	.p2align	4, 0x90
	.type	uint16_max_min_avx2,@function
uint16_max_min_avx2:                    # @uint16_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB3_1
# %bb.2:
	mov	r9d, esi
	cmp	esi, 31
	ja	.LBB3_4
# %bb.3:
	mov	r8w, -1
	xor	r10d, r10d
	xor	esi, esi
	jmp	.LBB3_11
.LBB3_1:
	mov	r8w, -1
	xor	esi, esi
	jmp	.LBB3_12
.LBB3_4:
	mov	r10d, r9d
	and	r10d, -32
	lea	rax, [r10 - 32]
	mov	r8, rax
	shr	r8, 5
	add	r8, 1
	test	rax, rax
	je	.LBB3_5
# %bb.6:
	mov	rsi, r8
	and	rsi, -2
	neg	rsi
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	ymm1, ymm1, ymm1
	xor	eax, eax
	vpcmpeqd	ymm2, ymm2, ymm2
	vpxor	xmm3, xmm3, xmm3
	.p2align	4, 0x90
.LBB3_7:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
	vmovdqu	ymm6, ymmword ptr [rdi + 2*rax + 64]
	vmovdqu	ymm7, ymmword ptr [rdi + 2*rax + 96]
	vpminuw	ymm1, ymm1, ymm4
	vpminuw	ymm2, ymm2, ymm5
	vpmaxuw	ymm0, ymm0, ymm4
	vpmaxuw	ymm3, ymm3, ymm5
	vpminuw	ymm1, ymm1, ymm6
	vpminuw	ymm2, ymm2, ymm7
	vpmaxuw	ymm0, ymm0, ymm6
	vpmaxuw	ymm3, ymm3, ymm7
	add	rax, 64
	add	rsi, 2
	jne	.LBB3_7
# %bb.8:
	test	r8b, 1
	je	.LBB3_10
.LBB3_9:
	vmovdqu	ymm4, ymmword ptr [rdi + 2*rax]
	vmovdqu	ymm5, ymmword ptr [rdi + 2*rax + 32]
	vpmaxuw	ymm3, ymm3, ymm5
	vpmaxuw	ymm0, ymm0, ymm4
	vpminuw	ymm2, ymm2, ymm5
	vpminuw	ymm1, ymm1, ymm4
.LBB3_10:
	vpminuw	ymm1, ymm1, ymm2
	vpmaxuw	ymm0, ymm0, ymm3
	vextracti128	xmm2, ymm0, 1
	vpmaxuw	xmm0, xmm0, xmm2
	vpcmpeqd	xmm2, xmm2, xmm2
	vpxor	xmm0, xmm0, xmm2
	vphminposuw	xmm0, xmm0
	vmovd	esi, xmm0
	not	esi
	vextracti128	xmm0, ymm1, 1
	vpminuw	xmm0, xmm1, xmm0
	vphminposuw	xmm0, xmm0
	vmovd	r8d, xmm0
	cmp	r10, r9
	je	.LBB3_12
	.p2align	4, 0x90
.LBB3_11:                               # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdi + 2*r10]
	cmp	r8w, ax
	cmovae	r8d, eax
	cmp	si, ax
	cmovbe	esi, eax
	add	r10, 1
	cmp	r9, r10
	jne	.LBB3_11
.LBB3_12:
	mov	word ptr [rcx], si
	mov	word ptr [rdx], r8w
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.LBB3_5:
	vpxor	xmm0, xmm0, xmm0
	vpcmpeqd	ymm1, ymm1, ymm1
	xor	eax, eax
	vpcmpeqd	ymm2, ymm2, ymm2
	vpxor	xmm3, xmm3, xmm3
	test	r8b, 1
	jne	.LBB3_9
	jmp	.LBB3_10
.Lfunc_end3:
	.size	uint16_max_min_avx2, .Lfunc_end3-uint16_max_min_avx2
                                        # -- End function
	.section	.rodata.cst4,"aM",@progbits,4
	.p2align	2                               # -- Begin function int32_max_min_avx2
.LCPI4_0:
	.long	2147483648                      # 0x80000000
.LCPI4_1:
	.long	2147483647                      # 0x7fffffff
	.text
	.globl	int32_max_min_avx2
	.p2align	4, 0x90
	.type	int32_max_min_avx2,@function
int32_max_min_avx2:                     # @int32_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB4_1
# %bb.2:
	mov	r8d, esi
	cmp	esi, 31
	ja	.LBB4_4
# %bb.3:
	mov	r10d, -2147483648
	mov	eax, 2147483647
	xor	r9d, r9d
	jmp	.LBB4_7
.LBB4_1:
	mov	eax, 2147483647
	mov	esi, -2147483648
	jmp	.LBB4_8
.LBB4_4:
	mov	r9d, r8d
	vpbroadcastd	ymm4, dword ptr [rip + .LCPI4_0] # ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
	and	r9d, -32
	vpbroadcastd	ymm0, dword ptr [rip + .LCPI4_1] # ymm0 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
	xor	eax, eax
	vmovdqa	ymm1, ymm0
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm3, ymm0
	vmovdqa	ymm5, ymm4
	vmovdqa	ymm6, ymm4
	vmovdqa	ymm7, ymm4
	.p2align	4, 0x90
.LBB4_5:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm8, ymmword ptr [rdi + 4*rax]
	vmovdqu	ymm9, ymmword ptr [rdi + 4*rax + 32]
	vmovdqu	ymm10, ymmword ptr [rdi + 4*rax + 64]
	vmovdqu	ymm11, ymmword ptr [rdi + 4*rax + 96]
	vpminsd	ymm0, ymm0, ymm8
	vpminsd	ymm1, ymm1, ymm9
	vpminsd	ymm2, ymm2, ymm10
	vpminsd	ymm3, ymm3, ymm11
	vpmaxsd	ymm4, ymm4, ymm8
	vpmaxsd	ymm5, ymm5, ymm9
	vpmaxsd	ymm6, ymm6, ymm10
	vpmaxsd	ymm7, ymm7, ymm11
	add	rax, 32
	cmp	r9, rax
	jne	.LBB4_5
# %bb.6:
	vpmaxsd	ymm4, ymm4, ymm5
	vpmaxsd	ymm4, ymm4, ymm6
	vpmaxsd	ymm4, ymm4, ymm7
	vextracti128	xmm5, ymm4, 1
	vpmaxsd	xmm4, xmm4, xmm5
	vpshufd	xmm5, xmm4, 78                  # xmm5 = xmm4[2,3,0,1]
	vpmaxsd	xmm4, xmm4, xmm5
	vpshufd	xmm5, xmm4, 229                 # xmm5 = xmm4[1,1,2,3]
	vpmaxsd	xmm4, xmm4, xmm5
	vmovd	r10d, xmm4
	vpminsd	ymm0, ymm0, ymm1
	vpminsd	ymm0, ymm0, ymm2
	vpminsd	ymm0, ymm0, ymm3
	vextracti128	xmm1, ymm0, 1
	vpminsd	xmm0, xmm0, xmm1
	vpshufd	xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]
	vpminsd	xmm0, xmm0, xmm1
	vpshufd	xmm1, xmm0, 229                 # xmm1 = xmm0[1,1,2,3]
	vpminsd	xmm0, xmm0, xmm1
	vmovd	eax, xmm0
	mov	esi, r10d
	cmp	r9, r8
	je	.LBB4_8
	.p2align	4, 0x90
.LBB4_7:                                # =>This Inner Loop Header: Depth=1
	mov	esi, dword ptr [rdi + 4*r9]
	cmp	eax, esi
	cmovg	eax, esi
	cmp	r10d, esi
	cmovge	esi, r10d
	add	r9, 1
	mov	r10d, esi
	cmp	r8, r9
	jne	.LBB4_7
.LBB4_8:
	mov	dword ptr [rcx], esi
	mov	dword ptr [rdx], eax
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.Lfunc_end4:
	.size	int32_max_min_avx2, .Lfunc_end4-int32_max_min_avx2
                                        # -- End function
	.globl	uint32_max_min_avx2             # -- Begin function uint32_max_min_avx2
	.p2align	4, 0x90
	.type	uint32_max_min_avx2,@function
uint32_max_min_avx2:                    # @uint32_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB5_1
# %bb.2:
	mov	r8d, esi
	cmp	esi, 31
	ja	.LBB5_4
# %bb.3:
	xor	r9d, r9d
	mov	eax, -1
	xor	r10d, r10d
	jmp	.LBB5_7
.LBB5_1:
	mov	eax, -1
	xor	esi, esi
	jmp	.LBB5_8
.LBB5_4:
	mov	r9d, r8d
	and	r9d, -32
	vpxor	xmm4, xmm4, xmm4
	vpcmpeqd	ymm0, ymm0, ymm0
	xor	eax, eax
	vpcmpeqd	ymm1, ymm1, ymm1
	vpcmpeqd	ymm2, ymm2, ymm2
	vpcmpeqd	ymm3, ymm3, ymm3
	vpxor	xmm5, xmm5, xmm5
	vpxor	xmm6, xmm6, xmm6
	vpxor	xmm7, xmm7, xmm7
	.p2align	4, 0x90
.LBB5_5:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm8, ymmword ptr [rdi + 4*rax]
	vmovdqu	ymm9, ymmword ptr [rdi + 4*rax + 32]
	vmovdqu	ymm10, ymmword ptr [rdi + 4*rax + 64]
	vmovdqu	ymm11, ymmword ptr [rdi + 4*rax + 96]
	vpminud	ymm0, ymm0, ymm8
	vpminud	ymm1, ymm1, ymm9
	vpminud	ymm2, ymm2, ymm10
	vpminud	ymm3, ymm3, ymm11
	vpmaxud	ymm4, ymm4, ymm8
	vpmaxud	ymm5, ymm5, ymm9
	vpmaxud	ymm6, ymm6, ymm10
	vpmaxud	ymm7, ymm7, ymm11
	add	rax, 32
	cmp	r9, rax
	jne	.LBB5_5
# %bb.6:
	vpmaxud	ymm4, ymm4, ymm5
	vpmaxud	ymm4, ymm4, ymm6
	vpmaxud	ymm4, ymm4, ymm7
	vextracti128	xmm5, ymm4, 1
	vpmaxud	xmm4, xmm4, xmm5
	vpshufd	xmm5, xmm4, 78                  # xmm5 = xmm4[2,3,0,1]
	vpmaxud	xmm4, xmm4, xmm5
	vpshufd	xmm5, xmm4, 229                 # xmm5 = xmm4[1,1,2,3]
	vpmaxud	xmm4, xmm4, xmm5
	vmovd	r10d, xmm4
	vpminud	ymm0, ymm0, ymm1
	vpminud	ymm0, ymm0, ymm2
	vpminud	ymm0, ymm0, ymm3
	vextracti128	xmm1, ymm0, 1
	vpminud	xmm0, xmm0, xmm1
	vpshufd	xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]
	vpminud	xmm0, xmm0, xmm1
	vpshufd	xmm1, xmm0, 229                 # xmm1 = xmm0[1,1,2,3]
	vpminud	xmm0, xmm0, xmm1
	vmovd	eax, xmm0
	mov	esi, r10d
	cmp	r9, r8
	je	.LBB5_8
	.p2align	4, 0x90
.LBB5_7:                                # =>This Inner Loop Header: Depth=1
	mov	esi, dword ptr [rdi + 4*r9]
	cmp	eax, esi
	cmovae	eax, esi
	cmp	r10d, esi
	cmova	esi, r10d
	add	r9, 1
	mov	r10d, esi
	cmp	r8, r9
	jne	.LBB5_7
.LBB5_8:
	mov	dword ptr [rcx], esi
	mov	dword ptr [rdx], eax
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.Lfunc_end5:
	.size	uint32_max_min_avx2, .Lfunc_end5-uint32_max_min_avx2
                                        # -- End function
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3                               # -- Begin function int64_max_min_avx2
.LCPI6_0:
	.quad	-9223372036854775808            # 0x8000000000000000
.LCPI6_1:
	.quad	9223372036854775807             # 0x7fffffffffffffff
	.text
	.globl	int64_max_min_avx2
	.p2align	4, 0x90
	.type	int64_max_min_avx2,@function
int64_max_min_avx2:                     # @int64_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	movabs	rax, 9223372036854775807
	test	esi, esi
	jle	.LBB6_1
# %bb.2:
	mov	r8d, esi
	cmp	esi, 15
	ja	.LBB6_4
# %bb.3:
	lea	r10, [rax + 1]
	xor	r9d, r9d
	jmp	.LBB6_7
.LBB6_1:
	lea	rsi, [rax + 1]
	jmp	.LBB6_8
.LBB6_4:
	mov	r9d, r8d
	vpbroadcastq	ymm4, qword ptr [rip + .LCPI6_0] # ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	and	r9d, -16
	vpbroadcastq	ymm0, qword ptr [rip + .LCPI6_1] # ymm0 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
	xor	eax, eax
	vmovdqa	ymm3, ymm0
	vmovdqa	ymm2, ymm0
	vmovdqa	ymm1, ymm0
	vmovdqa	ymm7, ymm4
	vmovdqa	ymm6, ymm4
	vmovdqa	ymm5, ymm4
	.p2align	4, 0x90
.LBB6_5:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm8, ymmword ptr [rdi + 8*rax]
	vpcmpgtq	ymm9, ymm8, ymm0
	vblendvpd	ymm0, ymm8, ymm0, ymm9
	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 32]
	vpcmpgtq	ymm10, ymm9, ymm3
	vblendvpd	ymm3, ymm9, ymm3, ymm10
	vmovdqu	ymm10, ymmword ptr [rdi + 8*rax + 64]
	vpcmpgtq	ymm11, ymm10, ymm2
	vblendvpd	ymm2, ymm10, ymm2, ymm11
	vmovdqu	ymm11, ymmword ptr [rdi + 8*rax + 96]
	vpcmpgtq	ymm12, ymm11, ymm1
	vblendvpd	ymm1, ymm11, ymm1, ymm12
	vpcmpgtq	ymm12, ymm4, ymm8
	vblendvpd	ymm4, ymm8, ymm4, ymm12
	vpcmpgtq	ymm8, ymm7, ymm9
	vblendvpd	ymm7, ymm9, ymm7, ymm8
	vpcmpgtq	ymm8, ymm6, ymm10
	vblendvpd	ymm6, ymm10, ymm6, ymm8
	vpcmpgtq	ymm8, ymm5, ymm11
	vblendvpd	ymm5, ymm11, ymm5, ymm8
	add	rax, 16
	cmp	r9, rax
	jne	.LBB6_5
# %bb.6:
	vpcmpgtq	ymm8, ymm4, ymm7
	vblendvpd	ymm4, ymm7, ymm4, ymm8
	vpcmpgtq	ymm7, ymm4, ymm6
	vblendvpd	ymm4, ymm6, ymm4, ymm7
	vpcmpgtq	ymm6, ymm4, ymm5
	vblendvpd	ymm4, ymm5, ymm4, ymm6
	vextractf128	xmm5, ymm4, 1
	vpcmpgtq	xmm6, xmm4, xmm5
	vblendvpd	xmm4, xmm5, xmm4, xmm6
	vpermilps	xmm5, xmm4, 78          # xmm5 = xmm4[2,3,0,1]
	vpcmpgtq	xmm6, xmm4, xmm5
	vblendvpd	xmm4, xmm5, xmm4, xmm6
	vmovq	r10, xmm4
	vpcmpgtq	ymm4, ymm3, ymm0
	vblendvpd	ymm0, ymm3, ymm0, ymm4
	vpcmpgtq	ymm3, ymm2, ymm0
	vblendvpd	ymm0, ymm2, ymm0, ymm3
	vpcmpgtq	ymm2, ymm1, ymm0
	vblendvpd	ymm0, ymm1, ymm0, ymm2
	vextractf128	xmm1, ymm0, 1
	vpcmpgtq	xmm2, xmm1, xmm0
	vblendvpd	xmm0, xmm1, xmm0, xmm2
	vpermilps	xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
	vpcmpgtq	xmm2, xmm1, xmm0
	vblendvpd	xmm0, xmm1, xmm0, xmm2
	vmovq	rax, xmm0
	mov	rsi, r10
	cmp	r9, r8
	je	.LBB6_8
	.p2align	4, 0x90
.LBB6_7:                                # =>This Inner Loop Header: Depth=1
	mov	rsi, qword ptr [rdi + 8*r9]
	cmp	rax, rsi
	cmovg	rax, rsi
	cmp	r10, rsi
	cmovge	rsi, r10
	add	r9, 1
	mov	r10, rsi
	cmp	r8, r9
	jne	.LBB6_7
.LBB6_8:
	mov	qword ptr [rcx], rsi
	mov	qword ptr [rdx], rax
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.Lfunc_end6:
	.size	int64_max_min_avx2, .Lfunc_end6-int64_max_min_avx2
                                        # -- End function
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3                               # -- Begin function uint64_max_min_avx2
.LCPI7_0:
	.quad	-9223372036854775808            # 0x8000000000000000
	.text
	.globl	uint64_max_min_avx2
	.p2align	4, 0x90
	.type	uint64_max_min_avx2,@function
uint64_max_min_avx2:                    # @uint64_max_min_avx2
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	test	esi, esi
	jle	.LBB7_1
# %bb.2:
	mov	r8d, esi
	cmp	esi, 15
	ja	.LBB7_4
# %bb.3:
	mov	rax, -1
	xor	r9d, r9d
	xor	r10d, r10d
	jmp	.LBB7_7
.LBB7_1:
	mov	rax, -1
	xor	esi, esi
	jmp	.LBB7_8
.LBB7_4:
	mov	r9d, r8d
	and	r9d, -16
	vpxor	xmm5, xmm5, xmm5
	vpcmpeqd	ymm1, ymm1, ymm1
	xor	eax, eax
	vpbroadcastq	ymm0, qword ptr [rip + .LCPI7_0] # ymm0 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
	vpcmpeqd	ymm4, ymm4, ymm4
	vpcmpeqd	ymm3, ymm3, ymm3
	vpcmpeqd	ymm2, ymm2, ymm2
	vpxor	xmm8, xmm8, xmm8
	vpxor	xmm7, xmm7, xmm7
	vpxor	xmm6, xmm6, xmm6
	.p2align	4, 0x90
.LBB7_5:                                # =>This Inner Loop Header: Depth=1
	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax]
	vpxor	ymm10, ymm1, ymm0
	vpxor	ymm11, ymm9, ymm0
	vpcmpgtq	ymm10, ymm11, ymm10
	vblendvpd	ymm1, ymm9, ymm1, ymm10
	vpxor	ymm10, ymm5, ymm0
	vpcmpgtq	ymm10, ymm10, ymm11
	vblendvpd	ymm5, ymm9, ymm5, ymm10
	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 32]
	vpxor	ymm10, ymm4, ymm0
	vpxor	ymm11, ymm9, ymm0
	vpcmpgtq	ymm10, ymm11, ymm10
	vblendvpd	ymm4, ymm9, ymm4, ymm10
	vpxor	ymm10, ymm8, ymm0
	vpcmpgtq	ymm10, ymm10, ymm11
	vmovdqu	ymm11, ymmword ptr [rdi + 8*rax + 64]
	vblendvpd	ymm8, ymm9, ymm8, ymm10
	vpxor	ymm9, ymm3, ymm0
	vpxor	ymm10, ymm11, ymm0
	vpcmpgtq	ymm9, ymm10, ymm9
	vblendvpd	ymm3, ymm11, ymm3, ymm9
	vpxor	ymm9, ymm7, ymm0
	vpcmpgtq	ymm9, ymm9, ymm10
	vblendvpd	ymm7, ymm11, ymm7, ymm9
	vmovdqu	ymm9, ymmword ptr [rdi + 8*rax + 96]
	vpxor	ymm10, ymm2, ymm0
	vpxor	ymm11, ymm9, ymm0
	vpcmpgtq	ymm10, ymm11, ymm10
	vblendvpd	ymm2, ymm9, ymm2, ymm10
	vpxor	ymm10, ymm6, ymm0
	vpcmpgtq	ymm10, ymm10, ymm11
	vblendvpd	ymm6, ymm9, ymm6, ymm10
	add	rax, 16
	cmp	r9, rax
	jne	.LBB7_5
# %bb.6:
	vpxor	ymm9, ymm8, ymm0
	vpxor	ymm10, ymm5, ymm0
	vpcmpgtq	ymm9, ymm10, ymm9
	vblendvpd	ymm5, ymm8, ymm5, ymm9
	vxorpd	ymm8, ymm5, ymm0
	vpxor	ymm9, ymm7, ymm0
	vpcmpgtq	ymm8, ymm8, ymm9
	vblendvpd	ymm5, ymm7, ymm5, ymm8
	vxorpd	ymm7, ymm5, ymm0
	vpxor	ymm8, ymm6, ymm0
	vpcmpgtq	ymm7, ymm7, ymm8
	vblendvpd	ymm5, ymm6, ymm5, ymm7
	vextractf128	xmm6, ymm5, 1
	vxorpd	xmm8, xmm6, xmm0
	vxorpd	xmm7, xmm5, xmm0
	vpcmpgtq	xmm7, xmm7, xmm8
	vblendvpd	xmm5, xmm6, xmm5, xmm7
	vpermilps	xmm6, xmm5, 78          # xmm6 = xmm5[2,3,0,1]
	vxorpd	xmm8, xmm5, xmm0
	vxorpd	xmm7, xmm6, xmm0
	vpcmpgtq	xmm7, xmm8, xmm7
	vblendvpd	xmm5, xmm6, xmm5, xmm7
	vpxor	ymm6, ymm1, ymm0
	vpxor	ymm7, ymm4, ymm0
	vpcmpgtq	ymm6, ymm7, ymm6
	vblendvpd	ymm1, ymm4, ymm1, ymm6
	vxorpd	ymm4, ymm1, ymm0
	vpxor	ymm6, ymm3, ymm0
	vpcmpgtq	ymm4, ymm6, ymm4
	vblendvpd	ymm1, ymm3, ymm1, ymm4
	vmovq	r10, xmm5
	vxorpd	ymm3, ymm1, ymm0
	vpxor	ymm4, ymm2, ymm0
	vpcmpgtq	ymm3, ymm4, ymm3
	vblendvpd	ymm1, ymm2, ymm1, ymm3
	vextractf128	xmm2, ymm1, 1
	vxorpd	xmm3, xmm1, xmm0
	vxorpd	xmm4, xmm2, xmm0
	vpcmpgtq	xmm3, xmm4, xmm3
	vblendvpd	xmm1, xmm2, xmm1, xmm3
	vpermilps	xmm2, xmm1, 78          # xmm2 = xmm1[2,3,0,1]
	vxorpd	xmm3, xmm1, xmm0
	vxorpd	xmm0, xmm2, xmm0
	vpcmpgtq	xmm0, xmm0, xmm3
	vblendvpd	xmm0, xmm2, xmm1, xmm0
	vmovq	rax, xmm0
	mov	rsi, r10
	cmp	r9, r8
	je	.LBB7_8
	.p2align	4, 0x90
.LBB7_7:                                # =>This Inner Loop Header: Depth=1
	mov	rsi, qword ptr [rdi + 8*r9]
	cmp	rax, rsi
	cmovae	rax, rsi
	cmp	r10, rsi
	cmova	rsi, r10
	add	r9, 1
	mov	r10, rsi
	cmp	r8, r9
	jne	.LBB7_7
.LBB7_8:
	mov	qword ptr [rcx], rsi
	mov	qword ptr [rdx], rax
	mov	rsp, rbp
	pop	rbp
	vzeroupper
	ret
.Lfunc_end7:
	.size	uint64_max_min_avx2, .Lfunc_end7-uint64_max_min_avx2
                                        # -- End function
	.ident	"Debian clang version 11.0.1-2"
	.section	".note.GNU-stack","",@progbits
	.addrsig