File: uint64_sse4.s

package info (click to toggle)
golang-github-apache-arrow-go 18.2.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 32,200 kB
  • sloc: asm: 477,547; ansic: 5,369; cpp: 759; sh: 585; makefile: 319; python: 190; sed: 5
file content (108 lines) | stat: -rw-r--r-- 2,139 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
	.text
	.intel_syntax noprefix
	.file	"_lib/uint64.c"
	.globl	sum_uint64_sse4
	.p2align	4, 0x90
	.type	sum_uint64_sse4,@function
# ----------------------------------------------------------------------
# sum_uint64_sse4: add up an array of 64-bit integers and store the total.
#
# Compiler-generated (GAS, Intel syntax) from _lib/uint64.c. The C
# prototype is not visible in this file; presumably something like
#   void sum_uint64_sse4(const uint64_t *buf, size_t len, uint64_t *res)
# -- TODO confirm against the C source.
#
# Inputs, as used by the code below:
#   rdi = base address of the elements (consecutive qwords)
#   rsi = element count
#   rdx = address that receives the 64-bit sum
# Output:
#   [rdx] = sum of the rsi qwords starting at rdi (0 when rsi == 0).
#           All additions are plain 64-bit adds, so the sum wraps.
# Scratch: rax, rcx, rsi, r8, r9, xmm0-xmm7, flags.
#
# Strategy: counts below four use the scalar loop only. Otherwise the
# first r9 = rsi & -4 elements are summed into two 128-bit accumulators
# (xmm0 and xmm1, four elements per step): a peel loop runs
# (step count mod 4) single steps, then an unrolled loop consumes
# sixteen elements per iteration. The accumulators are reduced into rax
# and the scalar loop adds the remaining rsi - r9 elements.
# ----------------------------------------------------------------------
sum_uint64_sse4:                        # @sum_uint64_sse4
# BB#0:
	# Frame setup. "and rsp, -8" only rounds rsp down to an 8-byte
	# boundary; no stack slots are used and rsp is restored from rbp
	# at the common exit.
	push	rbp
	mov	rbp, rsp
	and	rsp, -8
	# Empty input: go store a zero sum.
	test	rsi, rsi
	je	.LBB0_1
# BB#2:
	# Three or fewer elements: scalar loop only.
	cmp	rsi, 3
	jbe	.LBB0_3
# BB#6:
	# r9 = element count rounded down to a multiple of 4 (vectorized part).
	# rsi > 3 here, so r9 cannot be 0 and the branch below is never taken.
	mov	r9, rsi
	and	r9, -4
	je	.LBB0_3
# BB#7:
	# r8 = r9 - 4, so (r8 >> 2) + 1 is the number of 4-element steps.
	# rax = step count mod 4 = steps to peel before the unrolled loop.
	# Only the low two bits survive the mask, so doing the shift and
	# increment in 32 bits loses nothing.
	lea	r8, [r9 - 4]
	mov	eax, r8d
	shr	eax, 2
	inc	eax
	and	rax, 3
	je	.LBB0_8
# BB#9:
	# Peel loop setup: rax = -(peel count) and counts up to 0,
	# rcx = element index, xmm0/xmm1 = zeroed accumulators
	# (two 64-bit lanes each).
	neg	rax
	pxor	xmm0, xmm0
	xor	ecx, ecx
	pxor	xmm1, xmm1
	.p2align	4, 0x90
.LBB0_10:                               # =>This Inner Loop Header: Depth=1
	# One step: elements rcx, rcx+1 go into xmm0 and rcx+2, rcx+3
	# into xmm1; unaligned loads, lane-wise 64-bit adds.
	movdqu	xmm2, xmmword ptr [rdi + 8*rcx]
	movdqu	xmm3, xmmword ptr [rdi + 8*rcx + 16]
	paddq	xmm0, xmm2
	paddq	xmm1, xmm3
	add	rcx, 4
	inc	rax
	jne	.LBB0_10
	jmp	.LBB0_11
.LBB0_3:
	# Scalar-only entry: start at element 0 with a zero sum.
	xor	r9d, r9d
	xor	eax, eax
.LBB0_4:
	# Scalar tail: rcx -> element r9, rsi = elements left, rax = running
	# sum. rsi is non-zero on every path that reaches this point, which
	# the dec/jne loop below relies on.
	lea	rcx, [rdi + 8*r9]
	sub	rsi, r9
	.p2align	4, 0x90
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
	add	rax, qword ptr [rcx]
	add	rcx, 8
	dec	rsi
	jne	.LBB0_5
	jmp	.LBB0_15
.LBB0_1:
	# Empty input: the sum is 0.
	xor	eax, eax
.LBB0_15:
	# Common exit: store the sum through rdx and tear down the frame.
	mov	qword ptr [rdx], rax
	mov	rsp, rbp
	pop	rbp
	ret
.LBB0_8:
	# No peel steps needed: continue at element 0 with zeroed
	# accumulators.
	xor	ecx, ecx
	pxor	xmm0, xmm0
	pxor	xmm1, xmm1
.LBB0_11:
	# r8 < 12 means fewer than four steps in total, so the peel loop
	# already covered all r9 elements; skip the unrolled loop.
	cmp	r8, 12
	jb	.LBB0_14
# BB#12:
	# rax = vector elements still to add (r9 - rcx, a multiple of 16).
	# rcx becomes a byte pointer biased by +112 so that the eight loads
	# of one iteration use displacements -112 through 0.
	mov	rax, r9
	sub	rax, rcx
	lea	rcx, [rdi + 8*rcx + 112]
	.p2align	4, 0x90
.LBB0_13:                               # =>This Inner Loop Header: Depth=1
	# Sixteen elements (eight 16-byte chunks) per iteration. Chunks at
	# -112, -80, -48, -16 fold into xmm0; chunks at -96, -64, -32, 0
	# fold into xmm1. xmm2-xmm7 hold the intermediate partial sums.
	movdqu	xmm2, xmmword ptr [rcx - 112]
	movdqu	xmm3, xmmword ptr [rcx - 96]
	movdqu	xmm4, xmmword ptr [rcx - 80]
	movdqu	xmm5, xmmword ptr [rcx - 64]
	paddq	xmm2, xmm0
	paddq	xmm3, xmm1
	movdqu	xmm6, xmmword ptr [rcx - 48]
	movdqu	xmm7, xmmword ptr [rcx - 32]
	paddq	xmm6, xmm4
	paddq	xmm6, xmm2
	paddq	xmm7, xmm5
	paddq	xmm7, xmm3
	movdqu	xmm0, xmmword ptr [rcx - 16]
	movdqu	xmm1, xmmword ptr [rcx]
	paddq	xmm0, xmm6
	paddq	xmm1, xmm7
	# Advance the pointer by 128 bytes (written as subtracting -128,
	# which fits a sign-extended 8-bit immediate while +128 does not)
	# and count down the sixteen elements just consumed.
	sub	rcx, -128
	add	rax, -16
	jne	.LBB0_13
.LBB0_14:
	# Horizontal reduction: merge the two accumulators, swap the 64-bit
	# halves with pshufd, add, and take the low qword as the vector sum.
	paddq	xmm0, xmm1
	pshufd	xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
	paddq	xmm1, xmm0
	movq	rax, xmm1
	# r9 == rsi means the count was a multiple of 4 and there is no
	# tail; otherwise finish the last rsi - r9 elements in the scalar loop.
	cmp	r9, rsi
	jne	.LBB0_4
	jmp	.LBB0_15
.Lfunc_end0:
	.size	sum_uint64_sse4, .Lfunc_end0-sum_uint64_sse4


	.ident	"Apple LLVM version 9.0.0 (clang-900.0.39.2)"
	.section	".note.GNU-stack","",@progbits