File: sum_amd64.s

package info (click to toggle)
golang-gonum-v1-gonum 0.15.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,792 kB
  • sloc: asm: 6,252; fortran: 5,271; sh: 377; ruby: 211; makefile: 98
file content (100 lines) | stat: -rw-r--r-- 2,240 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
// Copyright ©2021 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register roles used by Sum:
//   X_PTR  base address of the slice data
//   IDX    index (in elements) of the next float32 to read
//   LEN    elements still to process; after no_trim, the number of
//          16-element iterations left in the main loop
//   TAIL   scratch: first the byte distance to the next 16-byte boundary,
//          then the element count whose low four bits select the tail steps
//   SUM    primary accumulator; lane 0 holds the returned value
//   SUM_1, SUM_2, SUM_3  additional accumulators for the unrolled loop
#define X_PTR SI
#define IDX AX
#define LEN CX
#define TAIL BX
#define SUM X0
#define SUM_1 X1
#define SUM_2 X2
#define SUM_3 X3

// func Sum(x []float32) float32
//
// Sum returns the sum of the elements of x, or 0 when x is empty.
//
// Outline:
//   1. Leading elements are added one at a time until the address of the
//      next element is 16-byte aligned, because the packed ADDPS memory
//      operands used below require 16-byte alignment.
//   2. The main loop consumes 16 elements per iteration into four
//      independent 4-lane accumulators.
//   3. The remaining 0..15 elements are handled in steps of 8, 4, 2 and 1,
//      chosen by the low bits of the remaining count, interleaved with the
//      reduction of the accumulators down to lane 0 of SUM.
//
// Elements are accumulated into several partial sums, so the additions are
// not performed in strict index order.
//
// Arguments are read from the slice header at 0(FP) (base) and 8(FP) (len);
// the result is written to 24(FP). The frame size is $0.
TEXT ·Sum(SB), NOSPLIT, $0
	MOVQ x_base+0(FP), X_PTR // X_PTR = &x[0]
	MOVQ x_len+8(FP), LEN    // LEN = len(x)
	XORQ IDX, IDX            // i = 0
	PXOR SUM, SUM            // SUM = 0 in every lane
	CMPQ LEN, $0             // if LEN == 0 { return 0 }
	JE   sum_end

	// Clear the extra accumulators; they are folded into SUM at
	// sum_tail8/sum_tail4 even when the main loop is skipped.
	PXOR SUM_1, SUM_1
	PXOR SUM_2, SUM_2
	PXOR SUM_3, SUM_3

	MOVQ X_PTR, TAIL // Check memory alignment
	ANDQ $15, TAIL   // TAIL = &x[0] % 16
	JZ   no_trim     // if TAIL == 0 { goto no_trim } (already aligned)
	SUBQ $16, TAIL   // TAIL = -(bytes up to the next 16-byte boundary)

	// NOTE(review): assumes &x[0] is a multiple of 4, so TAIL reaches
	// exactly 0 in steps of 4. Otherwise this loop only ends when LEN
	// reaches 0.
sum_align: // Scalar adds until 16-byte aligned: do {
	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
	INCQ  IDX                 // i++
	DECQ  LEN                 // LEN--
	JZ    sum_end             // if LEN == 0 { return } (lane 0 holds the whole sum)
	ADDQ  $4, TAIL            // TAIL += 4 (one float32 closer to the boundary)
	JNZ   sum_align           // } while TAIL != 0

no_trim:
	MOVQ LEN, TAIL // TAIL = remaining elements; bits 3..0 select the tail steps
	SHRQ $4, LEN   // LEN = floor( remaining / 16 )
	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }


sum_loop: // sum 16x wide do {
	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDPS 32(X_PTR)(IDX*4), SUM_2 // SUM_2 += x[i+8:i+12]
	ADDPS 48(X_PTR)(IDX*4), SUM_3 // SUM_3 += x[i+12:i+16]

	ADDQ  $16, IDX                // i += 16
	DECQ  LEN
	JNZ   sum_loop                // } while --LEN > 0

sum_tail8:
	// Fold four accumulators into two (SUM and SUM_1).
	ADDPS SUM_3, SUM
	ADDPS SUM_2, SUM_1

	TESTQ $8, TAIL  // if remaining & 8 == 0 { goto sum_tail4 }
	JZ    sum_tail4

	ADDPS (X_PTR)(IDX*4), SUM     // SUM   += x[i:i+4]
	ADDPS 16(X_PTR)(IDX*4), SUM_1 // SUM_1 += x[i+4:i+8]
	ADDQ  $8, IDX                 // i += 8

sum_tail4:
	ADDPS SUM_1, SUM // fold the two accumulators into SUM

	TESTQ $4, TAIL  // if remaining & 4 == 0 { goto sum_tail2 }
	JZ    sum_tail2

	ADDPS (X_PTR)(IDX*4), SUM // SUM += x[i:i+4]
	ADDQ  $4, IDX             // i += 4

sum_tail2:
	HADDPS SUM, SUM            // SUM = { s0+s1, s2+s3, s0+s1, s2+s3 }

	TESTQ $2, TAIL  // if remaining & 2 == 0 { goto sum_tail1 }
	JZ    sum_tail1

	MOVSD (X_PTR)(IDX*4), SUM_1 // load x[i:i+2] as one 8-byte value; SUM_1 is free to reuse
	ADDPS SUM_1, SUM            // SUM[0:2] += x[i:i+2]; only lanes 0 and 1 feed the result
	ADDQ  $2, IDX               // i += 2

sum_tail1:
	HADDPS SUM, SUM // SUM[0] = SUM[0] + SUM[1]

	TESTQ $1, TAIL  // if remaining & 1 == 0 { goto sum_end }
	JZ    sum_end

	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i] (last element)

sum_end: // return SUM[0]
	MOVSS SUM, ret+24(FP)
	RET