File: axpyinc_amd64.s

package info (click to toggle)
golang-gonum-v1-gonum 0.15.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,792 kB
  • sloc: asm: 6,252; fortran: 5,271; sh: 377; ruby: 211; makefile: 98
file content (151 lines) | stat: -rw-r--r-- 4,686 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// The SSE3 instructions used by this file are emitted as raw machine-code
// bytes instead of assembler mnemonics. The comment above each macro gives the
// instruction in Go assembler operand order: source first, destination second.
//
// Byte layout of every macro:
//	mandatory prefix (0xF3 or 0xF2), optional REX byte, 0x0F escape,
//	opcode (0x16 = MOVSHDUP, 0x12 = MOVSLDUP, 0xD0 = ADDSUBPS),
//	ModRM byte with mod=11, reg=destination, rm=source.
//
// Instruction semantics on four packed float32 lanes:
//	MOVSHDUP src, dst   dst = { src[3], src[3], src[1], src[1] }
//	MOVSLDUP src, dst   dst = { src[2], src[2], src[0], src[0] }
//	ADDSUBPS src, dst   dst[even] -= src[even]; dst[odd] += src[odd]

// MOVSHDUP X3, X2 (ModRM 0xD3: reg=2, rm=3)
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3 (ModRM 0xDB: reg=3, rm=3)
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3 (ModRM 0xDA: reg=3, rm=2)
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4 (ModRM 0xE5: reg=4, rm=5)
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5 (ModRM 0xED: reg=5, rm=5)
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5 (ModRM 0xEC: reg=5, rm=4)
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6 (ModRM 0xF7: reg=6, rm=7)
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7 (ModRM 0xFF: reg=7, rm=7)
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7 (ModRM 0xFE: reg=7, rm=6)
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// The X8/X9 variants carry the REX prefix 0x45 (REX.R and REX.B set), which
// extends both the reg and rm fields of ModRM into the X8-X15 range.

// MOVSHDUP X9, X8 (REX 0x45, ModRM 0xC1: reg=8, rm=9)
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9 (REX 0x45, ModRM 0xC9: reg=9, rm=9)
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9 (REX 0x45, ModRM 0xC8: reg=9, rm=8)
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
//
// AxpyInc performs, for each i in [0, n):
//	y[iy + i*incY] += alpha * x[ix + i*incX]
// Only the base pointers of x and y are read; the slice lengths are never
// consulted, so no bounds checking happens in this routine.
//
// Register roles:
//	SI       read pointer into x
//	DX       read pointer into y (unrolled loop only)
//	DI       write pointer into y (read and write pointer in the tail)
//	R8, R9   incX, incY scaled to bytes
//	BX       n / 4, iteration count of the 4x unrolled loop
//	CX       n % 4, elements handled one at a time by the tail
//	X0, X10  alpha:         low pair = { imag(a), real(a) }
//	X1, X11  alpha swapped: low pair = { real(a), imag(a) }
// Lane lists in the comments are written from the highest lane to the lowest.
// Only the low 64 bits (one complex64) of each result register are stored.
TEXT ·AxpyInc(SB), NOSPLIT, $0
	MOVQ   x_base+8(FP), SI  // SI = &x
	MOVQ   y_base+32(FP), DI // DI = &y
	MOVQ   n+56(FP), CX      // CX = n
	TESTQ  CX, CX            // if n==0 { return }
	JE     axpyi_end
	MOVQ   ix+80(FP), R8     // R8 = ix
	MOVQ   iy+88(FP), R9     // R9 = iy
	LEAQ   (SI)(R8*8), SI    // SI = &(x[ix])
	LEAQ   (DI)(R9*8), DI    // DI = &(y[iy])
	MOVQ   DI, DX            // DX = DI    // Separate read (DX) and write (DI) pointers
	MOVQ   incX+64(FP), R8   // R8 = incX
	SHLQ   $3, R8            // R8 *= sizeof(complex64)
	MOVQ   incY+72(FP), R9   // R9 = incY
	SHLQ   $3, R9            // R9 *= sizeof(complex64)
	MOVSD  alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
	MOVAPS X0, X1
	SHUFPS $0x11, X1, X1     // X1 = { real(a), imag(a), real(a), imag(a) }
	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $3, CX            // CX = n % 4
	SHRQ   $2, BX            // BX = floor( n / 4 ); sets ZF for the branch below
	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }

axpyi_loop: // do {
	// Load four consecutive strided elements of x.
	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
	MOVSD (SI)(R8*1), X5
	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
	MOVSD (SI), X7
	MOVSD (SI)(R8*1), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
	//  }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
	MOVSD (DX), X2
	MOVSD (DX)(R9*1), X4
	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
	MOVSD (DX), X6
	MOVSD (DX)(R9*1), X8
	ADDPS X2, X3
	ADDPS X4, X5
	ADDPS X6, X7
	ADDPS X8, X9

	MOVSD X3, (DI)       // y[i] = X_i
	MOVSD X5, (DI)(R9*1)
	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
	MOVSD X7, (DI)
	MOVSD X9, (DI)(R9*1)
	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
	DECQ  BX
	JNZ   axpyi_loop     // }  while --BX > 0
	TESTQ CX, CX         // if CX == 0 { return }
	JE    axpyi_end

	// Invariant: CX >= 1 whenever the tail is entered.
axpyi_tail: // do {
	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }

	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
	MULPS X1, X2
	MULPS X0, X3

	// X_i = {
	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
	//  }
	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)

	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
	MOVSD (DI), X4
	ADDPS X4, X3
	MOVSD X3, (DI)   // y[i] = X_i
	ADDQ  R8, SI     // SI += incX (bytes)
	ADDQ  R9, DI     // DI += incY (bytes)
	DECQ  CX         // DECQ/JNZ instead of the slow legacy LOOP instruction
	JNZ   axpyi_tail // } while --CX > 0

axpyi_end:
	RET