File: axpyunitaryto_amd64.s

package info (click to toggle)
golang-gonum-v1-gonum 0.15.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,792 kB
  • sloc: asm: 6,252; fortran: 5,271; sh: 377; ruby: 211; makefile: 98
file content (157 lines) | stat: -rw-r--r-- 5,435 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA

// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC

// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE

// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8

// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
	MOVQ    dst_base+0(FP), DI // DI = &dst
	MOVQ    x_base+32(FP), SI  // SI = &x
	MOVQ    y_base+56(FP), DX  // DX = &y
	MOVQ    x_len+40(FP), CX
	CMPQ    y_len+64(FP), CX   // CX = min( len(x), len(y), len(dst) )
	CMOVQLE y_len+64(FP), CX
	CMPQ    dst_len+8(FP), CX
	CMOVQLE dst_len+8(FP), CX
	CMPQ    CX, $0             // if CX == 0 { return }
	JE      caxy_end
	MOVSD   alpha+24(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
	SHUFPD  $0, X0, X0         // X0  = { imag(a), real(a), imag(a), real(a) }
	MOVAPS  X0, X1
	SHUFPS  $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
	XORQ    AX, AX             // i = 0
	MOVQ    DX, BX             // Align on 16-byte boundary for ADDPS
	ANDQ    $15, BX            // BX = &y & 15
	JZ      caxy_no_trim       // if BX == 0 { goto caxy_no_trim }

	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
	ADDSUBPS_X2_X3
	MOVSD (DX)(AX*8), X4 // X3 += y[i]
	ADDPS X4, X3
	MOVSD X3, (DI)(AX*8) // dst[i]  = X3
	INCQ  AX             // i++
	DECQ  CX             // --CX
	JZ    caxy_tail      // if BX == 0 { goto caxy_tail }

caxy_no_trim:
	MOVAPS X0, X10   // Copy X0 and X1 for pipelineing
	MOVAPS X1, X11
	MOVQ   CX, BX
	ANDQ   $7, CX    // CX = n % 8
	SHRQ   $3, BX    // BX = floor( n / 8 )
	JZ     caxy_tail // if BX == 0 { goto caxy_tail }

caxy_loop:
	// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
	MOVUPS (SI)(AX*8), X3
	MOVUPS 16(SI)(AX*8), X5
	MOVUPS 32(SI)(AX*8), X7
	MOVUPS 48(SI)(AX*8), X9

	// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) }
	MOVSHDUP_X3_X2
	MOVSHDUP_X5_X4
	MOVSHDUP_X7_X6
	MOVSHDUP_X9_X8

	// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
	MOVSLDUP_X3_X3
	MOVSLDUP_X5_X5
	MOVSLDUP_X7_X7
	MOVSLDUP_X9_X9

	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i]),
	// 		imag(a) * real(x[i+1]), real(a) * real(x[i+1])  }
	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]),
	//		real(a) * imag(x[i+1]), imag(a) * imag(x[i+1])  }
	MULPS X1, X2
	MULPS X0, X3
	MULPS X11, X4
	MULPS X10, X5
	MULPS X1, X6
	MULPS X0, X7
	MULPS X11, X8
	MULPS X10, X9

	// X_i = {
	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
	//  }
	ADDSUBPS_X2_X3
	ADDSUBPS_X4_X5
	ADDSUBPS_X6_X7
	ADDSUBPS_X8_X9

	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
	//	   imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1])  }
	ADDPS  (DX)(AX*8), X3
	ADDPS  16(DX)(AX*8), X5
	ADDPS  32(DX)(AX*8), X7
	ADDPS  48(DX)(AX*8), X9
	MOVUPS X3, (DI)(AX*8)   // y[i:i+1] = X_i
	MOVUPS X5, 16(DI)(AX*8)
	MOVUPS X7, 32(DI)(AX*8)
	MOVUPS X9, 48(DI)(AX*8)
	ADDQ   $8, AX           // i += 8
	DECQ   BX               // --BX
	JNZ    caxy_loop        // }  while BX > 0
	CMPQ   CX, $0           // if CX == 0  { return }
	JE     caxy_end

caxy_tail: // do {
	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }

	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
	//	  real(a)*real(x[i]) - imag(a)*imag(x[i])  }
	ADDSUBPS_X2_X3
	MOVSD (DX)(AX*8), X4 // X3 += y[i]
	ADDPS X4, X3
	MOVSD X3, (DI)(AX*8) // y[i]  = X3
	INCQ  AX             // ++i
	LOOP  caxy_tail      // } while --CX > 0

caxy_end:
	RET