File: dscalunitary_amd64.s

package info (click to toggle)
golang-gonum-v1-gonum 0.15.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 18,792 kB
  • sloc: asm: 6,252; fortran: 5,271; sh: 377; ruby: 211; makefile: 98
file content (66 lines) | stat: -rw-r--r-- 1,706 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build !noasm,!gccgo,!safe

#include "textflag.h"

// Register assignments for DscalUnitary.
//
// SRC and DST are two names for the same register (SI): elements are
// loaded through SRC and stored back through DST at the same index.
#define SRC SI
#define DST SI
// LEN holds len(x) on entry and is then shifted right by 2 to become the
// iteration count of the 4-way unrolled loop.
#define LEN CX
// IDX is the running index used with a scale of 8 bytes in the
// (SRC)(IDX*8) / (DST)(IDX*8) addressing modes.
#define IDX AX
// TAIL keeps a copy of the original length; its low two bits give the
// number of elements left over after the unrolled loop.
#define TAIL BX
// ALPHA is the scale factor register; ALPHA_2 receives a copy of it.
#define ALPHA X0
#define ALPHA_2 X1

// Hand-encoded instruction bytes F2 0F 12 44 24 08, i.e. MOVDDUP 8(SP), X0:
// load the 8 bytes at hardware SP+8 and duplicate them into both
// 64-bit lanes of X0.
#define MOVDDUP_ALPHA    LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0

// func DscalUnitary(alpha float64, x []complex128)
//
// DscalUnitary multiplies every element of x, in place, by the real
// scalar alpha. Each complex128 is handled as a pair of packed float64
// values (real, imag), both of which are multiplied by alpha.
//
// Argument layout (32 bytes of arguments, no results):
//	alpha+0(FP)    float64 scale factor
//	x_base+8(FP)   address of x[0]
//	x_len+16(FP)   len(x)
//	x_cap+24(FP)   cap(x), not read
//
// Registers written: SI, CX, AX, BX, X0-X5 and the flags.
// All loads and stores use MOVUPS, so x is not required to be
// 16-byte aligned.
TEXT ·DscalUnitary(SB), NOSPLIT, $0-32
	MOVQ  x_base+8(FP), SRC // SRC = &x[0]
	MOVQ  x_len+16(FP), LEN // LEN = len(x)
	TESTQ LEN, LEN          // if LEN == 0 { return }
	JZ    dscal_end

	MOVDDUP alpha+0(FP), ALPHA   // ALPHA = { alpha, alpha }
	XORQ    IDX, IDX             // IDX = 0 (counted in float64 slots, 2 per element)
	MOVUPS  ALPHA, ALPHA_2       // Copy ALPHA to ALPHA_2 for pipelining
	MOVQ    LEN, TAIL            // TAIL = len(x)
	SHRQ    $2, LEN              // LEN = floor( len(x) / 4 )
	JZ      dscal_tail           // if LEN == 0 { goto dscal_tail }

dscal_loop: // do { scale 4 complex128 (64 bytes) per iteration
	MOVUPS (SRC)(IDX*8), X2   // X_i = x[i]
	MOVUPS 16(SRC)(IDX*8), X3
	MOVUPS 32(SRC)(IDX*8), X4
	MOVUPS 48(SRC)(IDX*8), X5

	MULPD ALPHA, X2   // X_i *= ALPHA (real and imag lanes together)
	MULPD ALPHA_2, X3
	MULPD ALPHA, X4
	MULPD ALPHA_2, X5

	MOVUPS X2, (DST)(IDX*8)   // x[i] = X_i
	MOVUPS X3, 16(DST)(IDX*8)
	MOVUPS X4, 32(DST)(IDX*8)
	MOVUPS X5, 48(DST)(IDX*8)

	ADDQ $8, IDX    // IDX += 8 (4 elements * 2 float64 each)
	DECQ LEN
	JNZ  dscal_loop // } while --LEN > 0

dscal_tail:
	ANDQ $3, TAIL  // TAIL = len(x) % 4
	JZ   dscal_end // if TAIL == 0 { return }

dscal_tail_loop: // do { scale one complex128 (16 bytes) per iteration
	MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
	MULPD  ALPHA, X2        // X_i *= ALPHA
	MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
	ADDQ   $2, IDX          // IDX += 2 (one element = 2 float64)
	DECQ   TAIL
	JNZ    dscal_tail_loop  // } while --TAIL > 0

dscal_end:
	RET