File: asm_vecMul_sse.s

package info (click to toggle)
golang-github-gorgonia-vecf64 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 252 kB
  • sloc: asm: 601; sh: 15; makefile: 2
file content (65 lines) | stat: -rw-r--r-- 1,015 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// +build sse
// +build amd64

#include "textflag.h"

// func mulAsm(a, b []float64)
//
// mulAsm multiplies a by b element-wise and stores the products back into a:
//
//	a[i] = a[i] * b[i]   for 0 <= i < min(len(a), len(b))
//
// b is only read. Elements of a at or beyond len(b) are left untouched, so a
// short b can never cause a read past the end of its backing array.
//
// Argument frame (Go stack-based assembly ABI, 48 bytes, no results):
//	a_base+0(FP)  a_len+8(FP)   a_cap+16(FP)
//	b_base+24(FP) b_len+32(FP)  b_cap+40(FP)
//
// Register roles:
//	SI     cursor into a (read and written)
//	DI     cursor into b (read only)
//	AX     elements left to process (biased by -8 inside the unrolled loop)
//	BX     len(b), used only for the length clamp
//	X0-X7  scratch
//
// All packed loads and stores use MOVUPD. Go only guarantees 8-byte alignment
// for float64 data (e.g. x[1:] of a 16-byte aligned array is misaligned), and
// MOVAPD faults on any address that is not 16-byte aligned. For the same
// reason MULPD is never given a memory operand: legacy SSE memory operands
// must be 16-byte aligned too.
TEXT ·mulAsm(SB), NOSPLIT, $0-48
	MOVQ a_base+0(FP), SI
	MOVQ b_base+24(FP), DI // use destination index register for this

	MOVQ a_len+8(FP), AX  // len(a) into AX
	MOVQ b_len+32(FP), BX // len(b) into BX

	// AX = min(len(a), len(b)); lengths are non-negative, signed compare is fine
	CMPQ    BX, AX
	CMOVQLT BX, AX

	// check if there are at least 8 elements
	SUBQ $8, AX
	JL   remainder

loop:
	// Invariant: at least 8 elements remain at SI/DI; AX = remaining - 8.
	// Four independent 2-wide multiplies per iteration.

	// a[0:2] *= b[0:2]
	MOVUPD (SI), X0
	MOVUPD (DI), X1
	MULPD  X0, X1
	MOVUPD X1, (SI)

	// a[2:4] *= b[2:4]
	MOVUPD 16(SI), X2
	MOVUPD 16(DI), X3
	MULPD  X2, X3
	MOVUPD X3, 16(SI)

	// a[4:6] *= b[4:6]
	MOVUPD 32(SI), X4
	MOVUPD 32(DI), X5
	MULPD  X4, X5
	MOVUPD X5, 32(SI)

	// a[6:8] *= b[6:8]
	MOVUPD 48(SI), X6
	MOVUPD 48(DI), X7
	MULPD  X6, X7
	MOVUPD X7, 48(SI)

	// update pointers. 4 registers, 2 elements at once, each element is 8 bytes
	ADDQ $64, SI
	ADDQ $64, DI

	// 4*2 fewer elements remain; keep looping while a full group of 8 is left
	SUBQ $8, AX
	JGE  loop

remainder:
	// undo the -8 bias: AX is now the true leftover count (0..7)
	ADDQ $8, AX
	JE   done

remainderloop:
	// scalar tail, one element per iteration
	MOVSD (SI), X0
	MOVSD (DI), X1
	MULSD X0, X1
	MOVSD X1, (SI)

	// advance both cursors by one float64
	ADDQ $8, SI
	ADDQ $8, DI

	DECQ AX
	JNE  remainderloop

done:
	RET