File: asm_vecDiv_sse.s

package info (click to toggle)
golang-github-gorgonia-vecf64 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 252 kB
  • sloc: asm: 601; sh: 15; makefile: 2
file content (70 lines) | stat: -rw-r--r-- 1,068 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// +build sse
// +build amd64

#include "textflag.h"

// func divAsm(a, b []float64)
//
// divAsm divides a by b element-wise and stores the quotient back into a:
// a[i] = a[i] / b[i] for every i in [0, len(a)).
//
// Only len(a) is read. len(b) is never consulted, so the caller must
// guarantee len(b) >= len(a); otherwise this routine reads past the end of b.
//
// Register roles:
//   SI     cursor into a (source and destination)
//   DI     cursor into b (divisors)
//   AX     element counter (biased by -8 while inside the unrolled loop)
//   X0-X7  scratch
//
// Packed loads/stores use MOVUPD rather than MOVAPD. MOVAPD faults unless the
// address is 16-byte aligned, and nothing here establishes that for either
// slice: float64 data is only naturally 8-byte aligned (e.g. a sub-slice
// such as a[1:]).
TEXT ·divAsm(SB), NOSPLIT, $0-48
	MOVQ a_base+0(FP), SI  // SI = &a[0]
	MOVQ b_base+24(FP), DI // DI = &b[0]
	MOVQ a_len+8(FP), AX   // AX = len(a)

	// Bias the counter by -8 so the sign flag tells us whether a full
	// 8-element block is still available.
	SUBQ $8, AX
	JL   remainder

loop:
	// Process 8 elements per iteration: 4 XMM pairs, 2 float64 each.

	// a[0:2] /= b[0:2]
	MOVUPD (SI), X0
	MOVUPD (DI), X1
	DIVPD  X1, X0
	MOVUPD X0, (SI)

	// a[2:4] /= b[2:4]
	MOVUPD 16(SI), X2
	MOVUPD 16(DI), X3
	DIVPD  X3, X2
	MOVUPD X2, 16(SI)

	// a[4:6] /= b[4:6]
	MOVUPD 32(SI), X4
	MOVUPD 32(DI), X5
	DIVPD  X5, X4
	MOVUPD X4, 32(SI)

	// a[6:8] /= b[6:8]
	MOVUPD 48(SI), X6
	MOVUPD 48(DI), X7
	DIVPD  X7, X6
	MOVUPD X6, 48(SI)

	// Advance both cursors: 4 registers * 2 elements * 8 bytes = 64 bytes.
	ADDQ $64, SI
	ADDQ $64, DI

	// 8 fewer elements remain; keep going while a full block is left.
	SUBQ $8, AX
	JGE  loop

remainder:
	// Undo the -8 bias: AX is now the number of leftover elements (0..7).
	ADDQ $8, AX
	JE   done

remainderloop:
	// Scalar tail: one float64 per iteration.
	MOVSD (SI), X0
	MOVSD (DI), X1
	DIVSD X1, X0
	MOVSD X0, (SI)

	// Step both cursors to the next element.
	ADDQ $8, SI
	ADDQ $8, DI

	DECQ AX
	JNE  remainderloop

done:
	RET