File: asm_vecInvSqrt_sse.s

package info (click to toggle)
golang-github-gorgonia-vecf64 0.9.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 252 kB
  • sloc: asm: 601; sh: 15; makefile: 2
file content (72 lines) | stat: -rw-r--r-- 1,306 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// +build sse
// +build amd64
// +build !fastmath

/*
InvSqrt is a function that inverse square roots (1/√x) each element in a []float64

The SSE version uses SHUFPD to "broadcast" the 1.0 constant to the X1 register. The rest proceeds as expected
*/
#include "textflag.h"

#define one 0x3ff0000000000000

// func InvSqrt(a []float64)
TEXT ·InvSqrt(SB), NOSPLIT, $0
	MOVQ a_data+0(FP), SI
	MOVQ SI, CX
	MOVQ a_len+8(FP), AX  // len(a) into AX

	// make sure that len(a) >= 1
	XORQ BX, BX
	CMPQ BX, AX
	JGE  done

	MOVQ $one, DX

	SUBQ $2, AX
	JL   remainder

	// back up the first element of the slice
	MOVQ (SI), BX
	MOVQ DX, (SI)

	// broadcast 1.0 to all elements of X1
	// 0x00 shuffles the least significant bits of the X1 reg, which means the first element is repeated
	MOVUPD (SI), X1
	SHUFPD $0x00, X1, X1

	MOVAPD X1, X2 // backup, because X1 will get clobbered in DIVPD

	// restore the first element now we're done
	MOVQ BX, (SI)

loop:
	MOVAPD X2, X1
	SQRTPD (SI), X0
	DIVPD  X0, X1
	MOVUPD X1, (SI)

	// we processed 2 elements. Each element is 8 bytes. So jump 16 ahead
	ADDQ $16, SI

	SUBQ $2, AX
	JGE  loop

remainder:
	ADDQ $2, AX
	JE   done

remainder1:
	MOVQ   DX, X1
	MOVSD  (SI), X0
	SQRTSD X0, X0
	DIVSD  X0, X1
	MOVSD  X1, (SI)

	ADDQ $8, SI
	DECQ AX
	JNE  remainder1

done:
	RET