1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
|
// +build sse
// +build amd64
// +build !fastmath
/*
InvSqrt computes the inverse square root (1/√x) of each element of a []float64, in place.
The SSE version uses SHUFPD to "broadcast" the 1.0 constant to both lanes of an XMM register; the rest is a packed SQRTPD/DIVPD loop followed by a scalar loop for any leftover element.
*/
#include "textflag.h"
#define one 0x3ff0000000000000
// func InvSqrt(a []float64)
//
// InvSqrt overwrites every element of a with 1/sqrt(a[i]).
//
// Arguments (Go stack-based assembly ABI, 24 bytes: base, len, cap):
//   a_base+0(FP)  pointer to the first element
//   a_len+8(FP)   number of elements
//
// Register roles:
//   SI  cursor into the slice, advanced as elements are consumed
//   AX  elements left to process (biased by -2 while inside the packed loop)
//   DX  bit pattern of float64(1.0)
//   X2  {1.0, 1.0}, the loop-invariant numerator
//   X0  square roots, X1 quotient (both scratch)
//
// The slice data is never assumed to be 16-byte aligned: every packed access
// goes through MOVUPD, because a []float64 is only guaranteed 8-byte alignment
// (e.g. a[1:] of an aligned slice).
TEXT ·InvSqrt(SB), NOSPLIT, $0-24
	MOVQ a_base+0(FP), SI
	MOVQ a_len+8(FP), AX

	// nothing to do unless len(a) >= 1
	TESTQ AX, AX
	JLE   done

	MOVQ $one, DX

	// fewer than 2 elements: go straight to the scalar loop
	SUBQ $2, AX
	JL   remainder

	// broadcast 1.0 to both lanes of X2 directly from DX, without touching the slice.
	// MOVQ places 1.0 in the low lane; SHUFPD with selector 0x00 copies the low lane
	// into the high lane as well.
	MOVQ   DX, X2
	SHUFPD $0x00, X2, X2

loop:
	MOVAPD X2, X1 // fresh numerator, because DIVPD overwrites its destination

	// Load with MOVUPD and take the square root register-to-register:
	// SQRTPD with a memory operand faults when the address is not 16-byte aligned.
	MOVUPD (SI), X0
	SQRTPD X0, X0
	DIVPD  X0, X1 // X1 = {1.0, 1.0} / sqrt(a[i : i+2])
	MOVUPD X1, (SI)

	// we processed 2 elements. Each element is 8 bytes. So jump 16 ahead
	ADDQ $16, SI
	SUBQ $2, AX
	JGE  loop

remainder:
	// undo the -2 bias; AX is now the number of leftover elements (0 or 1)
	ADDQ $2, AX
	JE   done

remainder1:
	MOVQ   DX, X1 // X1 = 1.0
	MOVSD  (SI), X0
	SQRTSD X0, X0
	DIVSD  X0, X1 // X1 = 1.0 / sqrt(a[i])
	MOVSD  X1, (SI)

	ADDQ $8, SI
	DECQ AX
	JNE  remainder1

done:
	RET
|