//define this to sacrifice accuracy for speed (up to 40% faster)
//#define SPEED_SPEED_SPEED
#ifndef __GNUC__
/* ******************************************************************* */
// no GCC
#include <xmmintrin.h>
#ifdef _MSC_VER
// microsoft specific
//#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 1)) || defined(_M_X64) || defined(_M_AMD64)
//#define __SSE__
//#define __SSE2__
//#endif
#define ALWAYS_INLINE(ret_type) static __forceinline ret_type
#define ALIGNED __declspec(align(16))
#define __v128 ALIGNED __m128
#define _mm_init1_ps(f) {f, f, f, f}
//#define _mm_set_ps1(f) {f, f, f, f}
#else
#define ALWAYS_INLINE(ret_type) static inline ret_type
#define __v128 __m128
#define _mm_init1_ps(f) {f, f, f, f}
#endif
#else
/* ******************************************************************* */
// use gcc extensions for SSE
#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40500
// x86intrin.h available since gcc 4.5.0
#include <malloc.h>
#include <xmmintrin.h>
#include <x86intrin.h>
#else
#include <xmmintrin.h>
#endif
#if !defined(__INTEL_COMPILER)
#include <mm_malloc.h>
#endif
#ifndef __MACH__
#include <malloc.h>
#endif
typedef float __v128 __attribute__(( vector_size(4*sizeof(float)) ,aligned(16) ));
#define _mm_init1_ps(f) {f, f, f, f}
#if defined(__MINGW32__) || defined(mingw32) || defined(MINGW)
// workaround for buggy _mm_malloc in mingw
#define _aligned_free __mingw_aligned_free
#define _aligned_malloc __mingw_aligned_malloc
#endif
#define ALIGNED __attribute__((aligned (16)))
#define ALWAYS_INLINE(ret_type) static inline ret_type __attribute__((__gnu_inline__, __always_inline__))
#endif
/* ******************************************************************* */
// Number of elements one SSE operation processes
#define VECT_SIZE 4
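/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* buffers fed to the aligned load/store macros below must start on a  */
/* 16-byte boundary and hold a multiple of VECT_SIZE floats; stack     */
/* buffers can use ALIGNED instead.                                    */
#if 0
#include <stddef.h>                                /* size_t */
static float *alloc_vec_floats(size_t n) {
    /* round n up to the next multiple of VECT_SIZE */
    size_t padded = (n + VECT_SIZE - 1) & ~((size_t)(VECT_SIZE - 1));
    /* on mingw prefer _aligned_malloc (see workaround above) */
    return (float *) _mm_malloc(padded * sizeof(float), 16);
}
#endif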
/* ******************************************************************* */
// macros to make the source easier to read
#if 0 && defined(__GNUC__)
// let GCC do it - measured slower (possibly due to unaligned memory access)
#define LOAD_V4(arr, ind) ( *((__v128 *) (arr + ind)) )
#define STORE_V4(arr, ind, value) { *((__v128 *) (arr + ind)) = value ;}
#define ADD_V4(arr, ind, value) { *((__v128 *) (arr + ind)) += (value) ;}
#define SUB_V4(arr, ind, value) { *((__v128 *) (arr + ind)) -= (value) ;}
#else
// use SSE macros
#define LOAD_V4(arr, ind) _mm_load_ps((arr + ind))
#define STORE_V4(arr, ind, value) _mm_store_ps((arr + ind), (value))
#define ADD_V4(arr, ind, value) _mm_store_ps((arr + ind), _mm_add_ps( _mm_load_ps((arr + ind)), (value)))
#define SUB_V4(arr, ind, value) _mm_store_ps((arr + ind), _mm_sub_ps( _mm_load_ps((arr + ind)), (value)))
#endif
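/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* element-wise a[i] += b[i] over 16-byte aligned arrays, processing   */
/* VECT_SIZE floats per iteration with the macros above.               */
#if 0
static void add_arrays4(float *a, const float *b, int n) {
    int i;
    for (i = 0; i < n; i += VECT_SIZE)   /* n: multiple of VECT_SIZE */
        ADD_V4(a, i, LOAD_V4(b, i));
}
#endif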
/* ******************************************************************* */
#if 0 && defined(__GNUC__)
// let GCC do it -- measured slower
#define V_ADD(a, b) ( (a) + (b))
#define V_SUB(a, b) ( (a) - (b))
#define V_MUL(a, b) ( (a) * (b))
#define V_DIV(a, b) ( (a) / (b))
#define V_INCR(a, b) {a += (b);}
#define V_DECR(a, b) {a -= (b);}
#else
// use SSE intrinsics -- measured faster
#define V_ADD(a, b) _mm_add_ps((a), (b))
#define V_SUB(a, b) _mm_sub_ps((a), (b))
#define V_MUL(a, b) _mm_mul_ps((a), (b))
#define V_DIV(a, b) _mm_div_ps((a), (b))
#define V_INCR(a, b) {a = _mm_add_ps( (a), (b));}
#define V_DECR(a, b) {a = _mm_sub_ps( (a), (b));}
#endif
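/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* y += s*x (SAXPY) built from the arithmetic wrappers; _mm_set1_ps    */
/* broadcasts the scalar into all four lanes.                          */
#if 0
static void saxpy4(float *y, const float *x, float s, int n) {
    __v128 sv = _mm_set1_ps(s);
    int i;
    for (i = 0; i < n; i += VECT_SIZE)   /* n: multiple of VECT_SIZE */
        ADD_V4(y, i, V_MUL(sv, LOAD_V4(x, i)));
}
#endif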
/* ******************************************************************* */
// aux. constants
static const __v128 _half4  = _mm_init1_ps(.5f);
static const __v128 _three4 = _mm_init1_ps(3.f);
static const __v128 _two4   = _mm_init1_ps(2.f);
/* ******************************************************************* */
/* SSE fast inverse square root */
/* ret = 1/sqrt(v) accuracy: approx. 22 bits */
/* ******************************************************************* */
#ifdef SPEED_SPEED_SPEED
// skip the refinement step. accuracy: approx. 12 bits (raw hardware estimate)
#define newtonrapson_rsqrt4( v ) _mm_rsqrt_ps( v )
#else
ALWAYS_INLINE(__v128) newtonrapson_rsqrt4( const __v128 v ) {
    // x  = rsqrt_approx(v);
    // x' = 0.5 * x * (3 - v*x*x);
    __v128 approx = _mm_rsqrt_ps( v );
    // one Newton-Raphson step to improve accuracy
    return V_MUL( V_MUL(_half4, approx),
                  V_SUB(_three4,
                        V_MUL( V_MUL(v, approx), approx) ) );
}
#endif
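/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* out[i] = 1/sqrt(in[i]) for four values at once; with                */
/* SPEED_SPEED_SPEED the same call falls back to the raw estimate.     */
#if 0
static void inv_sqrt4(float *out, const float *in) {
    /* both pointers must be 16-byte aligned */
    STORE_V4(out, 0, newtonrapson_rsqrt4(LOAD_V4(in, 0)));
}
#endif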
/* ******************************************************************* */
/* SSE fast inverse */
/* ret = 1/v accuracy: approx. 22 bits */
/* ******************************************************************* */
#ifdef SPEED_SPEED_SPEED
// skip the refinement step. accuracy: approx. 12 bits (raw hardware estimate)
#define newtonrapson_rcp( v ) _mm_rcp_ps( v )
#else
ALWAYS_INLINE(__v128) newtonrapson_rcp(const __v128 v) {
    // x  = reciprocal_approx(v);
    // x' = x * (2 - x * v);
    __v128 r = _mm_rcp_ps(v);
    // one Newton-Raphson step to improve accuracy
    return V_MUL(r, V_SUB(_two4, V_MUL(r, v)));
}
#endif
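/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* approximate element-wise a/b as a * (1/b); cheaper than V_DIV when  */
/* ~22 bits of accuracy suffice.                                       */
#if 0
ALWAYS_INLINE(__v128) fast_div4(const __v128 a, const __v128 b) {
    return V_MUL(a, newtonrapson_rcp(b));
}
#endif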
/* ******************************************************************* */
/* SSE: sum of all four elements of a vector */
/* ret = v[0] + v[1] + v[2] + v[3] */
/* ******************************************************************* */
ALWAYS_INLINE(float) _vector4_sum(const __v128 v) {
    __v128 a;
    a = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,3,0,1)); // lane 0 <- v[1], lane 2 <- v[3]
    a = V_ADD(a, v);                                // lane 0 = v[0]+v[1], lane 2 = v[2]+v[3]
    // lane 0 = (v[0]+v[1]) + (v[2]+v[3])
    return _mm_cvtss_f32(V_ADD(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,0,0,2)))); // lane 0 <- a[2]
}
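/* ******************************************************************* */
/* Illustrative sketch (compiled out, helper name is hypothetical):    */
/* dot product of two aligned arrays; accumulate per lane inside the   */
/* loop, reduce with _vector4_sum once at the end.                     */
#if 0
static float dot4(const float *a, const float *b, int n) {
    __v128 acc = _mm_setzero_ps();
    int i;
    for (i = 0; i < n; i += VECT_SIZE)   /* n: multiple of VECT_SIZE */
        V_INCR(acc, V_MUL(LOAD_V4(a, i), LOAD_V4(b, i)));
    return _vector4_sum(acc);
}
#endif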