1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
#ifndef FASTMATH_H
#define FASTMATH_H
#ifndef DEDICATED_NOSSE
#include <xmmintrin.h>
#endif
#include <boost/cstdint.hpp>
#include "lib/streflop/streflop_cond.h"
#include "System/maindefines.h"
#ifdef _MSC_VER
#define __builtin_sqrtf sqrtf
#endif
/**
* @file FastMath.cpp
* @brief Fast math routines
*
* Contains faster alternatives for the more time
* consuming standard math routines. These functions
* are not as accurate, however, but are generally
* acceptable for most applications.
*
*/
namespace fastmath {
float isqrt_nosse(float);
/****************** Square root functions ******************/
/**
* @brief DO NOT USE IN SYNCED CODE. Calculates 1/sqrt(x) using SSE instructions.
*
* This is much slower than isqrt_nosse (extremely slow on AMDs) and
* additionally gives different results on Intel and AMD processors.
*/
inline float isqrt_sse(float x)
{
#ifndef DEDICATED_NOSSE
__m128 vec = _mm_load_ss(&x);
vec = _mm_rsqrt_ss(vec);
_mm_store_ss(&x, vec);
return x;
#else
return isqrt_nosse(x);
#endif
}
/**
* @brief Sync-safe. Calculates square root with using SSE instructions.
*
* Slower than std::sqrtf, faster than streflop
*/
inline float __ALIGN_ARG__ sqrt_sse(float x)
{
#ifndef DEDICATED_NOSSE
__m128 vec = _mm_load_ss(&x);
vec = _mm_sqrt_ss(vec);
_mm_store_ss(&x, vec);
return x;
#else
return sqrt(x);
#endif
}
/**
* @brief Calculates 1/sqrt(x) with less accuracy
*
* Gets a very good first guess and then uses one
* iteration of newton's method for improved accuracy.
* Average relative error: 0.093%
* Highest relative error: 0.175%
*
* see "The Math Behind The Fast Inverse Square Root Function Code"
* by Charles McEniry [2007] for a mathematical derivation of this
* method (or Chris Lomont's 2003 "Fast Inverse Square Root" paper)
*
* It has been found to give slightly different results on different Intel CPUs.
* Possible cause: 32bit vs 64bit. Use with care.
*/
inline float isqrt_nosse(float x) {
float xh = 0.5f * x;
boost::int32_t i = *(boost::int32_t*) &x;
// "magic number" which makes a very good first guess
i = 0x5f375a86 - (i >> 1);
x = *(float*) &i;
// Newton's method. One iteration for less accuracy but more speed.
x = x * (1.5f - xh * (x * x));
return x;
}
/**
* @brief Calculates 1/sqrt(x) with more accuracy
*
* Gets a very good first guess and then uses two
* iterations of newton's method for improved accuracy.
* Average relative error: 0.00017%
* Highest relative error: 0.00047%
*
*/
inline float isqrt2_nosse(float x) {
float xh = 0.5f * x;
boost::int32_t i = *(boost::int32_t*) &x;
// "magic number" which makes a very good first guess
i = 0x5f375a86 - (i >> 1);
x = *(float*) &i;
// Newton's method. Two iterations for more accuracy but less speed.
x = x * (1.5f - xh * (x * x));
x = x * (1.5f - xh * (x * x));
return x;
}
/****************** Square root ******************/
/** Calculate sqrt using builtin sqrtf. */
inline float sqrt(float x)
{
return __builtin_sqrtf(x);
}
/** Calculate sqrt using builtin sqrtf. */
inline float sqrt2(float x)
{
return __builtin_sqrtf(x);
}
/**
* @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt.
*
* Use with care.
*/
inline float apxsqrt(float x) {
return x * isqrt_nosse(x);
}
/**
* @brief A (possibly very) inaccurate and numerically unstable, but fast, version of sqrt.
*
* Use with care. This should be a little bit better, albeit slower, than fastmath::sqrt.
*/
inline float apxsqrt2(float x) {
return x * isqrt2_nosse(x);
}
/**
* @brief Calculates 1/sqrt(x) very quickly.
*
*/
inline float isqrt(float x) {
return isqrt2_nosse(x);
}
/**
* @brief Calculates 1/sqrt(x) very quickly. More accurate but slower than isqrt.
*
*/
inline float isqrt2(float x) {
return isqrt2_nosse(x);
}
/****************** Trigonometric functions ******************/
/**
* @brief Pi
*
* Cherry flavored.
*/
static const float PI = 3.141592654f;
/**
* @brief Half of pi
*
* Pi divided by two
*/
static const float HALFPI = PI / 2.0f;
/**
* @brief Pi times two
*
* Pi times two
*/
static const float PI2 = PI * 2.0f;
/**
* @brief Four divided by pi
*
* Four over pi
*/
static const float PIU4 = 4.0f / PI;
/**
* @brief Negative four divided by pi squared
*
* Negative four over (pi squared)
*/
static const float PISUN4 = -4.0f / (PI * PI);
/**
* @brief reciprocal of pi
*
* One over (pi times two)
*/
static const float INVPI2 = 1.0f / PI2;
/**
* @brief negative half pi
*
* -pi / 2
*/
static const float NEGHALFPI = -HALFPI;
/**
* @brief calculates the sine of x
*
* Range reduces x to -PI ... PI, and then uses the
* sine approximation method as described at
* http://www.devmaster.net/forums/showthread.php?t=5784
*
* Average percentage error: 0.15281632393574715%
* Highest percentage error: 0.17455324009559584%
*/
inline float sin(float x) {
/* range reduce to -PI ... PI, as the approximation
method only works well for that range. */
x = x - ((int)(x * INVPI2)) * PI2;
if (x > HALFPI) {
x = -x + PI;
} else if (x < NEGHALFPI ) {
x = -x - PI;
}
/* approximation */
x = (PIU4) * x + (PISUN4) * x * math::fabs(x);
x = 0.225f * (x * math::fabs(x) - x) + x;
return x;
}
/**
* @brief calculates the cosine of x
*
* Adds half of pi to x and then uses the sine method.
*/
inline float cos(float x) {
return sin(x + HALFPI);
}
}
using fastmath::PI;
namespace math {
//! override streflop with faster sqrt!
inline float sqrt(float x) {
return fastmath::sqrt_sse(x);
}
using fastmath::isqrt;
using fastmath::isqrt2;
}
#endif
|