File: MathUtils.h

package info (click to toggle)
bzflag 2.0.13.20080902-1
  • links: PTS
  • area: main
  • in suites: lenny
  • size: 27,564 kB
  • ctags: 34,716
  • sloc: cpp: 139,842; ansic: 14,510; sh: 10,715; makefile: 2,454; perl: 477; php: 428; python: 345; objc: 243; xml: 24
file content (361 lines) | stat: -rw-r--r-- 9,876 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
/* bzflag
 * Copyright (c) 1993 - 2008 Tim Riker
 *
 * This package is free software;  you can redistribute it and/or
 * modify it under the terms of the license found in the file
 * named LICENSE that should have accompanied this file.
 *
 * THIS PACKAGE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 */

/*
 * The fast sqrt() implementation contained herein was taken from the
 * public domained nVIDIA Fast Math Routines and adapted for BZFlag's
 * needs as well as wrapping table construction inside a class.
 */

#ifndef __MATHUTILS_H__
#define __MATHUTILS_H__

/* common header comes first */
#include "common.h"

/* system headers */
#include <math.h>
#include <iostream>

/* common headers */
#include "TimeKeeper.h"


#define FP_BITS(fp) (*(unsigned long *)&(fp))
#define FP_ABS_BITS(fp) (FP_BITS(fp)&0x7FFFFFFF)
#define FP_SIGN_BIT(fp) (FP_BITS(fp)&0x80000000)
#define FP_ONE_BITS 0x3F800000

// r = 1/p
#define FP_INV(r,p)							  \
{									    \
    int _i = 2 * FP_ONE_BITS - *(int *)&(p);				 \
    r = *(float *)&_i;						       \
    r = r * (2.0f - (p) * r);						\
}

/////////////////////////////////////////////////
// The following comes from Vincent Van Eeckhout
// Thanks for sending us the code!
// It's the same thing in assembly but without this C-needed line:
//    r = *(float *)&_i;

float   __two = 2.0f;

#define FP_INV2(r,p)		     \
{					\
    __asm { mov     eax,0x7F000000    }; \
    __asm { sub     eax,dword ptr [p] }; \
    __asm { mov     dword ptr [r],eax }; \
    __asm { fld     dword ptr [p]     }; \
    __asm { fmul    dword ptr [r]     }; \
    __asm { fsubr   [__two]	   }; \
    __asm { fmul    dword ptr [r]     }; \
    __asm { fstp    dword ptr [r]     }; \
}

#define FP_EXP(e,p)							  \
{									    \
    int _i;								  \
    e = -1.44269504f * (float)0x00800000 * (p);			      \
    _i = (int)e + 0x3F800000;						\
    e = *(float *)&_i;						       \
}

#define FP_NORM_TO_BYTE(i,p)						 \
{									    \
    float _n = (p) + 1.0f;						   \
    i = *(int *)&_n;							 \
    if (i >= 0x40000000)     i = 0xFF;				       \
    else if (i <=0x3F800000) i = 0;					  \
    else i = ((i) >> 15) & 0xFF;					     \
}

inline unsigned long FP_NORM_TO_BYTE2(float p)
{
  float fpTmp = p + 1.0f;
  return ((*(unsigned *)&fpTmp) >> 15) & 0xFF;
}

inline unsigned long FP_NORM_TO_BYTE3(float p)
{
  float ftmp = p + 12582912.0f;
  return ((*(unsigned long *)&ftmp) & 0xFF);
}


/** The math_util class contains general routines for common
 * mathematical functions that should be used with care.  The
 * precision and/or accuracy of the routine below are not guaranteed,
 * though the implementation should useful for fast estimates where
 * accuracy is not a stringent concern.
 *
 * For the square root and inverse square root routines below, they
 * are

 */
class math_util {

private:
  /** table of precomputed square root values */
  static unsigned int _fast_sqrt_table[0x10000];
  /** keep track of whether the table was precomputed yet */
  static bool _built_fast_sqrt_table;

  /** table of random floats for performance testing */
  static float _random_floats[0x10000];
  /** keep tracke of whether the random floats are set yet */
  static bool _built_random_floats;

  typedef union FastSqrtUnion
  {
    float f;
    unsigned int i;
  } FastSqrtUnion;

protected:

  static void  build_sqrt_table()
  {
    unsigned int i;
    FastSqrtUnion s;

    for (i = 0; i <= 0x7FFF; i++) {
      // Build a float with the bit pattern i as mantissa
      //  and an exponent of 0, stored as 127
      s.i = (i << 8) | (0x7F << 23);
      s.f = (float)sqrt(s.f);

      // Take the square root then strip the first 7 bits of
      //  the mantissa into the table
      _fast_sqrt_table[i + 0x8000] = (s.i & 0x7FFFFF);

      // Repeat the process, this time with an exponent of 1,
      //  stored as 128
      s.i = (i << 8) | (0x80 << 23);
      s.f = (float)sqrt(s.f);

      _fast_sqrt_table[i] = (s.i & 0x7FFFFF);
    }
  }

  static void build_random_floats()
  {
    unsigned int i;
    bzfsrand(0); // ensure consistent seed and same rand value order
    for (i=0; i < 0x10000; i++) {
      _random_floats[i] = (float)bzfrand();
    }
  }

  /* INVERSE SQUARE ROOT */

  /** system implementation of floating point square root estimate
   */
  static inline float fastinvsqrt0(float n)
  {
    float f;
    float p = sqrtf(n);
    FP_INV(f, p)
    return f;
  }

  /** software estimate of inverse square root
   */
  static inline float fastinvsqrt1(float n)
  {
    float f;
    float p = fastsqrt1(n);
    FP_INV(f, p)
    return f;
  }

  /** hardware instruction based inverse square root estimate
   */
  static inline float fastinvsqrt2 (float n)
  {
#if defined(__APPLE__)
    float result;
    asm ( "frsqrte %0, %1" : /*OUT*/ "=f" (result) : /*IN*/ "f" (n) );
    return result;
#else
    float f;
    float p = sqrtf(n);
    FP_INV(f, p)
    return f;
#endif
  }


  /* SQUARE ROOT */

  /** system implementation of floating point square root
   */
  static inline float fastsqrt0(float n)
  {
    return sqrtf(n);
  }

  /** software estimate replacement -- this is nvidias fast square
   * root routine is a table-based solution that trades off memory
   * utilization for performance.  it's not necessarily a cache
   * friendly method, but gives impressive results for common use.
   */
  static inline float fastsqrt1(float n)
  {
    // make sure the table is built
    if (!_built_fast_sqrt_table) {
      build_sqrt_table();
      _built_fast_sqrt_table = true;
    }

    // check for square root of 0
    if (FP_BITS(n) == 0) {
      return 0.0;
    }

    FP_BITS(n) = _fast_sqrt_table[(FP_BITS(n) >> 8) & 0xFFFF] | ((((FP_BITS(n) - 0x3F800000) >> 1) + 0x3F800000) & 0x7F800000);

    return n;
  }

  /** hardware instruction based square root estimate
   */
  static inline float fastsqrt2(float n)
  {
#if defined(__APPLE__)
    float f;
    float p = fastinvsqrt2(n);
    FP_INV(f, p)
    return f;
#else
    return sqrtf(n);
#endif
  }


public:

  /** function pointer to the fastinvsqrt routine that performs best
   * on this system.
   */
  static float (*fastinvsqrt)(float);

  /** function pointer to the fastsqrt routine that performs best on
   * this system.
   */
  static float (*fastsqrt)(float);

  /** optimize usage of square root and inverse square root at runtime
   * to use the fastest available.  A quick timed test is performed on
   * each of the routines and the function pointers are set to the
   * fastest.  There is no consideration for tolerance being taken
   * into account -- they are all assumed to be insufficient if
   * accuracy is required.
   */
  static void optimize()
  {
    /* array of function pointers for testing */
    float (*mathTest[6])(float) = {fastsqrt0, fastsqrt1, fastsqrt0,
				   fastinvsqrt0, fastinvsqrt1, fastinvsqrt0};
    const char *label[6] = {"system square root", "nvidia square root estimate", "hardware-based square root",
			    "system inverse square root", "nvidia inverse square root estimate", "harware-based inverse square root"};

    std::cout << "Optimizing math routines..." << std::endl;

    if (!_built_random_floats) {
      std::cout << "...building random float table" << std::endl;
      build_random_floats();
      _built_random_floats = true;
    }

    /* minimize cache effect by preloading */
    double sum = 0.0;
    for (unsigned int fl = 0; fl < 0x10000; fl++) {
      sum += _random_floats[fl];
    }
    // should always be false, but compiler shouldn't know that
    if (sum < -1.0f) {
      std::cerr << "ERROR: should not have reached here in MathUtils.h" << std::endl;
      exit(1);
    }

    /* test math routine candidates (reverse order) */
    unsigned int mostIterations = 0;
    unsigned int bestIndex = 0;
    for (int i = 0; i < 6; i++) {
      unsigned int iterations = 0; // how many test iterations were performed
      sum = 0.0; // make sure optimizer doesn't optimize us away

      if (i == 0) {
	std::cout << "...testing square root" << std::flush;
      } else if (i == 3) {
	std::cout << "...testing inverse square root" << std::flush;
      }

      // test for half a second per function
      TimeKeeper t = TimeKeeper::getCurrent();
      do {
	for (unsigned int f = 0; f < 0x10000; f++) {
	  sum += mathTest[i](_random_floats[f]);
	}
	iterations++;
      } while (TimeKeeper::getCurrent() - t < 0.5f);
      std::cout << " ... " << iterations;

      // should always be false, but compiler shouldn't know that
      if (sum < -1.0f) {
	std::cerr << "ERROR: should not have reached here in MathUtils.h" << std::endl;
	exit(1);
      }

      if (iterations > mostIterations) {
	mostIterations = iterations;
	bestIndex = i;
      }

      if (i == 2) {
	fastsqrt = mathTest[bestIndex];
	std::cout << std::endl << "...using " << label[bestIndex] << std::endl;
	mostIterations = 0;
	bestIndex = 0;
      } else if (i == 5) {
	fastinvsqrt = mathTest[bestIndex];
	std::cout << std::endl << "...using " << label[bestIndex] << std::endl;
	mostIterations = 0;
	bestIndex = 0;
      }
    }
    std::cout << "...done optimizing math routines." << std::endl;
    return;
  }

};

/* static initializers */
unsigned int math_util::_fast_sqrt_table[0x10000] = {0};
bool math_util::_built_fast_sqrt_table = false;
float math_util::_random_floats[0x10000] = {0};
bool math_util::_built_random_floats = false;
float (*math_util::fastinvsqrt)(float) = math_util::fastinvsqrt0;
float (*math_util::fastsqrt)(float) = math_util::fastsqrt0;

#endif  /* __MATHUTILS_H__ */


// Local Variables: ***
// mode: C++ ***
// tab-width: 8 ***
// c-basic-offset: 2 ***
// indent-tabs-mode: t ***
// End: ***
// ex: shiftwidth=2 tabstop=8