// Micro-benchmarks for applying a linear ramp to a float buffer (addition and
// multiplication), comparing plain scalar loops, 4x manually unrolled loops
// and SSE implementations.
#include "benchmark_helpers.hpp"
#include <cmath>
#ifdef __SSE__
#include <xmmintrin.h>
#endif
using namespace std;
// Global 64-float work buffers. main() fills all three with constants; `out`
// and `in` are the buffers passed to the benchmark kernels.
nova::aligned_array<float, 64> out;
nova::aligned_array<float, 64> in;
nova::aligned_array<float, 64> in2;

// float alias carrying a 16-byte alignment attribute (GCC-style extension).
typedef float afloat __attribute__ ((__aligned__(16)));
// Scalar reference kernel for the ramped addition.
//
// Writes out[i] = in1[i] + ramp for i in [0, n), where ramp starts at in2 and
// is increased by slope after every sample.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - increment applied to the ramp after each sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_1(float * out, float * in1, float in2, float slope, unsigned int n)
{
    float * const out_end = out + n;
    while (out != out_end)
    {
        *out++ = *in1++ + in2;
        in2 += slope;
    }
}
// Ramped addition, manually unrolled four times.
//
// Writes out[i] = in1[i] + ramp for i in [0, n), where ramp starts at in2 and
// is increased by slope after every sample.
//
// The unrolled loop covers the largest multiple of four that fits into n; the
// remaining 0-3 samples are processed one at a time. (Without the tail loop a
// sample count that is not a multiple of four would never satisfy the loop's
// exit condition and the kernel would run past the end of the buffers.)
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - increment applied to the ramp after each sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_2(float * out, float * in1, float in2, float slope, unsigned int n)
{
    const unsigned int unrolled_end = n - (n % 4);
    unsigned int i = 0;

    for (; i != unrolled_end; i += 4)
    {
        out[i]   = in1[i]   + in2; in2 += slope;
        out[i+1] = in1[i+1] + in2; in2 += slope;
        out[i+2] = in1[i+2] + in2; in2 += slope;
        out[i+3] = in1[i+3] + in2; in2 += slope;
    }

    // Scalar tail for the samples that do not fill a whole group of four.
    for (; i != n; ++i)
    {
        out[i] = in1[i] + in2;
        in2 += slope;
    }
}
#ifdef __SSE__
// SSE ramped addition; ramp offsets are built with repeated additions.
//
// Writes out[i] = in1[i] + ramp_i for i in [0, n), where ramp_i is in2 advanced
// by i slope steps. Four samples are processed per iteration: the vector
// `ramp` holds the ramp values of the current four samples and is advanced by
// four slope steps after every block, so results can differ from a
// sample-by-sample accumulation by floating-point rounding.
//
// The vector part uses aligned loads/stores (_mm_load_ps / _mm_store_ps), so
// in1 and out must be 16-byte aligned. The last n % 4 samples are handled by a
// scalar tail, and n < 4 no longer touches memory beyond n samples.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - ramp increment per sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_3(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps puts its first argument into lane 0, i.e. the sample at the
    // lowest address. _mm_set_ps takes its arguments from the highest lane
    // down and would reverse the ramp inside every group of four samples.
    __m128 ramp = _mm_setr_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);

    for (unsigned int blocks = n / 4; blocks != 0; --blocks)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_add_ps(arg1, ramp));
        ramp = _mm_add_ps(ramp, vslope);
        in1 += 4;
        out += 4;
    }

    // Lane 0 of `ramp` is the ramp value of the first sample not yet written.
    float tail_ramp = _mm_cvtss_f32(ramp);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ + tail_ramp;
        tail_ramp += slope;
    }
}
// SSE ramped addition; ramp offsets are built with multiplications.
//
// Writes out[i] = in1[i] + ramp_i for i in [0, n), where ramp_i is in2 advanced
// by i slope steps. The initial lane offsets (k * slope) and the per-block
// step (4 * slope) are computed by multiplication. Four samples are processed
// per iteration, so results can differ from a sample-by-sample accumulation by
// floating-point rounding.
//
// The vector part uses aligned loads/stores (_mm_load_ps / _mm_store_ps), so
// in1 and out must be 16-byte aligned. The last n % 4 samples are handled by a
// scalar tail, and n < 4 no longer touches memory beyond n samples.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - ramp increment per sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_3a(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps puts its first argument into lane 0, i.e. the sample at the
    // lowest address. _mm_set_ps takes its arguments from the highest lane
    // down and would reverse the ramp inside every group of four samples.
    __m128 ramp = _mm_setr_ps(in2, in2+slope, in2+2*slope, in2+3*slope);
    const __m128 vslope = _mm_set_ps1(4 * slope);

    for (unsigned int blocks = n / 4; blocks != 0; --blocks)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_add_ps(arg1, ramp));
        ramp = _mm_add_ps(ramp, vslope);
        in1 += 4;
        out += 4;
    }

    // Lane 0 of `ramp` is the ramp value of the first sample not yet written.
    float tail_ramp = _mm_cvtss_f32(ramp);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ + tail_ramp;
        tail_ramp += slope;
    }
}
#endif
// Scalar reference kernel for the ramped multiplication.
//
// Writes out[i] = in1[i] * ramp for i in [0, n), where ramp starts at in2 and
// is increased by slope after every sample.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - increment applied to the ramp after each sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_4(float * out, float * in1, float in2, float slope, unsigned int n)
{
    for (unsigned int remaining = n; remaining != 0; --remaining)
    {
        const float sample = *in1++;
        *out++ = sample * in2;
        in2 += slope;
    }
}
// Ramped multiplication, manually unrolled four times.
//
// Writes out[i] = in1[i] * ramp for i in [0, n), where ramp starts at in2 and
// is increased by slope after every sample.
//
// The unrolled loop covers the largest multiple of four that fits into n; the
// remaining 0-3 samples are processed one at a time. (Without the tail loop a
// sample count that is not a multiple of four would never satisfy the loop's
// exit condition and the kernel would run past the end of the buffers.)
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - increment applied to the ramp after each sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_5(float * out, float * in1, float in2, float slope, unsigned int n)
{
    const unsigned int unrolled_end = n - (n % 4);
    unsigned int i = 0;

    for (; i != unrolled_end; i += 4)
    {
        out[i]   = in1[i]   * in2; in2 += slope;
        out[i+1] = in1[i+1] * in2; in2 += slope;
        out[i+2] = in1[i+2] * in2; in2 += slope;
        out[i+3] = in1[i+3] * in2; in2 += slope;
    }

    // Scalar tail for the samples that do not fill a whole group of four.
    for (; i != n; ++i)
    {
        out[i] = in1[i] * in2;
        in2 += slope;
    }
}
#ifdef __SSE__
// SSE ramped multiplication; ramp offsets are built with repeated additions.
//
// Writes out[i] = in1[i] * ramp_i for i in [0, n), where ramp_i is in2 advanced
// by i slope steps. Four samples are processed per iteration: the vector
// `ramp` holds the ramp values of the current four samples and is advanced by
// four slope steps after every block, so results can differ from a
// sample-by-sample accumulation by floating-point rounding.
//
// The vector part uses aligned loads/stores (_mm_load_ps / _mm_store_ps), so
// in1 and out must be 16-byte aligned. The last n % 4 samples are handled by a
// scalar tail, and n < 4 no longer touches memory beyond n samples.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - ramp increment per sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_6(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps puts its first argument into lane 0, i.e. the sample at the
    // lowest address. _mm_set_ps takes its arguments from the highest lane
    // down and would reverse the ramp inside every group of four samples.
    __m128 ramp = _mm_setr_ps(in2, in2+slope, in2+slope+slope, in2+slope+slope+slope);
    const __m128 vslope = _mm_set_ps1(slope+slope+slope+slope);

    for (unsigned int blocks = n / 4; blocks != 0; --blocks)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_mul_ps(arg1, ramp));
        ramp = _mm_add_ps(ramp, vslope);
        in1 += 4;
        out += 4;
    }

    // Lane 0 of `ramp` is the ramp value of the first sample not yet written.
    float tail_ramp = _mm_cvtss_f32(ramp);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ * tail_ramp;
        tail_ramp += slope;
    }
}
// SSE ramped multiplication; ramp offsets are built with multiplications.
//
// Writes out[i] = in1[i] * ramp_i for i in [0, n), where ramp_i is in2 advanced
// by i slope steps. The initial lane offsets (k * slope) and the per-block
// step (4 * slope) are computed by multiplication. Four samples are processed
// per iteration, so results can differ from a sample-by-sample accumulation by
// floating-point rounding.
//
// The vector part uses aligned loads/stores (_mm_load_ps / _mm_store_ps), so
// in1 and out must be 16-byte aligned. The last n % 4 samples are handled by a
// scalar tail, and n < 4 no longer touches memory beyond n samples.
//
//   out   - destination buffer, written for n samples
//   in1   - source buffer, read for n samples
//   in2   - ramp value used for the first sample
//   slope - ramp increment per sample
//   n     - number of samples to process (0 is a no-op)
void __noinline__ bench_6a(float * out, float * in1, float in2, float slope, unsigned int n)
{
    // _mm_setr_ps puts its first argument into lane 0, i.e. the sample at the
    // lowest address. _mm_set_ps takes its arguments from the highest lane
    // down and would reverse the ramp inside every group of four samples.
    __m128 ramp = _mm_setr_ps(in2, in2+slope, in2+2*slope, in2+3*slope);
    const __m128 vslope = _mm_set_ps1(4*slope);

    for (unsigned int blocks = n / 4; blocks != 0; --blocks)
    {
        const __m128 arg1 = _mm_load_ps(in1);
        _mm_store_ps(out, _mm_mul_ps(arg1, ramp));
        ramp = _mm_add_ps(ramp, vslope);
        in1 += 4;
        out += 4;
    }

    // Lane 0 of `ramp` is the ramp value of the first sample not yet written.
    float tail_ramp = _mm_cvtss_f32(ramp);
    for (unsigned int rest = n % 4; rest != 0; --rest)
    {
        *out++ = *in1++ * tail_ramp;
        tail_ramp += slope;
    }
}
#endif
// Benchmark driver.
//
// Fills the global buffers with constants, then hands every kernel - bound to
// the same arguments (out, in, ramp start 0.1, slope 0.001, 64 samples) - to
// run_bench together with the iteration count. The SSE kernels are only
// benchmarked when the compiler defines __SSE__.
int main(void)
{
    const unsigned int iterations = 50000000;
    const float ramp_start = 0.1f;
    const float ramp_slope = 0.001f;
    const int   samples    = 64;

    out.assign(0.f);
    in.assign(0.2f);
    in2.assign(0.3f); // filled here, but not passed to any kernel below

    // ramped addition: scalar, unrolled, SSE
    run_bench(boost::bind(bench_1, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
    run_bench(boost::bind(bench_2, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
#ifdef __SSE__
    run_bench(boost::bind(bench_3, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
    run_bench(boost::bind(bench_3a, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
#endif

    // ramped multiplication: scalar, unrolled, SSE
    run_bench(boost::bind(bench_4, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
    run_bench(boost::bind(bench_5, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
#ifdef __SSE__
    run_bench(boost::bind(bench_6, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
    run_bench(boost::bind(bench_6a, out.begin(), in.begin(), ramp_start, ramp_slope, samples), iterations);
#endif
}
|