File: simd_unroll_benchmarks2.cpp

package info (click to toggle)
supercollider-sc3-plugins 3.13.0~repack-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 20,104 kB
  • sloc: cpp: 303,352; lisp: 9,589; ansic: 3,547; sh: 96; makefile: 87; haskell: 21
file content (129 lines) | stat: -rw-r--r-- 3,069 bytes parent folder | download | duplicates (12)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#include "benchmark_helpers.hpp"

#ifdef __SSE__
#include <xmmintrin.h>
#endif

#include "../simd_binary_arithmetic.hpp"

using namespace nova;
using namespace std;

// Working buffers shared by every benchmark in this file: 64 floats each.
// main() fills them with 0.f and passes their begin() pointers to each bench_N.
// NOTE(review): aligned_array is assumed to give the 16-byte alignment that
// _mm_load_ps / _mm_store_ps require -- confirm against its declaration.
aligned_array<float, 64> out;
aligned_array<float, 64> in1;
aligned_array<float, 64> in2;

#ifdef __SSE__
/**
 * Baseline kernel: out[i] = in1[i] + in2[i], one SSE vector (4 floats) per
 * loop iteration, no unrolling.
 *
 * All three pointers are accessed with _mm_load_ps / _mm_store_ps, which
 * require 16-byte aligned addresses.
 *
 * @param out  destination buffer
 * @param in1  first operand
 * @param in2  second operand
 * @param n    number of floats; only the first (n / 4) * 4 are processed,
 *             any remainder is ignored. n < 4 is a no-op.
 */
void __noinline__ bench_1(float * out, float * in1, float * in2, unsigned int n)
{
    n /= 4;

    // The do/while below runs its body before testing and then pre-decrements,
    // so a zero trip count would wrap to UINT_MAX and run far past the
    // buffers. Leave before entering the loop instead.
    if (n == 0)
        return;

    do
    {
        __m128 lhs = _mm_load_ps(in1);
        __m128 rhs = _mm_load_ps(in2);
        __m128 result = _mm_add_ps(lhs, rhs);
        _mm_store_ps(out, result);
        in1 += 4;
        in2 += 4;
        out += 4;
    }
    while (--n);
}

/**
 * Same element-wise add as the non-unrolled SSE variant, but unrolled by two:
 * each loop iteration loads, adds and stores two SSE vectors (8 floats).
 *
 * All three pointers are accessed with _mm_load_ps / _mm_store_ps, which
 * require 16-byte aligned addresses.
 *
 * @param out  destination buffer
 * @param in1  first operand
 * @param in2  second operand
 * @param n    number of floats; only the first (n / 8) * 8 are processed,
 *             any remainder is ignored. n < 8 is a no-op.
 */
void __noinline__ bench_2(float * out, float * in1, float * in2, unsigned int n)
{
    n /= 8;

    // do/while with a pre-decrement cannot express zero iterations: a zero
    // count would wrap to UINT_MAX and overrun the buffers. Exit early.
    if (n == 0)
        return;

    do
    {
        // All loads are issued before the adds, and all adds before the
        // stores, to keep the two lanes independent within an iteration.
        __m128 lhs1 = _mm_load_ps(in1);
        __m128 lhs2 = _mm_load_ps(in1+4);
        __m128 rhs1 = _mm_load_ps(in2);
        __m128 rhs2 = _mm_load_ps(in2+4);
        __m128 result1 = _mm_add_ps(lhs1, rhs1);
        __m128 result2 = _mm_add_ps(lhs2, rhs2);
        _mm_store_ps(out, result1);
        _mm_store_ps(out+4, result2);
        in1 += 8;
        in2 += 8;
        out += 8;
    }
    while (--n);
}

/**
 * Element-wise add unrolled by four: each loop iteration loads, adds and
 * stores four SSE vectors (16 floats).
 *
 * All three pointers are accessed with _mm_load_ps / _mm_store_ps, which
 * require 16-byte aligned addresses.
 *
 * @param out  destination buffer
 * @param in1  first operand
 * @param in2  second operand
 * @param n    number of floats; only the first (n / 16) * 16 are processed,
 *             any remainder is ignored. n < 16 is a no-op.
 */
void __noinline__ bench_3(float * out, float * in1, float * in2, unsigned int n)
{
    n /= 16;

    // do/while with a pre-decrement cannot express zero iterations: a zero
    // count would wrap to UINT_MAX and overrun the buffers. Exit early.
    if (n == 0)
        return;

    do
    {
        // Loads first, then adds, then stores: four independent lanes per
        // iteration.
        __m128 lhs1 = _mm_load_ps(in1);
        __m128 lhs2 = _mm_load_ps(in1+4);
        __m128 lhs3 = _mm_load_ps(in1+8);
        __m128 lhs4 = _mm_load_ps(in1+12);
        __m128 rhs1 = _mm_load_ps(in2);
        __m128 rhs2 = _mm_load_ps(in2+4);
        __m128 rhs3 = _mm_load_ps(in2+8);
        __m128 rhs4 = _mm_load_ps(in2+12);
        __m128 result1 = _mm_add_ps(lhs1, rhs1);
        __m128 result2 = _mm_add_ps(lhs2, rhs2);
        __m128 result3 = _mm_add_ps(lhs3, rhs3);
        __m128 result4 = _mm_add_ps(lhs4, rhs4);
        _mm_store_ps(out, result1);
        _mm_store_ps(out+4, result2);
        _mm_store_ps(out+8, result3);
        _mm_store_ps(out+12, result4);
        in1 += 16;
        in2 += 16;
        out += 16;
    }
    while (--n);
}
#endif


/**
 * Library-based variant: each loop iteration calls
 * nova::plus_vec_simd<8>(out, in1, in2) and then advances all three pointers
 * by 8 floats.
 *
 * NOTE(review): plus_vec_simd<8> is declared in simd_binary_arithmetic.hpp;
 * it presumably writes in1 + in2 for 8 floats and may have its own alignment
 * requirements -- confirm there.
 *
 * @param out  destination buffer
 * @param in1  first operand
 * @param in2  second operand
 * @param n    number of floats; (n / 8) calls are made, any remainder is
 *             ignored. n < 8 is a no-op.
 */
void __noinline__ bench_4(float * out, float * in1, float * in2, unsigned int n)
{
    n /= 8;

    // do/while with a pre-decrement cannot express zero iterations: a zero
    // count would wrap to UINT_MAX and overrun the buffers. Exit early.
    if (n == 0)
        return;

    do
    {
        nova::plus_vec_simd<8>(out, in1, in2);
        in1 += 8;
        in2 += 8;
        out += 8;
    }
    while (--n);
}

/**
 * Library-based variant: each loop iteration calls
 * nova::plus_vec_simd<16>(out, in1, in2) and then advances all three pointers
 * by 16 floats.
 *
 * NOTE(review): plus_vec_simd<16> is declared in simd_binary_arithmetic.hpp;
 * it presumably writes in1 + in2 for 16 floats and may have its own alignment
 * requirements -- confirm there.
 *
 * @param out  destination buffer
 * @param in1  first operand
 * @param in2  second operand
 * @param n    number of floats; (n / 16) calls are made, any remainder is
 *             ignored. n < 16 is a no-op.
 */
void __noinline__ bench_5(float * out, float * in1, float * in2, unsigned int n)
{
    n /= 16;

    // do/while with a pre-decrement cannot express zero iterations: a zero
    // count would wrap to UINT_MAX and overrun the buffers. Exit early.
    if (n == 0)
        return;

    do
    {
        nova::plus_vec_simd<16>(out, in1, in2);
        in1 += 16;
        in2 += 16;
        out += 16;
    }
    while (--n);
}



/**
 * Benchmark driver.
 *
 * Fills the three file-level buffers with 0.f, then hands every kernel to
 * run_bench with identical arguments (same buffers, same length, same
 * iteration count). The hand-written intrinsic kernels are only run when the
 * compiler defines __SSE__.
 */
int main(void)
{
    out.assign(0.f);
    in1.assign(0.f);
    in2.assign(0.f);

    // Number of floats handed to each kernel per call; equals the capacity
    // of the file-level buffers.
    const unsigned int frames = 64;
    const unsigned int iterations = 100000000;

    // Every kernel receives the same three buffer pointers.
    float * const dst = out.begin();
    float * const lhs = in1.begin();
    float * const rhs = in2.begin();

#ifdef __SSE__
    run_bench(boost::bind(bench_1, dst, lhs, rhs, frames), iterations);
    run_bench(boost::bind(bench_2, dst, lhs, rhs, frames), iterations);
    run_bench(boost::bind(bench_3, dst, lhs, rhs, frames), iterations);
#endif
    run_bench(boost::bind(bench_4, dst, lhs, rhs, frames), iterations);
    run_bench(boost::bind(bench_5, dst, lhs, rhs, frames), iterations);
}