1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
|
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/array.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/macros/config.h"
#include "src/stdlib/rand.h"
#include "src/time/clock.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
// Tunable limits handed to benchmark(). Every field carries an in-class
// default, so a default-constructed BenchmarkOptions is immediately usable.
struct BenchmarkOptions {
  // Iteration counts: the starting value and its lower / upper bounds.
  uint32_t initial_iterations{1};
  uint32_t min_iterations{1};
  uint32_t max_iterations{10000000};

  // Lower / upper bounds on the number of samples.
  uint32_t min_samples{4};
  uint32_t max_samples{1000};

  // Duration bounds, expressed in nanoseconds.
  int64_t min_duration{500 * 1000};         // 500 us
  int64_t max_duration{1000 * 1000 * 1000}; // 1 second

  // NOTE(review): presumably the threshold under which the estimate is treated
  // as stable -- confirm against the definition of benchmark().
  double epsilon{0.0001};
  // NOTE(review): presumably the growth factor applied to the iteration count
  // between samples -- confirm against the definition of benchmark().
  double scaling_factor{1.4};
};
// One timing sample: an iteration count paired with an elapsed cycle count.
// Both fields default to zero.
struct Measurement {
  uint32_t iterations{0};
  uint64_t elapsed_cycles{0};
};
// Accumulates measurements and reports the running cost per iteration.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept 64-bit wide so that summing many 32-bit iteration counts cannot wrap
  // around (a wrap to zero would also make the division below invalid).
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns total cycles divided by total
  // iterations (integer division, truncating).
  // Returns 0 while no iterations have been recorded, e.g. when the only
  // measurements seen so far are default-constructed, rather than dividing by
  // zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};
// Follows how the runtime estimate moves as new measurements are folded in.
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation rre;

public:
  // Estimate produced by the most recent compute_improvement() call; 0 before
  // the first call.
  uint64_t current_estimation = 0;

  // Feeds M to the underlying estimator, stores the estimate it returns in
  // current_estimation, and returns |previous / updated - 1|, i.e. the
  // magnitude of the relative change between the two estimates.
  double compute_improvement(const Measurement &M) {
    const uint64_t previous = current_estimation;
    const uint64_t updated = rre.update(M);
    current_estimation = updated;

    const double change = (static_cast<double>(previous) / updated) - 1.0;
    // Report the magnitude only.
    return change < 0 ? -change : change;
  }
};
// Summary returned by benchmark(). All fields have in-class defaults.
struct BenchmarkResult {
  uint64_t cycles{0};
  double standard_deviation{0.0};
  // min starts at the largest representable value and max at zero.
  // NOTE(review): presumably so that the first recorded sample replaces both --
  // confirm against the definition of benchmark().
  uint64_t min{UINT64_MAX};
  uint64_t max{0};
  uint32_t samples{0};
  uint32_t total_iterations{0};
  clock_t total_time{0};
};
// Declaration only; the definition is not part of this header.
// Takes the options by const reference and a callable that returns a uint64_t
// and takes no arguments, and produces a BenchmarkResult.
// NOTE(review): presumably wrapper_func returns the cycle count of one timed
// run -- confirm against the definition.
BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
// A named benchmark case. Constructing one hands the object to
// add_benchmark(), so creating an instance is what makes it known to the
// harness.
class Benchmark {
  // Callable under measurement, its suite / test names, and the requested
  // thread count. All are fixed at construction.
  const cpp::function<uint64_t(void)> func;
  const cpp::string_view suite_name;
  const cpp::string_view test_name;
  const uint32_t num_threads;

public:
  // Stores the arguments and passes `this` to add_benchmark().
  // The names are kept as string views over the given C strings, so those
  // strings have to stay alive for as long as this object is used.
  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
            char const *test_name, uint32_t num_threads)
      : func(func), suite_name(suite_name), test_name(test_name),
        num_threads(num_threads) {
    add_benchmark(this);
  }

  // Name accessors.
  const cpp::string_view get_suite_name() const { return suite_name; }
  const cpp::string_view get_test_name() const { return test_name; }

  // Defined outside this header.
  static void run_benchmarks();

protected:
  // Defined outside this header; receives every constructed Benchmark.
  static void add_benchmark(Benchmark *benchmark);

private:
  // Measures func with default-initialized options.
  BenchmarkResult run() { return benchmark(BenchmarkOptions{}, func); }
};
// We want our random values to be approximately
// Output: a random number with the exponent field between min_exp and max_exp,
// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1),
// Caveats:
// -EXP_BIAS corresponding to denormal values,
// EXP_BIAS + 1 corresponding to inf or nan.
template <typename T>
static T
get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS,
int min_exp = -LIBC_NAMESPACE::fputil::FPBits<T>::EXP_BIAS) {
using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
// Required to correctly instantiate FPBits for floats and doubles.
using RandType = typename cpp::conditional_t<(cpp::is_same_v<T, double>),
uint64_t, uint32_t>;
RandType bits;
if constexpr (cpp::is_same_v<T, uint64_t>)
bits = (static_cast<uint64_t>(LIBC_NAMESPACE::rand()) << 32) |
static_cast<uint64_t>(LIBC_NAMESPACE::rand());
else
bits = LIBC_NAMESPACE::rand();
double scale =
static_cast<double>(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1);
FPBits fp(bits);
fp.set_biased_exponent(
static_cast<uint32_t>(fp.get_biased_exponent() * scale + min_exp));
return fp.get_val();
}
// Throughput helpers for math functions operating on the floating-point type
// T. Inputs are generated with get_rand_input<T>() and timed with
// LIBC_NAMESPACE::throughput().
template <typename T> class MathPerf {
  using FPBits = fputil::FPBits<T>;
  using StorageType = typename FPBits::StorageType;
  static constexpr StorageType UIntMax =
      cpp::numeric_limits<StorageType>::max();

public:
  // Throughput benchmarking for functions that take 1 input.
  // Generates N random inputs whose exponents lie in [min_exp, max_exp], runs
  // them through LIBC_NAMESPACE::throughput(), and returns the reported total
  // divided by N (integer division).
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T), int min_exp, int max_exp) {
    static_assert(N > 0, "at least one input is required");
    cpp::array<T, N> inputs;
    // get_rand_input() takes the upper bound first, then the lower bound.
    for (size_t i = 0; i < N; ++i)
      inputs[i] = get_rand_input<T>(max_exp, min_exp);
    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs);
    return total_time / N;
  }

  // Throughput benchmarking for functions that take 2 inputs.
  // Each argument gets its own exponent range; otherwise identical to the
  // single-input overload.
  template <size_t N = 1>
  static uint64_t run_throughput_in_range(T f(T, T), int arg1_min_exp,
                                          int arg1_max_exp, int arg2_min_exp,
                                          int arg2_max_exp) {
    static_assert(N > 0, "at least one input is required");
    cpp::array<T, N> inputs1;
    cpp::array<T, N> inputs2;
    // get_rand_input() takes the upper bound first, then the lower bound.
    for (size_t i = 0; i < N; ++i) {
      inputs1[i] = get_rand_input<T>(arg1_max_exp, arg1_min_exp);
      inputs2[i] = get_rand_input<T>(arg2_max_exp, arg2_min_exp);
    }
    uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs1, inputs2);
    return total_time / N;
  }
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
// Benchmark definition macros. Each one defines a Benchmark object named
// <SuiteName>_<TestName>_Instance whose suite / test names are the stringified
// macro arguments; the Benchmark constructor hands the object to
// add_benchmark().
//
// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
// The constructor parameter is a uint32_t, so the -1 arrives there as the
// largest uint32_t value.
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, -1)
// Same as BENCHMARK, but with an explicit thread count.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, NumThreads)
// BENCHMARK_N_THREADS with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
// BENCHMARK_N_THREADS with the thread count taken from
// LIBC_NAMESPACE::gpu::get_lane_size().
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
#endif
|