1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
|
#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H
#include "benchmarks/gpu/BenchmarkLogger.h"
#include "benchmarks/gpu/timing/timing.h"
#include "src/__support/CPP/functional.h"
#include "src/__support/CPP/limits.h"
#include "src/__support/CPP/string_view.h"
#include "src/__support/macros/config.h"
#include "src/time/clock.h"
#include <stdint.h>
namespace LIBC_NAMESPACE_DECL {
namespace benchmarks {
// Tunable limits for a benchmark run. Every field carries a default, so a
// default-constructed BenchmarkOptions is immediately usable.
struct BenchmarkOptions {
  // Bounds on the iteration count.
  uint32_t initial_iterations = 1;
  uint32_t max_iterations = 10'000'000;

  // Bounds on the number of samples.
  uint32_t min_samples = 4;
  uint32_t max_samples = 1'000;

  // Bounds on the duration, expressed in nanoseconds (ns).
  int64_t min_duration = 0;
  int64_t max_duration = 1'000'000'000; // 1e9 ns = 1 second

  // NOTE(review): presumably the relative-change threshold below which the
  // runtime estimate counts as converged; confirm against benchmark().
  double epsilon = 0.01;

  // NOTE(review): presumably the factor by which the iteration count grows
  // between samples; confirm against benchmark().
  double scaling_factor = 1.4;
};
// One timing sample: an iteration count paired with a cycle count. Both
// fields default to zero.
struct Measurement {
  // Number of iterations covered by this sample.
  uint32_t iterations = 0;

  // Cycles recorded for this sample.
  uint64_t elapsed_cycles = 0;
};
// Accumulates cycle and iteration totals over successive measurements and
// reports the running average number of cycles per iteration.
class RefinableRuntimeEstimation {
  uint64_t total_cycles = 0;
  // Kept 64 bits wide: each measurement contributes a uint32_t iteration
  // count, and the sum over many measurements can exceed 32 bits.
  uint64_t total_iterations = 0;

public:
  // Adds M to the running totals and returns total cycles divided by total
  // iterations (integer division, so the result is truncated).
  //
  // Returns 0 while no iterations have been recorded, instead of dividing by
  // zero.
  uint64_t update(const Measurement &M) {
    total_cycles += M.elapsed_cycles;
    total_iterations += M.iterations;
    if (total_iterations == 0)
      return 0;
    return total_cycles / total_iterations;
  }
};
// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
RefinableRuntimeEstimation rre;
public:
uint64_t current_estimation = 0;
double compute_improvement(const Measurement &M) {
const uint64_t new_estimation = rre.update(M);
double ratio =
(static_cast<double>(current_estimation) / new_estimation) - 1.0;
// Get absolute value
if (ratio < 0)
ratio *= -1;
current_estimation = new_estimation;
return ratio;
}
};
// Summary values for one benchmark run. By default `min` starts at the
// largest uint64_t value and `max` at 0; every other field starts at zero.
struct BenchmarkResult {
  // NOTE(review): presumably the estimated cycles per iteration; confirm
  // against the code that fills this struct in.
  uint64_t cycles = 0;
  double standard_deviation = 0.0;

  // Extremes; the defaults look like seeds for a running min/max.
  uint64_t min = UINT64_MAX;
  uint64_t max = 0;

  // Counters and total time for the run.
  uint32_t samples = 0;
  uint32_t total_iterations = 0;
  clock_t total_time = 0;
};
// Declaration only; the definition is not part of this header.
// Takes a set of options and a callable returning uint64_t, and produces a
// BenchmarkResult.
// NOTE(review): presumably wrapper_func returns the cycle count of one timed
// invocation; confirm against the definition.
BenchmarkResult benchmark(const BenchmarkOptions &options,
cpp::function<uint64_t(void)> wrapper_func);
// A single benchmark entry: the callable to time, its suite and test names,
// and the thread count it was declared with.
class Benchmark {
  const cpp::function<uint64_t(void)> func;
  const cpp::string_view suite_name;
  const cpp::string_view test_name;
  const uint32_t num_threads;

public:
  // Stores the arguments, then hands `this` to add_benchmark(), so every
  // instance registers itself as part of its construction.
  Benchmark(cpp::function<uint64_t(void)> func, char const *suite_name,
            char const *test_name, uint32_t num_threads)
      : func(func), suite_name(suite_name), test_name(test_name),
        num_threads(num_threads) {
    add_benchmark(this);
  }

  // Declared here; defined outside this header.
  static void run_benchmarks();

  // Read-only access to the names given at construction.
  const cpp::string_view get_suite_name() const { return suite_name; }
  const cpp::string_view get_test_name() const { return test_name; }

protected:
  // Declared here; defined outside this header. Called by the constructor.
  static void add_benchmark(Benchmark *benchmark);

private:
  // Runs `func` through benchmark() with default-constructed options.
  BenchmarkResult run() { return benchmark(BenchmarkOptions{}, func); }
};
} // namespace benchmarks
} // namespace LIBC_NAMESPACE_DECL
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance, built
// from Func and the stringified suite and test names. The expansion carries
// no trailing semicolon; supply one at the point of use.
// Passing -1 indicates the benchmark should be run with as many threads as
// allocated by the user in the benchmark's CMake.
// The Benchmark constructor takes num_threads as uint32_t, so this -1 reaches
// it as the largest uint32_t value.
#define BENCHMARK(SuiteName, TestName, Func) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, -1)
// Defines a Benchmark object named <SuiteName>_<TestName>_Instance, built
// from Func and the stringified suite and test names, with NumThreads passed
// as the constructor's num_threads argument. The expansion carries no
// trailing semicolon; supply one at the point of use.
#define BENCHMARK_N_THREADS(SuiteName, TestName, Func, NumThreads) \
LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \
Func, #SuiteName, #TestName, NumThreads)
// Shorthand for BENCHMARK_N_THREADS with a thread count of 1.
#define SINGLE_THREADED_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, 1)
// Shorthand for BENCHMARK_N_THREADS whose thread count is the value returned
// by LIBC_NAMESPACE::gpu::get_lane_size() at the point of use.
// NOTE(review): get_lane_size() is not declared by any include visible in
// this header; confirm it is reachable wherever this macro is expanded.
#define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \
BENCHMARK_N_THREADS(SuiteName, TestName, Func, \
LIBC_NAMESPACE::gpu::get_lane_size())
#endif
|