1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
|
//===-- Benchmark function --------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file mainly defines a `Benchmark` function.
//
// The benchmarking process is as follows:
// - We start by measuring the time it takes to run the function
// `InitialIterations` times. This is called a Sample. From this we can derive
// the time it took to run a single iteration.
//
// - We repeat the previous step with a greater number of iterations to lower
// the impact of the measurement. We can derive a more precise estimation of the
// runtime for a single iteration.
//
// - Each sample gives a more accurate estimation of the runtime for a single
// iteration but also takes more time to run. We stop the process when:
// * The measure stabilize under a certain precision (Epsilon),
// * The overall benchmarking time is greater than MaxDuration,
// * The overall sample count is greater than MaxSamples,
// * The last sample used more than MaxIterations iterations.
//
// - We also makes sure that the benchmark doesn't run for a too short period of
// time by defining MinDuration and MinSamples.
#ifndef LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#define LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
#include "benchmark/benchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include <array>
#include <chrono>
#include <cmath>
#include <cstdint>
namespace llvm {
namespace libc_benchmarks {
using Duration = std::chrono::duration<double>;
enum class BenchmarkLog {
None, // Don't keep the internal state of the benchmark.
Last, // Keep only the last batch.
Full // Keep all iterations states, useful for testing or debugging.
};
// An object to configure the benchmark stopping conditions.
// See documentation at the beginning of the file for the overall algorithm and
// meaning of each field.
struct BenchmarkOptions {
// The minimum time for which the benchmark is running.
Duration MinDuration = std::chrono::seconds(0);
// The maximum time for which the benchmark is running.
Duration MaxDuration = std::chrono::seconds(10);
// The number of iterations in the first sample.
uint32_t InitialIterations = 1;
// The maximum number of iterations for any given sample.
uint32_t MaxIterations = 10000000;
// The minimum number of samples.
uint32_t MinSamples = 4;
// The maximum number of samples.
uint32_t MaxSamples = 1000;
// The benchmark will stop if the relative difference between the current and
// the last estimation is less than epsilon. This is 1% by default.
double Epsilon = 0.01;
// The number of iterations grows exponentially between each sample.
// Must be greater or equal to 1.
double ScalingFactor = 1.4;
BenchmarkLog Log = BenchmarkLog::None;
};
// The state of a benchmark.
enum class BenchmarkStatus {
Running,
MaxDurationReached,
MaxIterationsReached,
MaxSamplesReached,
PrecisionReached,
};
// The internal state of the benchmark, useful to debug, test or report
// statistics.
struct BenchmarkState {
size_t LastSampleIterations;
Duration LastBatchElapsed;
BenchmarkStatus CurrentStatus;
Duration CurrentBestGuess; // The time estimation for a single run of `foo`.
double ChangeRatio; // The change in time estimation between previous and
// current samples.
};
// A lightweight result for a benchmark.
struct BenchmarkResult {
BenchmarkStatus TerminationStatus = BenchmarkStatus::Running;
Duration BestGuess = {};
llvm::Optional<llvm::SmallVector<BenchmarkState, 16>> MaybeBenchmarkLog;
};
// Stores information about a cache in the host memory system.
struct CacheInfo {
std::string Type; // e.g. "Instruction", "Data", "Unified".
int Level; // 0 is closest to processing unit.
int Size; // In bytes.
int NumSharing; // The number of processing units (Hyper-Threading Thread)
// with which this cache is shared.
};
// Stores information about the host.
struct HostState {
std::string CpuName; // returns a string compatible with the -march option.
double CpuFrequency; // in Hertz.
std::vector<CacheInfo> Caches;
static HostState get();
};
namespace internal {
struct Measurement {
size_t Iterations = 0;
Duration Elapsed = {};
};
// Updates the estimation of the elapsed time for a single iteration.
class RefinableRuntimeEstimation {
Duration TotalTime = {};
size_t TotalIterations = 0;
public:
Duration update(const Measurement &M) {
assert(M.Iterations > 0);
// Duration is encoded as a double (see definition).
// `TotalTime` and `M.Elapsed` are of the same magnitude so we don't expect
// loss of precision due to radically different scales.
TotalTime += M.Elapsed;
TotalIterations += M.Iterations;
return TotalTime / TotalIterations;
}
};
// This class tracks the progression of the runtime estimation.
class RuntimeEstimationProgression {
RefinableRuntimeEstimation RRE;
public:
Duration CurrentEstimation = {};
// Returns the change ratio between our best guess so far and the one from the
// new measurement.
double computeImprovement(const Measurement &M) {
const Duration NewEstimation = RRE.update(M);
const double Ratio = fabs(((CurrentEstimation / NewEstimation) - 1.0));
CurrentEstimation = NewEstimation;
return Ratio;
}
};
} // namespace internal
// Measures the runtime of `foo` until conditions defined by `Options` are met.
//
// To avoid measurement's imprecisions we measure batches of `foo`.
// The batch size is growing by `ScalingFactor` to minimize the effect of
// measuring.
//
// Note: The benchmark is not responsible for serializing the executions of
// `foo`. It is not suitable for measuring, very small & side effect free
// functions, as the processor is free to execute several executions in
// parallel.
//
// - Options: A set of parameters controlling the stopping conditions for the
// benchmark.
// - foo: The function under test. It takes one value and returns one value.
// The input value is used to randomize the execution of `foo` as part of a
// batch to mitigate the effect of the branch predictor. Signature:
// `ProductType foo(ParameterProvider::value_type value);`
// The output value is a product of the execution of `foo` and prevents the
// compiler from optimizing out foo's body.
// - ParameterProvider: An object responsible for providing a range of
// `Iterations` values to use as input for `foo`. The `value_type` of the
// returned container has to be compatible with `foo` argument.
// Must implement one of:
// `Container<ParameterType> generateBatch(size_t Iterations);`
// `const Container<ParameterType>& generateBatch(size_t Iterations);`
// - Clock: An object providing the current time. Must implement:
// `std::chrono::time_point now();`
template <typename Function, typename ParameterProvider,
typename BenchmarkClock = const std::chrono::high_resolution_clock>
BenchmarkResult benchmark(const BenchmarkOptions &Options,
ParameterProvider &PP, Function foo,
BenchmarkClock &Clock = BenchmarkClock()) {
BenchmarkResult Result;
internal::RuntimeEstimationProgression REP;
Duration TotalBenchmarkDuration = {};
size_t Iterations = std::max(Options.InitialIterations, uint32_t(1));
size_t Samples = 0;
if (Options.ScalingFactor < 1.0)
report_fatal_error("ScalingFactor should be >= 1");
if (Options.Log != BenchmarkLog::None)
Result.MaybeBenchmarkLog.emplace();
for (;;) {
// Request a new Batch of size `Iterations`.
const auto &Batch = PP.generateBatch(Iterations);
// Measuring this Batch.
const auto StartTime = Clock.now();
for (const auto Parameter : Batch) {
const auto Production = foo(Parameter);
benchmark::DoNotOptimize(Production);
}
const auto EndTime = Clock.now();
const Duration Elapsed = EndTime - StartTime;
// Updating statistics.
++Samples;
TotalBenchmarkDuration += Elapsed;
const double ChangeRatio = REP.computeImprovement({Iterations, Elapsed});
Result.BestGuess = REP.CurrentEstimation;
// Stopping condition.
if (TotalBenchmarkDuration >= Options.MinDuration &&
Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon)
Result.TerminationStatus = BenchmarkStatus::PrecisionReached;
else if (Samples >= Options.MaxSamples)
Result.TerminationStatus = BenchmarkStatus::MaxSamplesReached;
else if (TotalBenchmarkDuration >= Options.MaxDuration)
Result.TerminationStatus = BenchmarkStatus::MaxDurationReached;
else if (Iterations >= Options.MaxIterations)
Result.TerminationStatus = BenchmarkStatus::MaxIterationsReached;
if (Result.MaybeBenchmarkLog) {
auto &BenchmarkLog = *Result.MaybeBenchmarkLog;
if (Options.Log == BenchmarkLog::Last && !BenchmarkLog.empty())
BenchmarkLog.pop_back();
BenchmarkState BS;
BS.LastSampleIterations = Iterations;
BS.LastBatchElapsed = Elapsed;
BS.CurrentStatus = Result.TerminationStatus;
BS.CurrentBestGuess = Result.BestGuess;
BS.ChangeRatio = ChangeRatio;
BenchmarkLog.push_back(BS);
}
if (Result.TerminationStatus != BenchmarkStatus::Running)
return Result;
if (Options.ScalingFactor > 1 &&
Iterations * Options.ScalingFactor == Iterations)
report_fatal_error(
"`Iterations *= ScalingFactor` is idempotent, increase ScalingFactor "
"or InitialIterations.");
Iterations *= Options.ScalingFactor;
}
}
// Interprets `Array` as a circular buffer of `Size` elements.
template <typename T> class CircularArrayRef {
llvm::ArrayRef<T> Array;
size_t Size;
public:
using value_type = T;
using reference = T &;
using const_reference = const T &;
using difference_type = ssize_t;
using size_type = size_t;
class const_iterator
: public std::iterator<std::input_iterator_tag, T, ssize_t> {
llvm::ArrayRef<T> Array;
size_t Index;
size_t Offset;
public:
explicit const_iterator(llvm::ArrayRef<T> Array, size_t Index = 0)
: Array(Array), Index(Index), Offset(Index % Array.size()) {}
const_iterator &operator++() {
++Index;
++Offset;
if (Offset == Array.size())
Offset = 0;
return *this;
}
bool operator==(const_iterator Other) const { return Index == Other.Index; }
bool operator!=(const_iterator Other) const { return !(*this == Other); }
const T &operator*() const { return Array[Offset]; }
};
CircularArrayRef(llvm::ArrayRef<T> Array, size_t Size)
: Array(Array), Size(Size) {
assert(Array.size() > 0);
}
const_iterator begin() const { return const_iterator(Array); }
const_iterator end() const { return const_iterator(Array, Size); }
};
// A convenient helper to produce a CircularArrayRef from an ArrayRef.
template <typename T>
CircularArrayRef<T> cycle(llvm::ArrayRef<T> Array, size_t Size) {
return {Array, Size};
}
// Creates an std::array which storage size is constrained under `Bytes`.
template <typename T, size_t Bytes>
using ByteConstrainedArray = std::array<T, Bytes / sizeof(T)>;
// A convenient helper to produce a CircularArrayRef from a
// ByteConstrainedArray.
template <typename T, size_t N>
CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
}
// Makes sure the binary was compiled in release mode and that frequency
// governor is set on performance.
void checkRequirements();
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_BENCHMARK_H
|