// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0
#include <benchmark/benchmark.h>
#include <atomic>
#include <cstdint>
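#include <mutex>  // for the illustrative std::mutex baseline below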
#include <thread>
#include <vector>
#if defined(__i386__) || defined(__x86_64__)
# if defined(__clang__) || defined(__INTEL_COMPILER)
# include <emmintrin.h> // for _mm_pause
# endif
#endif
#include "opentelemetry/common/macros.h"
#include "opentelemetry/common/spin_lock_mutex.h"
namespace
{
using opentelemetry::common::SpinLockMutex;
constexpr int TightLoopLocks = 10000;
// Runs a thrash test: spins up N threads, each of which will
// lock-mutate-unlock a shared counter a total of `TightLoopLocks` times.
//
// lock: A callable denoting how to lock. Accepts a reference to `SpinLockType`.
// unlock: A callable denoting how to unlock. Accepts a reference to `SpinLockType`.
//
// Note: threads are created and joined inside the timed loop, so each
// iteration's result includes that overhead.
template <typename SpinLockType, typename LockF, typename UnlockF>
inline void SpinThrash(benchmark::State &s, SpinLockType &spinlock, LockF lock, UnlockF unlock)
{
auto num_threads = s.range(0);
  // Value we will increment, fighting over a spinlock.
  // The contention is meant to be brief, as close to our expected
  // use cases of "updating pointers" or "pushing an event onto a buffer".
  // `value` is written but never read, hence OPENTELEMETRY_MAYBE_UNUSED.
  std::int64_t value OPENTELEMETRY_MAYBE_UNUSED = 0;
std::vector<std::thread> threads;
threads.reserve(num_threads);
// Timing loop
for (auto _ : s)
{
    for (int64_t t = 0; t < num_threads; t++)
{
threads.emplace_back([&] {
        // Increment the value once per lock acquisition; repeating the
        // lock/unlock cycle many times maximizes thread contention.
for (int i = 0; i < TightLoopLocks; i++)
{
lock(spinlock);
value++;
unlock(spinlock);
}
});
}
// Join threads
for (auto &thread : threads)
thread.join();
threads.clear();
}
}
// Benchmark of full spin-lock implementation.
static void BM_SpinLockThrashing(benchmark::State &s)
{
SpinLockMutex spinlock;
SpinThrash(s, spinlock, [](SpinLockMutex &m) { m.lock(); }, [](SpinLockMutex &m) { m.unlock(); });
}
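// For context, a "full" spin lock such as SpinLockMutex typically layers its
// backoff: try the atomic exchange, spin briefly with a CPU pause hint, then
// fall back to yielding the thread. A minimal sketch of that shape (an
// assumption about the general technique, not SpinLockMutex's actual code):
//
//   void TieredLock(std::atomic_flag &flag)  // hypothetical helper
//   {
//     for (;;)
//     {
//       for (int i = 0; i < 100; ++i)  // fast path: bounded spin
//       {
//         if (!flag.test_and_set(std::memory_order_acquire))
//           return;  // acquired
//         /* CPU pause hint here, as in BM_ProcYieldSpinLockThrashing */
//       }
//       std::this_thread::yield();  // slow path: let another thread run
//     }
//   }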
// Naive `while (!try_lock()) {}` implementation of lock.
static void BM_NaiveSpinLockThrashing(benchmark::State &s)
{
SpinLockMutex spinlock;
SpinThrash(
s, spinlock,
[](SpinLockMutex &m) {
while (!m.try_lock())
{
        // Busy-wait: spin until the lock is acquired.
}
},
[](SpinLockMutex &m) { m.unlock(); });
}
// Simple `while (!try_lock()) { yield-processor }` implementation of lock.
static void BM_ProcYieldSpinLockThrashing(benchmark::State &s)
{
SpinLockMutex spinlock;
SpinThrash<SpinLockMutex>(
s, spinlock,
[](SpinLockMutex &m) {
while (!m.try_lock())
{
#if defined(_MSC_VER)
        // MSVC: YieldProcessor() expands to the architecture's pause/yield hint.
        YieldProcessor();
#elif defined(__i386__) || defined(__x86_64__)
        // x86: `pause` keeps the spin loop power- and pipeline-friendly.
#  if defined(__clang__) || defined(__INTEL_COMPILER)
        _mm_pause();
#  else
        __builtin_ia32_pause();
#  endif
#elif defined(__armel__) || defined(__ARMEL__)
        // Little-endian ARM without a yield hint: nop plus a compiler barrier.
        asm volatile("nop" ::: "memory");
#elif defined(__arm__) || defined(__aarch64__)  // remaining ARM / AArch64
        __asm__ __volatile__("yield" ::: "memory");
#endif
}
},
[](SpinLockMutex &m) { m.unlock(); });
}
// Spin-lock thrashing over a raw std::atomic_flag, yielding the thread while contended.
static void BM_ThreadYieldSpinLockThrashing(benchmark::State &s)
{
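  // Since C++20 (P0883), std::atomic_flag value-initializes to the clear
  // state; older standards require ATOMIC_FLAG_INIT.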
#if defined(__cpp_lib_atomic_value_initialization) && \
__cpp_lib_atomic_value_initialization >= 201911L
std::atomic_flag mutex{};
#else
std::atomic_flag mutex = ATOMIC_FLAG_INIT;
#endif
SpinThrash<std::atomic_flag>(
s, mutex,
[](std::atomic_flag &l) {
        uint32_t try_count = 0;
        // Spin on test_and_set, yielding the thread every 32nd failed
        // attempt so a contended thread eventually gets off the CPU.
        while (l.test_and_set(std::memory_order_acq_rel))
        {
          ++try_count;
          if (try_count % 32 == 0)
{
std::this_thread::yield();
}
}
},
[](std::atomic_flag &l) { l.clear(std::memory_order_release); });
}
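// Illustrative baseline sketch: the same thrash test over a plain std::mutex,
// so the spin strategies above can be compared against the platform's
// blocking mutex. (BM_StdMutexThrashing is a hypothetical name for this
// added example, not one of the original benchmark variants.)
static void BM_StdMutexThrashing(benchmark::State &s)
{
  std::mutex mtx;
  SpinThrash<std::mutex>(
      s, mtx, [](std::mutex &m) { m.lock(); }, [](std::mutex &m) { m.unlock(); });
}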
// Run each benchmark with thread counts from 1 up to hardware_concurrency,
// doubling each step, and measure both real time and process CPU time.
BENCHMARK(BM_SpinLockThrashing)
->RangeMultiplier(2)
->Range(1, std::thread::hardware_concurrency())
->MeasureProcessCPUTime()
->UseRealTime()
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ProcYieldSpinLockThrashing)
->RangeMultiplier(2)
->Range(1, std::thread::hardware_concurrency())
->MeasureProcessCPUTime()
->UseRealTime()
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_NaiveSpinLockThrashing)
->RangeMultiplier(2)
->Range(1, std::thread::hardware_concurrency())
->MeasureProcessCPUTime()
->UseRealTime()
->Unit(benchmark::kMillisecond);
BENCHMARK(BM_ThreadYieldSpinLockThrashing)
->RangeMultiplier(2)
->Range(1, std::thread::hardware_concurrency())
->MeasureProcessCPUTime()
->UseRealTime()
->Unit(benchmark::kMillisecond);
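BENCHMARK(BM_StdMutexThrashing)
    ->RangeMultiplier(2)
    ->Range(1, std::thread::hardware_concurrency())
    ->MeasureProcessCPUTime()
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);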
} // namespace
BENCHMARK_MAIN();
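// Typical invocation (standard Google Benchmark flags; the binary name
// depends on the build setup):
//   ./spinlock_benchmark --benchmark_filter='.*Thrashing.*'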