// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/sampling_heap_profiler/poisson_allocation_sampler.h"
#include <atomic>
#include <cmath>
#include <memory>
#include <utility>
#include "base/allocator/dispatcher/reentry_guard.h"
#include "base/allocator/dispatcher/tls.h"
#include "base/check.h"
#include "base/compiler_specific.h"
#include "base/no_destructor.h"
#include "base/rand_util.h"
#include "base/ranges/algorithm.h"
#include "build/build_config.h"
#include "third_party/abseil-cpp/absl/base/attributes.h"
namespace base {
namespace {
using ::base::allocator::dispatcher::ReentryGuard;
const size_t kDefaultSamplingIntervalBytes = 128 * 1024;
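// A large fixed offset (1 << 29 bytes = 512 MiB) that is temporarily
// subtracted from a thread's accumulated byte count while its samples are
// muted. See ScopedMuteThreadSamples for details.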
const intptr_t kAccumulatedBytesOffset = 1 << 29;
// When true, sample intervals are not randomized. Used for testing.
bool g_deterministic = false;
// Pointer to the current |LockFreeAddressHashSet|.
ABSL_CONST_INIT std::atomic<LockFreeAddressHashSet*> g_sampled_addresses_set{
nullptr};
// Sampling interval parameter, the mean value for intervals between samples.
ABSL_CONST_INIT std::atomic_size_t g_sampling_interval{
kDefaultSamplingIntervalBytes};
struct ThreadLocalData {
// Accumulated bytes towards sample.
intptr_t accumulated_bytes = 0;
// Used as a workaround to avoid bias from muted samples. See
// ScopedMuteThreadSamples for more details.
intptr_t accumulated_bytes_snapshot = 0;
// PoissonAllocationSampler performs allocations while handling a
// notification. The guard protects against recursions originating from these.
bool internal_reentry_guard = false;
  // A boolean used to distinguish the first allocation on a thread:
  //   false - no allocation has been recorded on this thread yet;
  //   true  - otherwise.
  // Since accumulated_bytes is initialized to zero, the very first allocation
  // on a thread would always trigger a sample, skewing the profile towards
  // such allocations. To mitigate that, we use this flag to make sure the
  // first allocation is properly accounted for.
bool sampling_interval_initialized = false;
};
ThreadLocalData* GetThreadLocalData() {
#if USE_LOCAL_TLS_EMULATION()
// If available, use ThreadLocalStorage to bypass dependencies introduced by
// Clang's implementation of thread_local.
static base::NoDestructor<
base::allocator::dispatcher::ThreadLocalStorage<ThreadLocalData>>
thread_local_data("poisson_allocation_sampler");
return thread_local_data->GetThreadLocalData();
#else
// Notes on TLS usage:
//
  // * There's no safe way to use TLS in malloc() as neither C++ thread_local
  //   nor pthread TLS gives any guarantee about whether it allocates.
  // * We think that we can safely use thread_local without a re-entrancy
  //   guard because the compiler will use the "tls static access model" for
  //   static builds of Chrome [https://www.uclibc.org/docs/tls.pdf].
  //   But there's no guarantee that this will stay true, and in practice it
  //   seems to have problems on macOS/Android. These platforms do allocate on
  //   the very first access to a thread_local on each thread.
  // * Directly using/warming up platform TLS seems to work on all platforms,
  //   but is also not guaranteed to stay that way. We make use of it for
  //   reentrancy guards on macOS/Android.
  // * We cannot use the Windows Tls[GS]etValue API as it modifies the result
  //   of GetLastError.
//
// Android thread_local seems to be using __emutls_get_address from libgcc:
// https://github.com/gcc-mirror/gcc/blob/master/libgcc/emutls.c
// macOS version is based on _tlv_get_addr from dyld:
// https://opensource.apple.com/source/dyld/dyld-635.2/src/threadLocalHelpers.s.auto.html
thread_local ThreadLocalData thread_local_data;
return &thread_local_data;
#endif
}
} // namespace
PoissonAllocationSampler::ScopedMuteThreadSamples::ScopedMuteThreadSamples() {
ThreadLocalData* const thread_local_data = GetThreadLocalData();
DCHECK(!thread_local_data->internal_reentry_guard);
thread_local_data->internal_reentry_guard = true;
  // We mute thread samples immediately after taking a sample, which is when
  // we reset accumulated_bytes. This breaks the random sampling requirement
  // of the Poisson process, and causes us to systematically overcount all
  // other allocations. That's because muted allocations rarely trigger a
  // sample (which would cause them to be ignored) since they occur right
  // after accumulated_bytes is reset.
  //
  // To counteract this, we drop accumulated_bytes by a large, fixed amount to
  // lower the probability that a sample is taken close to 0. Then we reset it
  // after we're done muting thread samples.
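  // For a rough sense of scale (illustrative numbers, not part of the
  // original comment): kAccumulatedBytesOffset is 1 << 29 bytes (512 MiB),
  // so with the default 128 KiB sampling interval an allocation made inside
  // the muted scope would have to push accumulated_bytes up by roughly half
  // a gigabyte before it could cross zero and be considered for sampling.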
thread_local_data->accumulated_bytes_snapshot =
thread_local_data->accumulated_bytes;
thread_local_data->accumulated_bytes -= kAccumulatedBytesOffset;
}
PoissonAllocationSampler::ScopedMuteThreadSamples::~ScopedMuteThreadSamples() {
ThreadLocalData* const thread_local_data = GetThreadLocalData();
DCHECK(thread_local_data->internal_reentry_guard);
thread_local_data->internal_reentry_guard = false;
thread_local_data->accumulated_bytes =
thread_local_data->accumulated_bytes_snapshot;
}
// static
bool PoissonAllocationSampler::ScopedMuteThreadSamples::IsMuted() {
ThreadLocalData* const thread_local_data = GetThreadLocalData();
return thread_local_data->internal_reentry_guard;
}
PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
ScopedSuppressRandomnessForTesting() {
DCHECK(!g_deterministic);
g_deterministic = true;
// The accumulated_bytes may contain a random value from previous
// test runs, which would make the behaviour of the next call to
// RecordAlloc unpredictable.
ThreadLocalData* const thread_local_data = GetThreadLocalData();
thread_local_data->accumulated_bytes = 0;
}
PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
~ScopedSuppressRandomnessForTesting() {
DCHECK(g_deterministic);
g_deterministic = false;
}
// static
bool PoissonAllocationSampler::ScopedSuppressRandomnessForTesting::
IsSuppressed() {
return g_deterministic;
}
PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
ScopedMuteHookedSamplesForTesting() {
SetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
// Reset the accumulated bytes to 0 on this thread.
ThreadLocalData* const thread_local_data = GetThreadLocalData();
accumulated_bytes_snapshot_ = thread_local_data->accumulated_bytes;
thread_local_data->accumulated_bytes = 0;
}
PoissonAllocationSampler::ScopedMuteHookedSamplesForTesting::
~ScopedMuteHookedSamplesForTesting() {
// Restore the accumulated bytes.
ThreadLocalData* const thread_local_data = GetThreadLocalData();
thread_local_data->accumulated_bytes = accumulated_bytes_snapshot_;
ResetProfilingStateFlag(ProfilingStateFlag::kHookedSamplesMutedForTesting);
}
// static
ABSL_CONST_INIT std::atomic<PoissonAllocationSampler::ProfilingStateFlagMask>
PoissonAllocationSampler::profiling_state_{0};
PoissonAllocationSampler::PoissonAllocationSampler() {
Init();
auto* sampled_addresses = new LockFreeAddressHashSet(64);
g_sampled_addresses_set.store(sampled_addresses, std::memory_order_release);
}
// static
void PoissonAllocationSampler::Init() {
[[maybe_unused]] static bool init_once = []() {
// Touch thread local data on initialization to enforce proper setup of
// underlying storage system.
GetThreadLocalData();
ReentryGuard::InitTLSSlot();
return true;
}();
}
void PoissonAllocationSampler::SetSamplingInterval(
size_t sampling_interval_bytes) {
// TODO(alph): Reset the sample being collected if running.
g_sampling_interval.store(sampling_interval_bytes, std::memory_order_relaxed);
}
size_t PoissonAllocationSampler::SamplingInterval() const {
return g_sampling_interval.load(std::memory_order_relaxed);
}
// static
size_t PoissonAllocationSampler::GetNextSampleInterval(size_t interval) {
if (UNLIKELY(g_deterministic)) {
return interval;
}
// We sample with a Poisson process, with constant average sampling
// interval. This follows the exponential probability distribution with
// parameter λ = 1/interval where |interval| is the average number of bytes
// between samples.
// Let u be a uniformly distributed random number (0,1], then
// next_sample = -ln(u) / λ
// RandDouble returns numbers [0,1). We use 1-RandDouble to correct it to
// avoid a possible floating point exception from taking the log of 0.
  // The allocator shim uses the PoissonAllocationSampler, hence we must not
  // allocate here to avoid infinite recursion.
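  // A rough worked example of the formula above (illustrative numbers, not
  // part of the original comment): with the default 128 KiB interval
  // (131072 bytes) and uniform = 0.5, value = -log(0.5) * 131072 ≈ 90852
  // bytes. The result is then clamped to [sizeof(intptr_t), 20 * interval]
  // below.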
double uniform = internal::RandDoubleAvoidAllocation();
double value = -log(1 - uniform) * interval;
size_t min_value = sizeof(intptr_t);
  // We limit the upper bound of a sample interval to make sure we don't have
  // huge gaps in the sampling stream. The probability of hitting the upper
  // bound is exp(-20) ~ 2e-9, so it should not skew the distribution.
size_t max_value = interval * 20;
if (UNLIKELY(value < min_value)) {
return min_value;
}
if (UNLIKELY(value > max_value)) {
return max_value;
}
return static_cast<size_t>(value);
}
void PoissonAllocationSampler::DoRecordAllocation(
const ProfilingStateFlagMask state,
void* address,
size_t size,
base::allocator::dispatcher::AllocationSubsystem type,
const char* context) {
ThreadLocalData* const thread_local_data = GetThreadLocalData();
thread_local_data->accumulated_bytes += size;
intptr_t accumulated_bytes = thread_local_data->accumulated_bytes;
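  // Fast path: between samples, accumulated_bytes is negative because the
  // next randomly drawn interval has already been subtracted from it, so the
  // vast majority of allocations return here without taking a sample.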
if (LIKELY(accumulated_bytes < 0)) {
return;
}
if (UNLIKELY(!(state & ProfilingStateFlag::kIsRunning))) {
    // Sampling was in fact disabled when the hook was called. Reset the state
    // of the sampler. We do this check off the fast-path, because it's quite
    // rare for the sampler to be stopped once it has been started. (The most
    // common caller of PoissonAllocationSampler starts it and leaves it
    // running for the rest of the Chrome session.)
thread_local_data->sampling_interval_initialized = false;
thread_local_data->accumulated_bytes = 0;
return;
}
// Failed allocation? Skip the sample.
if (UNLIKELY(!address)) {
return;
}
size_t mean_interval = g_sampling_interval.load(std::memory_order_relaxed);
if (UNLIKELY(!thread_local_data->sampling_interval_initialized)) {
thread_local_data->sampling_interval_initialized = true;
    // This is the very first allocation on the thread. It always passes the
    // check in |RecordAlloc| because accumulated_bytes is zero-initialized by
    // TLS semantics.
    // Generate a proper sampling interval and make sure the allocation has
    // indeed crossed the threshold before counting it as a sample.
accumulated_bytes -= GetNextSampleInterval(mean_interval);
if (accumulated_bytes < 0) {
thread_local_data->accumulated_bytes = accumulated_bytes;
return;
}
}
  // This cast is safe because this function only reaches this point with a
  // non-negative value of `accumulated_bytes`.
size_t samples = static_cast<size_t>(accumulated_bytes) / mean_interval;
accumulated_bytes %= mean_interval;
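  // Illustrative walk-through (hypothetical numbers, not from the original
  // source): with mean_interval = 128 KiB and accumulated_bytes = 300 KiB at
  // this point, the division above yields samples = 2 with a 44 KiB
  // remainder. The loop below then keeps drawing fresh random intervals until
  // the remainder goes negative, counting at least one more sample (more if
  // the drawn intervals are short), and the final negative value carries over
  // as the deficit the thread must allocate through before the next sample.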
do {
accumulated_bytes -= GetNextSampleInterval(mean_interval);
++samples;
} while (accumulated_bytes >= 0);
thread_local_data->accumulated_bytes = accumulated_bytes;
if (UNLIKELY(ScopedMuteThreadSamples::IsMuted())) {
return;
}
ScopedMuteThreadSamples no_reentrancy_scope;
std::vector<SamplesObserver*> observers_copy;
{
AutoLock lock(mutex_);
// TODO(alph): Sometimes RecordAlloc is called twice in a row without
// a RecordFree in between. Investigate it.
if (sampled_addresses_set().Contains(address)) {
return;
}
sampled_addresses_set().Insert(address);
BalanceAddressesHashSet();
observers_copy = observers_;
}
size_t total_allocated = mean_interval * samples;
for (auto* observer : observers_copy) {
observer->SampleAdded(address, size, total_allocated, type, context);
}
}
void PoissonAllocationSampler::DoRecordFree(void* address) {
  // There is a rare case on macOS and Android when the very first
  // thread_local access in the ScopedMuteThreadSamples constructor may
  // allocate and thus reenter DoRecordAlloc. However, the call chain won't
  // build up further, as RecordAlloc accesses are guarded with a pthread
  // TLS-based ReentryGuard.
ScopedMuteThreadSamples no_reentrancy_scope;
std::vector<SamplesObserver*> observers_copy;
{
AutoLock lock(mutex_);
observers_copy = observers_;
sampled_addresses_set().Remove(address);
}
for (auto* observer : observers_copy) {
observer->SampleRemoved(address);
}
}
void PoissonAllocationSampler::BalanceAddressesHashSet() {
  // If the load factor of the current addresses hash set exceeds 1, allocate
  // a new set twice as large, copy all the data into it, and switch to using
  // it. No writes are made to either set during the copy, since it all
  // happens under the lock. Readers continue to use the old set until the
  // atomic pointer switch takes place.
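  // For example (illustrative, not part of the original comment): the sampler
  // starts with a 64-bucket set in the constructor, so once the number of
  // live sampled addresses reaches the bucket count (load_factor() >= 1) the
  // set is regrown to 128 buckets, then 256, and so on.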
LockFreeAddressHashSet& current_set = sampled_addresses_set();
if (current_set.load_factor() < 1) {
return;
}
auto new_set =
std::make_unique<LockFreeAddressHashSet>(current_set.buckets_count() * 2);
new_set->Copy(current_set);
// Atomically switch all the new readers to the new set.
g_sampled_addresses_set.store(new_set.release(), std::memory_order_release);
  // We leak the older set because we have to keep all the old sets alive:
  // there might be reader threads that have already obtained the pointer to
  // the old set but haven't yet dereferenced it.
}
// static
LockFreeAddressHashSet& PoissonAllocationSampler::sampled_addresses_set() {
return *g_sampled_addresses_set.load(std::memory_order_acquire);
}
// static
PoissonAllocationSampler* PoissonAllocationSampler::Get() {
static NoDestructor<PoissonAllocationSampler> instance;
return instance.get();
}
// static
void PoissonAllocationSampler::SetProfilingStateFlag(ProfilingStateFlag flag) {
ProfilingStateFlagMask flags = flag;
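  // Descriptive note (not part of the original source): kWasStarted is set
  // together with kIsRunning below and, unlike kIsRunning, is never reset
  // (see the DCHECK_NE in ResetProfilingStateFlag), so it records that the
  // profiler has been started at least once in this process.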
if (flag == ProfilingStateFlag::kIsRunning) {
flags |= ProfilingStateFlag::kWasStarted;
}
ProfilingStateFlagMask old_state =
profiling_state_.fetch_or(flags, std::memory_order_relaxed);
DCHECK(!(old_state & flag));
}
// static
void PoissonAllocationSampler::ResetProfilingStateFlag(
ProfilingStateFlag flag) {
DCHECK_NE(flag, kWasStarted);
ProfilingStateFlagMask old_state =
profiling_state_.fetch_and(~flag, std::memory_order_relaxed);
DCHECK(old_state & flag);
}
void PoissonAllocationSampler::AddSamplesObserver(SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling on this thread by holding a ReentryGuard.
ReentryGuard guard;
ScopedMuteThreadSamples no_reentrancy_scope;
AutoLock lock(mutex_);
DCHECK(ranges::find(observers_, observer) == observers_.end());
bool profiler_was_stopped = observers_.empty();
observers_.push_back(observer);
  // Adding the observer will enable profiling. This will use
  // `g_sampled_addresses_set` so it had better be initialized.
DCHECK(g_sampled_addresses_set.load(std::memory_order_relaxed));
// Start the profiler if this was the first observer. Setting/resetting
// kIsRunning isn't racy because it's performed based on `observers_.empty()`
// while holding `mutex_`.
if (profiler_was_stopped) {
SetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
}
DCHECK(profiling_state_.load(std::memory_order_relaxed) &
ProfilingStateFlag::kIsRunning);
}
void PoissonAllocationSampler::RemoveSamplesObserver(
SamplesObserver* observer) {
  // The following implementation (including ScopedMuteThreadSamples) will use
  // `thread_local`, which may cause a reentrancy issue. So temporarily
  // disable sampling on this thread by holding a ReentryGuard.
ReentryGuard guard;
ScopedMuteThreadSamples no_reentrancy_scope;
AutoLock lock(mutex_);
auto it = ranges::find(observers_, observer);
DCHECK(it != observers_.end());
observers_.erase(it);
// Stop the profiler if there are no more observers. Setting/resetting
// kIsRunning isn't racy because it's performed based on `observers_.empty()`
// while holding `mutex_`.
DCHECK(profiling_state_.load(std::memory_order_relaxed) &
ProfilingStateFlag::kIsRunning);
if (observers_.empty()) {
ResetProfilingStateFlag(ProfilingStateFlag::kIsRunning);
}
}
} // namespace base