1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280
|
//===-- working_set.cpp ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of EfficiencySanitizer, a family of performance tuners.
//
// This file contains working-set-specific code.
//===----------------------------------------------------------------------===//
#include "working_set.h"
#include "esan.h"
#include "esan_circular_buffer.h"
#include "esan_flags.h"
#include "esan_shadow.h"
#include "esan_sideline.h"
#include "sanitizer_common/sanitizer_procmaps.h"
// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
// cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
// cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
// lower frequencies, each bit to the left from the lowest bit stepping
// down the frequency by 2 to the power of getFlags()->snapshot_step.
// Thus we have something like this:
// Bit 0: Since last sample
// Bit 1: Since last 2^2 samples
// Bit 2: Since last 2^4 samples
// Bit 3: ...
// Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
typedef unsigned char byte;
namespace __esan {
// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;
// See the shadow byte layout description above.
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
static const u32 CurWorkingSetBitIdx = 0;
static const byte ShadowAccessedVal =
(1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);
static SidelineThread Thread;
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;
// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
u32 CircularBufferSizes[NumFreq] = {
// These are each mmap-ed so our minimum is one page.
32*1024,
16*1024,
8*1024,
4*1024,
4*1024,
4*1024,
4*1024,
4*1024,
};
void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
bool IsWrite) {
if (Size == 0)
return;
SIZE_T I = 0;
uptr LineSize = getFlags()->cache_line_size;
// As Addr+Size could overflow at the top of a 32-bit address space,
// we avoid the simpler formula that rounds the start and end.
SIZE_T NumLines = Size / LineSize +
// Add any extra at the start or end adding on an extra line:
(LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
byte *Shadow = (byte *)appToShadow(Addr);
// Write shadow bytes until we're word-aligned.
while (I < NumLines && (uptr)Shadow % 4 != 0) {
if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
*Shadow |= ShadowAccessedVal;
++Shadow;
++I;
}
// Write whole shadow words at a time.
// Using a word-stride loop improves the runtime of a microbenchmark of
// memset calls by 10%.
u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
while (I + 4 <= NumLines) {
if ((*(u32*)Shadow & WordValue) != WordValue)
*(u32*)Shadow |= WordValue;
Shadow += 4;
I += 4;
}
// Write any trailing shadow bytes.
while (I < NumLines) {
if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
*Shadow |= ShadowAccessedVal;
++Shadow;
++I;
}
}
// This routine will word-align ShadowStart and ShadowEnd prior to scanning.
// It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
uptr ShadowEnd) {
u32 WorkingSetSize = 0;
u32 ByteValue = 0x1 << BitIdx;
u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
ByteValue << 24;
// Get word aligned start.
ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
// Do not clear the bit that measures access during the entire execution.
bool Clear = BitIdx < TotalWorkingSetBitIdx;
for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
if ((*Ptr & WordValue) != 0) {
byte *BytePtr = (byte *)Ptr;
for (u32 j = 0; j < sizeof(u32); ++j) {
if (BytePtr[j] & ByteValue) {
++WorkingSetSize;
if (Accum) {
// Accumulate to the lower-frequency bit to the left.
BytePtr[j] |= (ByteValue << 1);
}
}
}
if (Clear) {
// Clear this bit from every shadow byte.
*Ptr &= ~WordValue;
}
}
}
return WorkingSetSize;
}
// Scan shadow memory to calculate the number of cache lines being accessed,
// i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
// We also clear the lowest bits (most recent working set snapshot).
// We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
// measures the access during the entire execution and should never be cleared.
static u32 computeWorkingSizeAndReset(u32 BitIdx) {
u32 WorkingSetSize = 0;
MemoryMappingLayout MemIter(true/*cache*/);
MemoryMappedSegment Segment;
while (MemIter.Next(&Segment)) {
VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__,
Segment.start, Segment.end, Segment.protection,
isAppMem(Segment.start), isShadowMem(Segment.start));
if (isShadowMem(Segment.start) && Segment.IsWritable()) {
VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
Segment.end);
WorkingSetSize +=
countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
}
}
return WorkingSetSize;
}
// This is invoked from a signal handler but in a sideline thread doing nothing
// else so it is a little less fragile than a typical signal handler.
static void takeSample(void *Arg) {
u32 BitIdx = CurWorkingSetBitIdx;
u32 Freq = 1;
++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
u32 NumLines = computeWorkingSizeAndReset(BitIdx);
VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
SnapshotNum, BitIdx, Freq, NumLines);
SizePerFreq[BitIdx].push_back(NumLines);
Freq = Freq << getFlags()->snapshot_step;
BitIdx++;
}
}
unsigned int getSampleCountWorkingSet()
{
return SnapshotNum;
}
// Initialization that must be done before any instrumented code is executed.
void initializeShadowWorkingSet() {
CHECK(getFlags()->cache_line_size == CacheLineSize);
registerMemoryFaultHandler();
}
void initializeWorkingSet() {
if (getFlags()->record_snapshots) {
for (u32 i = 0; i < NumFreq; ++i)
SizePerFreq[i].initialize(CircularBufferSizes[i]);
Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
}
}
static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
if (MilliSec > 600000) {
Unit = "min";
return MilliSec / 60000;
} else if (MilliSec > 10000) {
Unit = "sec";
return MilliSec / 1000;
} else {
Unit = "ms";
return MilliSec;
}
}
static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
// We need a constant to avoid software divide support:
static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
static const u32 MegabyteCachelines = KilobyteCachelines << 10;
if (NumOfCachelines > 10 * MegabyteCachelines) {
Unit = "MB";
return NumOfCachelines / MegabyteCachelines;
} else if (NumOfCachelines > 10 * KilobyteCachelines) {
Unit = "KB";
return NumOfCachelines / KilobyteCachelines;
} else {
Unit = "Bytes";
return NumOfCachelines * CacheLineSize;
}
}
void reportWorkingSet() {
const char *Unit;
if (getFlags()->record_snapshots) {
u32 Freq = 1;
Report(" Total number of samples: %u\n", SnapshotNum);
for (u32 i = 0; i < NumFreq; ++i) {
u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
// FIXME: report whether we wrapped around and thus whether we
// have data on the whole run or just the last N samples.
for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
SizePerFreq[i][j]);
}
Freq = Freq << getFlags()->snapshot_step;
}
}
// Get the working set size for the entire execution.
u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
Report(" %s: the total working set size: %u %s (%u cache lines)\n",
SanitizerToolName, Size, Unit, NumOfCachelines);
}
int finalizeWorkingSet() {
if (getFlags()->record_snapshots)
Thread.joinThread();
reportWorkingSet();
if (getFlags()->record_snapshots) {
for (u32 i = 0; i < NumFreq; ++i)
SizePerFreq[i].free();
}
return 0;
}
} // namespace __esan
|