//===-- X86Counter.cpp ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "X86Counter.h"

#if defined(__linux__) && defined(HAVE_LIBPFM) && \
    defined(LIBPFM_HAS_FIELD_CYCLES)

// FIXME: Use appropriate wrappers for poll.h and mman.h
// to support Windows and remove this linux-only guard.

#include "llvm/Support/Endian.h"
#include "llvm/Support/Errc.h"

#include <perfmon/perf_event.h>
#include <perfmon/pfmlib.h>
#include <perfmon/pfmlib_perf_event.h>

#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <vector>

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

namespace llvm {
namespace exegesis {

// Number of entries in the LBR.
static constexpr int kLbrEntries = 16;
static constexpr size_t kBufferPages = 8;
static const size_t kDataBufferSize = kBufferPages * getpagesize();

// First page is reserved for perf_event_mmap_page. Data buffer starts on
// the next page, so we allocate one more page.
static const size_t kMappedBufferSize = (kBufferPages + 1) * getpagesize();
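
// For illustration, assuming 4 KiB pages: kDataBufferSize is 8 * 4096 ==
// 32768 bytes of sample data, and kMappedBufferSize is 9 * 4096 == 36864
// bytes, with the extra leading page holding the perf_event_mmap_page
// metadata (the data_head/data_tail ring-buffer pointers).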

// Waits for the LBR perf events.
static int pollLbrPerfEvent(const int FileDescriptor) {
  struct pollfd PollFd;
  PollFd.fd = FileDescriptor;
  PollFd.events = POLLIN;
  PollFd.revents = 0;
  return poll(&PollFd, 1 /* num of fds */, 10000 /* timeout in ms */);
}
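
// Per poll(2), the call above returns a positive count when the event file
// descriptor is ready, 0 on timeout, and -1 on error; doReadCounter below
// maps each case to a distinct outcome.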

// Copies the data buffer into Buf, given the pointer to the mmapped buffer.
static void copyDataBuffer(void *MMappedBuffer, char *Buf, uint64_t Tail,
                           size_t DataSize) {
  // First page is reserved for perf_event_mmap_page. Data buffer starts on
  // the next page.
  char *Start = reinterpret_cast<char *>(MMappedBuffer) + getpagesize();
  // The LBR buffer is a cyclic buffer; linearize it into Buf.
  uint64_t Offset = Tail % kDataBufferSize;
  size_t CopySize = kDataBufferSize - Offset;
  // This may copy more than DataSize bytes, which is harmless because Buf is
  // kDataBufferSize bytes long.
  memcpy(Buf, Start + Offset, CopySize);
  if (CopySize >= DataSize)
    return;
  // The data wrapped around; copy the remainder from the start of the ring.
  memcpy(Buf + CopySize, Start, Offset);
}
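
// A worked example with illustrative numbers (assuming 4 KiB pages, so
// kDataBufferSize == 32768): if Tail == 40000, then Offset == 7232 and
// CopySize == 25536, so the first memcpy copies ring bytes [7232, 32768)
// into Buf; if DataSize exceeds 25536, the second memcpy appends the
// wrapped-around bytes [0, 7232).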

// Parses the given data buffer for PERF_RECORD_SAMPLE records and fills
// CycleArray with the cycle counts of the LBR entries that fall inside the
// [From, To) address range (or all entries when From/To are null).
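// With sample_type == PERF_SAMPLE_BRANCH_STACK, each PERF_RECORD_SAMPLE in
// the buffer is laid out roughly as follows (see perf_event_open(2); the
// field names here are only illustrative):
//   struct Sample {
//     struct perf_event_header Header;
//     uint64_t NumBranches;               // number of LBR entries that follow
//     struct perf_branch_entry Entries[]; // NumBranches entries
//   };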
static llvm::Error parseDataBuffer(const char *DataBuf, size_t DataSize,
                                   const void *From, const void *To,
                                   llvm::SmallVector<int64_t, 4> *CycleArray) {
  // Treat null From/To as an unbounded address range.
  const uint64_t BlockStart = From == nullptr
                                  ? std::numeric_limits<uint64_t>::min()
                                  : reinterpret_cast<uint64_t>(From);
  const uint64_t BlockEnd = To == nullptr
                                ? std::numeric_limits<uint64_t>::max()
                                : reinterpret_cast<uint64_t>(To);
  const char *DataPtr = DataBuf;
  while (DataPtr < DataBuf + DataSize) {
    struct perf_event_header Header;
    memcpy(&Header, DataPtr, sizeof(struct perf_event_header));
    if (Header.type != PERF_RECORD_SAMPLE) {
      // Ignore non-sample records.
      DataPtr += Header.size;
      continue;
    }
    DataPtr += sizeof(Header);
    uint64_t Count = llvm::support::endian::read64(DataPtr, support::native);
    DataPtr += sizeof(Count);
    struct perf_branch_entry Entry;
    memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
    // Read the perf_branch_entry array.
    for (uint64_t I = 0; I < Count; ++I) {
      // Keep only entries that fall within the benchmarked code.
      if (BlockStart <= Entry.from && BlockEnd >= Entry.to)
        CycleArray->push_back(Entry.cycles);
      if (I == Count - 1)
        // We've reached the last entry.
        return llvm::Error::success();
      // Advance to the next entry.
      DataPtr += sizeof(Entry);
      memcpy(&Entry, DataPtr, sizeof(struct perf_branch_entry));
    }
  }
  return llvm::make_error<llvm::StringError>("Unable to parse data buffer.",
                                             llvm::errc::io_error);
}

X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) {
  assert(SamplingPeriod > 0 && "SamplingPeriod must be positive");
  EventString = "BR_INST_RETIRED.NEAR_TAKEN";
  Attr = new perf_event_attr();
  Attr->size = sizeof(*Attr);
  Attr->type = PERF_TYPE_RAW;
  // FIXME: This is Skylake's encoding; other microarchitectures may differ.
  Attr->config = 0x20c4; // BR_INST_RETIRED.NEAR_TAKEN
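  // For reference, the raw encoding decodes as event select 0xC4
  // (BR_INST_RETIRED) with umask 0x20 (NEAR_TAKEN) in the Intel SDM
  // performance-monitoring event tables.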
  Attr->sample_type = PERF_SAMPLE_BRANCH_STACK;
  // No need to specify "USER" here because we already exclude the hypervisor
  // and the kernel below.
  Attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY;
  Attr->sample_period = SamplingPeriod;
  Attr->wakeup_events = 1; // We need this even when using ioctl REFRESH.
  Attr->disabled = 1;
  Attr->exclude_kernel = 1;
  Attr->exclude_hv = 1;
  Attr->read_format = PERF_FORMAT_GROUP;
  FullQualifiedEventString = EventString;
}

X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent)
    : Counter(std::move(NewEvent)) {
  MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE,
                       MAP_SHARED, FileDescriptor, 0);
  if (MMappedBuffer == MAP_FAILED)
    llvm::errs() << "Failed to mmap buffer.";
}

X86LbrCounter::~X86LbrCounter() {
  if (0 != munmap(MMappedBuffer, kMappedBufferSize))
    llvm::errs() << "Failed to munmap buffer.";
}

void X86LbrCounter::start() {
  ioctl(FileDescriptor, PERF_EVENT_IOC_REFRESH, 1024 /* kMaxPollsPerFd */);
}
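
// Note on the REFRESH ioctl used in start() above: per perf_event_open(2),
// PERF_EVENT_IOC_REFRESH enables the event for the given number of overflows
// (here 1024) and then disables it, whereas PERF_EVENT_IOC_ENABLE leaves the
// event running until it is explicitly disabled.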

llvm::Error X86LbrCounter::checkLbrSupport() {
  // Do a sample read and check if the results contain non-zero values.
  X86LbrCounter counter(X86LbrPerfEvent(123));
  counter.start();

  // The volatile load prevents the compiler from unrolling the loop and
  // getting rid of all the branches; we need at least kLbrEntries taken
  // branches to fill the LBR.
  int Sum = 0;
  int V = 1;
  volatile int *P = &V;
  auto TimeLimit =
      std::chrono::high_resolution_clock::now() + std::chrono::microseconds(5);
  for (int I = 0;
       I < kLbrEntries || std::chrono::high_resolution_clock::now() < TimeLimit;
       ++I) {
    Sum += *P;
  }
  counter.stop();
  (void)Sum;

  auto ResultOrError = counter.doReadCounter(nullptr, nullptr);
  if (ResultOrError) {
    // If there is at least one non-zero entry, then LBR is supported.
    for (const int64_t &Value : ResultOrError.get())
      if (Value != 0)
        return Error::success();
  } else {
    // Consume the error so the Expected destructor does not assert; we report
    // not_supported below in either case.
    consumeError(ResultOrError.takeError());
  }

  return llvm::make_error<llvm::StringError>(
      "LBR format with cycles is not supported on the host.",
      llvm::errc::not_supported);
}
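
// A caller would typically gate LBR-based measurements on the check above,
// e.g. (illustrative sketch):
//   if (llvm::Error Err = X86LbrCounter::checkLbrSupport()) {
//     llvm::consumeError(std::move(Err)); // Fall back to a non-LBR counter.
//   }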

llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::readOrError(StringRef FunctionBytes) const {
  // Disable the event before reading.
  ioctl(FileDescriptor, PERF_EVENT_IOC_DISABLE, 0);

  // Find the boundaries of the function so that we can filter the LBRs
  // and keep only the relevant records.
  if (FunctionBytes.empty())
    return llvm::make_error<llvm::StringError>("Empty function bytes",
                                               llvm::errc::invalid_argument);
  const void *From = reinterpret_cast<const void *>(FunctionBytes.data());
  const void *To = reinterpret_cast<const void *>(FunctionBytes.data() +
                                                  FunctionBytes.size());
  return doReadCounter(From, To);
}

llvm::Expected<llvm::SmallVector<int64_t, 4>>
X86LbrCounter::doReadCounter(const void *From, const void *To) const {
  // The max number of time-outs/retries before we give up.
  static constexpr int kMaxTimeouts = 160;

  // Parse the LBR buffer and fill CycleArray with the sequence of cycle
  // counts from the buffer.
  llvm::SmallVector<int64_t, 4> CycleArray;
  auto DataBuf = std::make_unique<char[]>(kDataBufferSize);
  int NumTimeouts = 0;
  int PollResult = 0;
  while (PollResult <= 0) {
    PollResult = pollLbrPerfEvent(FileDescriptor);
    if (PollResult > 0)
      break;
    if (PollResult == -1)
      return llvm::make_error<llvm::StringError>("Cannot poll LBR perf event.",
                                                 llvm::errc::io_error);
    if (NumTimeouts++ >= kMaxTimeouts)
      return llvm::make_error<llvm::StringError>(
          "LBR polling still timed out after the maximum number of attempts.",
          llvm::errc::device_or_resource_busy);
  }

  struct perf_event_mmap_page Page;
  memcpy(&Page, MMappedBuffer, sizeof(struct perf_event_mmap_page));

  const uint64_t DataTail = Page.data_tail;
  const uint64_t DataHead = Page.data_head;
  // The kernel requires a memory barrier after reading data_head; see the
  // perf_event_mmap_page documentation in linux/perf_event.h.
  std::atomic_thread_fence(std::memory_order_acq_rel);

  const size_t DataSize = DataHead - DataTail;
  if (DataSize > kDataBufferSize)
    return llvm::make_error<llvm::StringError>(
        "DataSize larger than buffer size.", llvm::errc::invalid_argument);

  copyDataBuffer(MMappedBuffer, DataBuf.get(), DataTail, DataSize);
  llvm::Error Err =
      parseDataBuffer(DataBuf.get(), DataSize, From, To, &CycleArray);
  if (!Err)
    return CycleArray;
  return std::move(Err);
}
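
// End-to-end usage, as an illustrative sketch (SnippetBytes stands for the
// StringRef covering the benchmarked code):
//   X86LbrCounter Counter(X86LbrPerfEvent(/*SamplingPeriod=*/42));
//   Counter.start();
//   // ... execute the benchmarked snippet ...
//   Counter.stop();
//   auto CyclesOrErr = Counter.readOrError(SnippetBytes);
//   // CyclesOrErr holds the per-branch cycle counts filtered to the
//   // snippet's address range, or an Error.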
} // namespace exegesis
} // namespace llvm
#endif // defined(__linux__) && defined(HAVE_LIBPFM) &&
       // defined(LIBPFM_HAS_FIELD_CYCLES)