1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
|
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2025 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated run-time profiling header
///
/// This file is not part of the Verilated public-facing API.
/// It is only for internal use by Verilated library routines.
///
//=============================================================================
#ifndef VERILATOR_VERILATED_PROFILER_H_
#define VERILATOR_VERILATED_PROFILER_H_
#include "verilatedos.h"
#include "verilated.h"
#include <array>
#include <atomic>
#include <cassert>
#include <string>
#include <type_traits>
#include <vector>
class VlExecutionProfiler;
class VlThreadPool;
//=============================================================================
// Macros to simplify generated code
// VL_EXEC_TRACE_ADD_RECORD(vlSymsp): when the profiler reached through
// 'vlSymsp->__Vm_executionProfilerp' reports enabled(), evaluate addRecord() on it.
// The expansion is an unbraced 'if' whose body is the addRecord() expression with no trailing
// semicolon, so whatever the caller writes directly after the macro (for example a
// '.mtaskBegin(...)' call on the returned record) becomes part of the guarded statement.
// NOTE(review): because the 'if' is unbraced, an 'else' written right after a use of this
// macro would bind to the macro's own 'if'.
#define VL_EXEC_TRACE_ADD_RECORD(vlSymsp) \
if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfilerp->enabled())) \
(vlSymsp)->__Vm_executionProfilerp->addRecord()
//=============================================================================
// Sample the high-precision counter used for profiling timestamps.
// The result is whatever VL_GET_CPU_TICK stores into its argument; this is documented to be
// 0x0 when no such counter is available.
VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
    uint64_t tickCount;  // Filled in by the VL_GET_CPU_TICK macro
    VL_GET_CPU_TICK(tickCount);
    return tickCount;
}
//=============================================================================
// Private class used by VlExecutionProfiler
// _VL_FOREACH_APPLY(macro, arg): invoke 'macro' with two arguments, the bare token 'arg' and
// its stringized form '#arg'.
#define _VL_FOREACH_APPLY(macro, arg) macro(arg, #arg)
// clang-format off
// FOREACH_VlExecutionRecord_TYPE(macro): the list of record types. Applies 'macro' once per
// type, in the order written here. VlExecutionRecord expands it once to generate the Type
// enumerators and once to generate the s_ascii name table, so both follow this order.
#define FOREACH_VlExecutionRecord_TYPE(macro) \
_VL_FOREACH_APPLY(macro, SECTION_PUSH) \
_VL_FOREACH_APPLY(macro, SECTION_POP) \
_VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
_VL_FOREACH_APPLY(macro, MTASK_END) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
_VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
// clang-format on
// One profiling event: the CPU tick sampled when the record is constructed, a type tag, and
// a payload whose meaningful member depends on the tag. A default-constructed record has no
// type or payload yet; exactly one of the public setters below is expected to fill it in.
class VlExecutionRecord final {
    friend class VlExecutionProfiler;

    // TYPES
    // Kind of event; enumerators are generated from FOREACH_VlExecutionRecord_TYPE
    enum class Type : uint8_t {
#define VL_FOREACH_MACRO(id, name) id,
        FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
    };

    // Printable name for each Type, generated from the same list and in the same order
    static constexpr const char* const s_ascii[] = {
#define VL_FOREACH_MACRO(id, name) name,
        FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
    };

    // Event-specific data. SECTION_POP, EXEC_GRAPH_BEGIN and EXEC_GRAPH_END store nothing here.
    union Payload {
        struct {
            const char* m_name;  // Name of section being entered
        } sectionPush;
        struct {
            uint32_t m_id;  // MTask id
            uint32_t m_predictStart;  // Time scheduler predicted would start
            uint32_t m_cpu;  // Executing CPU id
        } mtaskBegin;
        struct {
            uint32_t m_id;  // MTask id
            uint32_t m_predictCost;  // How long scheduler predicted would take
        } mtaskEnd;
    };

    // STATE
    // Members are declared in order of non-increasing alignment (asserted below) so that the
    // layout packs efficiently.
    const uint64_t m_tick = VL_CPU_TICK();  // Tick at construction
    Payload m_payload;  // The record payload
    Type m_type;  // The record type

    static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
    static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");

    static uint16_t getcpu();  // Return currently executing CPU id

public:
    // CONSTRUCTOR
    // Only m_tick is initialized here; type and payload are set by the methods below
    VlExecutionRecord() = default;

    // METHODS
    // Mark this record as entering the section called 'name' (the pointer is stored, not copied)
    void sectionPush(const char* name) {
        m_type = Type::SECTION_PUSH;
        m_payload.sectionPush.m_name = name;
    }
    // Mark this record as leaving the current section
    void sectionPop() { m_type = Type::SECTION_POP; }
    // Mark this record as the start of mtask 'id', also capturing the executing CPU id
    void mtaskBegin(uint32_t id, uint32_t predictStart) {
        m_type = Type::MTASK_BEGIN;
        m_payload.mtaskBegin.m_id = id;
        m_payload.mtaskBegin.m_predictStart = predictStart;
        m_payload.mtaskBegin.m_cpu = getcpu();
    }
    // Mark this record as the end of mtask 'id'
    void mtaskEnd(uint32_t id, uint32_t predictCost) {
        m_type = Type::MTASK_END;
        m_payload.mtaskEnd.m_id = id;
        m_payload.mtaskEnd.m_predictCost = predictCost;
    }
    // Mark this record as the start of an execution graph
    void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
    // Mark this record as the end of an execution graph
    void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
};

static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
              "VlExecutionRecord should be trivially destructible for fast buffer clearing");
//=============================================================================
// VlExecutionProfiler is for collecting profiling data about model execution.
// Records are appended to a per-thread trace buffer through addRecord(); all other methods
// are only declared in this header and defined elsewhere.
class VlExecutionProfiler final : public VerilatedVirtualBase {
// CONSTANTS
// In order to try to avoid dynamic memory allocations during the actual profiling phase,
// trace buffers are pre-allocated to be able to hold [a multiple] of this many records.
static constexpr size_t RESERVED_TRACE_CAPACITY = 4096;
// TYPES
// Execution traces are recorded into thread local vectors. We can append records of profiling
// events to this vector with very low overhead, and then dump them out later. This prevents
// the overhead of printf/malloc/IO from corrupting the profiling data. It's super cheap to
// append a VlExecutionRecord on the end of a pre-allocated vector; this is the only cost we
// pay in real-time during a profiling cycle. Internal note: Globals may multi-construct, see
// verilated.cpp top.
using ExecutionTrace = std::vector<VlExecutionRecord>;
// STATE
VerilatedContext& m_context; // The context this profiler is under
static thread_local ExecutionTrace t_trace; // thread-local trace buffers
// Mutex guarding m_traceps (see the VL_GUARDED_BY annotation below)
mutable VerilatedMutex m_mutex;
// Map from thread id to &t_trace of given thread
std::map<uint32_t, ExecutionTrace*> m_traceps VL_GUARDED_BY(m_mutex);
bool m_enabled = false; // Is profiling currently enabled
uint64_t m_tickBegin = 0; // Sample time (rdtsc() on x86) at beginning of collection
uint64_t m_lastStartReq = 0; // Last requested profiling start (in simulation time)
uint32_t m_windowCount = 0; // Track our position in the cache warmup and profile window
public:
// CONSTRUCTOR
explicit VlExecutionProfiler(VerilatedContext& context);
~VlExecutionProfiler() override = default;
// METHODS
// Is profiling enabled
bool enabled() const { return m_enabled; }
// Append a trace record to the trace buffer of the current thread.
// The record is default-constructed in place, which samples the CPU tick; the caller sets its
// type and payload through the returned reference. This function does not test enabled()
// itself; the VL_EXEC_TRACE_ADD_RECORD macro performs that check before calling it.
static VlExecutionRecord& addRecord() {
t_trace.emplace_back();
return t_trace.back();
}
// Configure profiler (called in beginning of 'eval')
void configure();
// Setup profiling on a particular thread;
void setupThread(uint32_t threadId);
// Clear all profiling data
void clear() VL_MT_SAFE_EXCLUDES(m_mutex);
// Write profiling data into file
void dump(const char* filenamep, uint64_t tickEnd) VL_MT_SAFE_EXCLUDES(m_mutex);
// Passed to VerilatedContext to create the VlExecutionProfiler profiler instance
static VerilatedVirtualBase* construct(VerilatedContext& context);
};
//=============================================================================
// VlPgoProfiler is for collecting profiling data for PGO.
// It accumulates CPU ticks into a fixed set of N_Entries counters and writes one line per
// registered counter with write(). Counter indices passed to the methods below must be less
// than N_Entries; this is only checked in addCounter(), inside VL_DEBUG_IF.
template <std::size_t N_Entries>
class VlPgoProfiler final {
    // TYPES
    struct Record final {
        const std::string m_name;  // Hashed name of mtask/etc
        const size_t m_counterNumber = 0;  // Which counter has data
    };

    // STATE
    // Counters are stored packed, all together to reduce cache effects.
    // They are value-initialized to zero: startCounter()/stopCounter() only ever adjust the
    // stored value, so without the initializer a default-initialized profiler would report
    // indeterminate totals.
    std::array<uint64_t, N_Entries> m_counters{};  // Time spent on this record
    std::vector<Record> m_records;  // Record information

public:
    // METHODS
    VlPgoProfiler() = default;
    ~VlPgoProfiler() = default;
    // Write the collected data to 'filename', tagging each line with model name 'modelp'
    void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
    // Register counter index 'counter' under 'name' so that write() reports it
    void addCounter(size_t counter, const std::string& name) {
        VL_DEBUG_IF(assert(counter < N_Entries););
        m_records.emplace_back(Record{name, counter});
    }
    // Begin timing on 'counter'
    void startCounter(size_t counter) {
        // -= so when we add end time in stopCounter, the net effect is adding the difference,
        // without needing to hold onto a temporary
        m_counters[counter] -= VL_CPU_TICK();
    }
    // End timing on 'counter'; together with startCounter() this adds the elapsed ticks
    void stopCounter(size_t counter) { m_counters[counter] += VL_CPU_TICK(); }
};
template <std::size_t N_Entries>
void VlPgoProfiler<N_Entries>::write(const char* modelp, const std::string& filename) VL_MT_SAFE {
static VerilatedMutex s_mutex;
const VerilatedLockGuard lock{s_mutex};
// On the first call we create the file. On later calls we append.
// So when we have multiple models in an executable, possibly even
// running on different threads, each will have a different symtab so
// each will collect is own data correctly. However when each is
// destroyed we need to get all the data, not keep overwriting and only
// get the last model's data.
static bool s_firstCall = true;
VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););
FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a");
if (VL_UNLIKELY(!fp)) {
VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
}
s_firstCall = false;
// TODO Perhaps merge with verilated_coverage output format, so can
// have a common merging and reporting tool, etc.
fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
fprintf(fp, "`verilator_config\n");
for (const Record& rec : m_records) {
fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp,
rec.m_name.c_str(), m_counters[rec.m_counterNumber]);
}
std::fclose(fp);
}
#endif
|