File: verilated_profiler.h

package info (click to toggle)
verilator 5.032-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, trixie
  • size: 93,932 kB
  • sloc: cpp: 131,288; python: 19,365; ansic: 10,234; yacc: 5,733; lex: 1,905; makefile: 1,229; sh: 489; perl: 282; fortran: 22
file content (264 lines) | stat: -rw-r--r-- 9,938 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
// -*- mode: C++; c-file-style: "cc-mode" -*-
//=============================================================================
//
// Code available from: https://verilator.org
//
// Copyright 2012-2025 by Wilson Snyder. This program is free software; you
// can redistribute it and/or modify it under the terms of either the GNU
// Lesser General Public License Version 3 or the Perl Artistic License
// Version 2.0.
// SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0
//
//=============================================================================
///
/// \file
/// \brief Verilated run-time profiling header
///
/// This file is not part of the Verilated public-facing API.
/// It is only for internal use by Verilated library routines.
///
//=============================================================================

#ifndef VERILATOR_VERILATED_PROFILER_H_
#define VERILATOR_VERILATED_PROFILER_H_

#include "verilatedos.h"

#include "verilated.h"

#include <array>
#include <atomic>
#include <cassert>
#include <cinttypes>
#include <cstdio>
#include <map>
#include <string>
#include <type_traits>
#include <vector>

class VlExecutionProfiler;
class VlThreadPool;

//=============================================================================
// Macros to simplify generated code

// Conditionally append a record to the execution trace.
// Expands to an 'if' on the profiler's enabled() flag whose body is the
// addRecord() call, deliberately left without a terminating ';' so that the use
// site continues the expression with a call on the returned record
// (presumably e.g. 'VL_EXEC_TRACE_ADD_RECORD(vlSymsp).sectionPop();').
// NOTE(review): being a bare, unbraced 'if', the expansion must not be directly
// followed by an 'else' at the use site.
#define VL_EXEC_TRACE_ADD_RECORD(vlSymsp) \
    if (VL_UNLIKELY((vlSymsp)->__Vm_executionProfilerp->enabled())) \
    (vlSymsp)->__Vm_executionProfilerp->addRecord()

//=============================================================================
// Sample the counter used as the time base for profiling.
// Returns the value VL_GET_CPU_TICK stores into its argument: a high-precision
// counter, or 0x0 if not available.
VL_ATTR_ALWINLINE QData VL_CPU_TICK() {
    uint64_t tickNow;
    VL_GET_CPU_TICK(tickNow);
    return tickNow;
}

//=============================================================================
// Private class used by VlExecutionProfiler

// Helper: invoke 'macro' with the bare identifier 'arg' and its stringified form
#define _VL_FOREACH_APPLY(macro, arg) macro(arg, #arg)

// X-macro list of every VlExecutionRecord type. 'macro' is expanded once per
// entry, in the order listed, as macro(ID, "ID"). VlExecutionRecord uses it to
// generate both the Type enumerators and the s_ascii name table, so the two
// always stay in the same order; add new record types here only.
// clang-format off
#define FOREACH_VlExecutionRecord_TYPE(macro) \
    _VL_FOREACH_APPLY(macro, SECTION_PUSH) \
    _VL_FOREACH_APPLY(macro, SECTION_POP) \
    _VL_FOREACH_APPLY(macro, MTASK_BEGIN) \
    _VL_FOREACH_APPLY(macro, MTASK_END) \
    _VL_FOREACH_APPLY(macro, EXEC_GRAPH_BEGIN) \
    _VL_FOREACH_APPLY(macro, EXEC_GRAPH_END)
// clang-format on

// A single timestamped event in an execution trace.
// A record samples the tick counter when it is constructed; its type and payload
// are then filled in by calling exactly one of the public methods below.
// Everything else is private and reachable only by the VlExecutionProfiler friend.
class VlExecutionRecord final {
    friend class VlExecutionProfiler;

    // TYPES
    // Record type; enumerators are generated from FOREACH_VlExecutionRecord_TYPE
    enum class Type : uint8_t {
#define VL_FOREACH_MACRO(id, name) id,
        FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
    };

    // Record type names as strings, generated from the same list and therefore
    // in the same order as the Type enumerators
    static constexpr const char* const s_ascii[] = {
#define VL_FOREACH_MACRO(id, name) name,
        FOREACH_VlExecutionRecord_TYPE(VL_FOREACH_MACRO)
#undef VL_FOREACH_MACRO
    };

    // Type-specific data. Which member is meaningful is determined by m_type;
    // SECTION_POP, EXEC_GRAPH_BEGIN and EXEC_GRAPH_END carry no payload.
    union Payload {
        struct {
            const char* m_name;  // Name of section being entered
        } sectionPush;
        struct {
            uint32_t m_id;  // MTask id
            uint32_t m_predictStart;  // Time scheduler predicted would start
            uint32_t m_cpu;  // Executing CPU id
        } mtaskBegin;
        struct {
            uint32_t m_id;  // MTask id
            uint32_t m_predictCost;  // How long scheduler predicted would take
        } mtaskEnd;
    };

    // STATE
    // Layout below allows efficient packing.
    // Members are ordered by non-increasing alignment (enforced by the
    // static_asserts) so that no padding is needed between them.
    const uint64_t m_tick = VL_CPU_TICK();  // Tick at construction
    Payload m_payload;  // The record payload
    Type m_type;  // The record type (not set by the constructor; set by the methods below)
    static_assert(alignof(uint64_t) >= alignof(Payload), "Padding not allowed");
    static_assert(alignof(Payload) >= alignof(Type), "Padding not allowed");

    static uint16_t getcpu();  // Return currently executing CPU id

public:
    // CONSTRUCTOR
    // Only samples m_tick; m_payload and m_type are left for the methods below to fill in
    VlExecutionRecord() = default;

    // METHODS
    // Mark this record as entering the section 'name'.
    // Only the pointer is stored (the string is not copied), so the pointed-to
    // string needs to remain valid for as long as the record may be read.
    void sectionPush(const char* name) {
        m_payload.sectionPush.m_name = name;
        m_type = Type::SECTION_PUSH;
    }
    // Mark this record as leaving a section (no payload)
    void sectionPop() { m_type = Type::SECTION_POP; }
    // Mark this record as the start of mtask 'id', also recording the scheduler's
    // predicted start time and the CPU id reported by getcpu()
    void mtaskBegin(uint32_t id, uint32_t predictStart) {
        m_payload.mtaskBegin.m_id = id;
        m_payload.mtaskBegin.m_predictStart = predictStart;
        m_payload.mtaskBegin.m_cpu = getcpu();
        m_type = Type::MTASK_BEGIN;
    }
    // Mark this record as the end of mtask 'id', also recording the scheduler's
    // predicted cost
    void mtaskEnd(uint32_t id, uint32_t predictCost) {
        m_payload.mtaskEnd.m_id = id;
        m_payload.mtaskEnd.m_predictCost = predictCost;
        m_type = Type::MTASK_END;
    }
    // Mark this record as the beginning / end of an execution graph (no payload)
    void execGraphBegin() { m_type = Type::EXEC_GRAPH_BEGIN; }
    void execGraphEnd() { m_type = Type::EXEC_GRAPH_END; }
};

// Records live in std::vector trace buffers; a trivial destructor means the
// buffer can be cleared without running per-element destructors.
static_assert(std::is_trivially_destructible<VlExecutionRecord>::value,
              "VlExecutionRecord should be trivially destructible for fast buffer clearing");

//=============================================================================
// VlExecutionProfiler is for collecting profiling data about model execution

// Collects VlExecutionRecord events into per-thread trace buffers while enabled,
// and provides clear()/dump() to discard or write out what was collected.
class VlExecutionProfiler final : public VerilatedVirtualBase {
    // CONSTANTS

    // In order to try to avoid dynamic memory allocations during the actual profiling phase,
    // trace buffers are pre-allocated to be able to hold [a multiple] of this many records.
    static constexpr size_t RESERVED_TRACE_CAPACITY = 4096;

    // TYPES

    // Execution traces are recorded into thread local vectors. We can append records of profiling
    // events to this vector with very low overhead, and then dump them out later. This prevents
    // the overhead of printf/malloc/IO from corrupting the profiling data. It's super cheap to
    // append a VlExecutionRecord on the end of a pre-allocated vector; this is the only cost we
    // pay in real-time during a profiling cycle. Internal note: Globals may multi-construct, see
    // verilated.cpp top.
    using ExecutionTrace = std::vector<VlExecutionRecord>;

    // STATE
    VerilatedContext& m_context;  // The context this profiler is under
    static thread_local ExecutionTrace t_trace;  // thread-local trace buffers
    mutable VerilatedMutex m_mutex;  // Protects m_traceps
    // Map from thread id to &t_trace of given thread
    std::map<uint32_t, ExecutionTrace*> m_traceps VL_GUARDED_BY(m_mutex);

    bool m_enabled = false;  // Is profiling currently enabled

    uint64_t m_tickBegin = 0;  // Sample time (rdtsc() on x86) at beginning of collection
    uint64_t m_lastStartReq = 0;  // Last requested profiling start (in simulation time)
    uint32_t m_windowCount = 0;  // Track our position in the cache warmup and profile window

public:
    // CONSTRUCTOR
    explicit VlExecutionProfiler(VerilatedContext& context);
    ~VlExecutionProfiler() override = default;

    // METHODS

    // Is profiling enabled
    bool enabled() const { return m_enabled; }
    // Append a trace record to the trace buffer of the current thread.
    // The new record is default constructed in place (which samples its tick) and
    // a reference to it is returned so the caller can set its type/payload.
    // As with any std::vector element, the reference is only good until the
    // buffer is next modified. Static: it touches only the thread-local buffer
    // and does not itself check enabled().
    static VlExecutionRecord& addRecord() {
        t_trace.emplace_back();
        return t_trace.back();
    }
    // Configure profiler (called in beginning of 'eval')
    void configure();
    // Setup profiling on a particular thread;
    void setupThread(uint32_t threadId);
    // Clear all profiling data
    void clear() VL_MT_SAFE_EXCLUDES(m_mutex);
    // Write profiling data into file
    void dump(const char* filenamep, uint64_t tickEnd) VL_MT_SAFE_EXCLUDES(m_mutex);

    // Passed to VerilatedContext to create the VlExecutionProfiler profiler instance
    static VerilatedVirtualBase* construct(VerilatedContext& context);
};

//=============================================================================
// VlPgoProfiler is for collecting profiling data for PGO

// Accumulates elapsed tick counts into N_Entries counters and writes them out as
// profile-guided optimization data.
// Usage: register each counter once with addCounter(), bracket the code to be
// measured with startCounter()/stopCounter(), then call write().
template <std::size_t N_Entries>
class VlPgoProfiler final {
    // TYPES
    struct Record final {
        const std::string m_name;  // Hashed name of mtask/etc
        const size_t m_counterNumber = 0;  // Which counter has data
    };

    // Counters are stored packed, all together to reduce cache effects.
    // Value-initialized so that every counter starts at zero; without the
    // initializer a default-initialized profiler would accumulate onto (and
    // write() would print) indeterminate values.
    std::array<uint64_t, N_Entries> m_counters{};  // Time spent on this record
    std::vector<Record> m_records;  // Record information

public:
    // METHODS
    VlPgoProfiler() = default;
    ~VlPgoProfiler() = default;
    // Write one profile_data line per registered counter into 'filename',
    // tagged with the model name 'modelp'
    void write(const char* modelp, const std::string& filename) VL_MT_SAFE;
    // Register counter index 'counter' under 'name' so that write() reports it.
    // 'counter' must be less than N_Entries (checked only via VL_DEBUG_IF).
    void addCounter(size_t counter, const std::string& name) {
        VL_DEBUG_IF(assert(counter < N_Entries););
        m_records.emplace_back(Record{name, counter});
    }
    // Begin timing counter 'counter'; must be paired with stopCounter().
    // The index is not range checked here.
    void startCounter(size_t counter) {
        // -= so when we add end time in stopCounter, the net effect is adding the difference,
        // without needing to hold onto a temporary. The intermediate value wraps (unsigned
        // arithmetic), which the matching += undoes.
        m_counters[counter] -= VL_CPU_TICK();
    }
    // End timing counter 'counter', completing the (end - start) accumulation
    void stopCounter(size_t counter) { m_counters[counter] += VL_CPU_TICK(); }
};

template <std::size_t N_Entries>
void VlPgoProfiler<N_Entries>::write(const char* modelp, const std::string& filename) VL_MT_SAFE {
    static VerilatedMutex s_mutex;
    const VerilatedLockGuard lock{s_mutex};

    // On the first call we create the file.  On later calls we append.
    // So when we have multiple models in an executable, possibly even
    // running on different threads, each will have a different symtab so
    // each will collect is own data correctly.  However when each is
    // destroyed we need to get all the data, not keep overwriting and only
    // get the last model's data.
    static bool s_firstCall = true;

    VL_DEBUG_IF(VL_DBG_MSGF("+prof+vlt+file writing to '%s'\n", filename.c_str()););

    FILE* const fp = std::fopen(filename.c_str(), s_firstCall ? "w" : "a");
    if (VL_UNLIKELY(!fp)) {
        VL_FATAL_MT(filename.c_str(), 0, "", "+prof+vlt+file file not writable");
    }
    s_firstCall = false;

    // TODO Perhaps merge with verilated_coverage output format, so can
    // have a common merging and reporting tool, etc.
    fprintf(fp, "// Verilated model profile-guided optimization data dump file\n");
    fprintf(fp, "`verilator_config\n");

    for (const Record& rec : m_records) {
        fprintf(fp, "profile_data -model \"%s\" -mtask \"%s\" -cost 64'd%" PRIu64 "\n", modelp,
                rec.m_name.c_str(), m_counters[rec.m_counterNumber]);
    }

    std::fclose(fp);
}

#endif