File: JitterDataStruct.h

package info (click to toggle)
intel-graphics-compiler2 2.24.13-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 113,504 kB
  • sloc: cpp: 812,849; lisp: 288,219; ansic: 102,423; python: 4,010; yacc: 2,588; lex: 1,666; pascal: 318; sh: 162; makefile: 38
file content (228 lines) | stat: -rw-r--r-- 7,937 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef JITTERDATASTRUCT_
#define JITTERDATASTRUCT_

#include <bitset>
#include <optional>
#include <stdint.h>
#include <vector>

// clang-format off
#include "common/LLVMWarningsPush.hpp"
#include "llvm/Support/JSON.h"
#include "common/LLVMWarningsPop.hpp"
// clang-format on

namespace vISA {

// Per-basic-block record. FINALIZER_INFO::BBInfo is a pointer to this type.
// None of the members has a default initializer, so a default-initialized
// instance holds indeterminate values until every field has been assigned.
// NOTE(review): the field meanings below are inferred from the names only --
// confirm against the code that fills this struct.
struct VISA_BB_INFO {
  // Presumably the identifier of the basic block.
  int id;
  // Presumably the estimated static cycle count of the block.
  unsigned staticCycle;
  // Presumably the estimated send stall cycles of the block.
  unsigned sendStallCycle;
  // Presumably the loop nesting depth of the block.
  unsigned char loopNestLevel;
};

// PERF_STATS - the core set of vISA static performance statistics.
// Besides feeding the stats report, these values may serve other purposes,
// for example spill cost estimation done by IGC.
// Every member defaults to zero.
struct PERF_STATS {
  // Hash of the binary; consumed by the stats report.
  uint64_t binaryHash = 0;

  // How many GRFs are actually in use. Collected for stats only.
  uint32_t numGRFUsed = 0;

  // Configured GRF count and thread count. IGC uses them to set up the
  // execution environment in its output.
  uint32_t numGRFTotal = 0;
  uint32_t numThreads = 0;

  // Asm instruction count without any weighting. IGC uses it when computing
  // spill cost.
  uint32_t numAsmCountUnweighted = 0;

  // Flag spill (store) and fill (load) counts. Used by VC Stats.
  uint32_t numFlagSpillStore = 0;
  uint32_t numFlagSpillLoad = 0;

  // Spill/fill count weighted by loop. IGC uses it when computing spill cost.
  uint32_t numGRFSpillFillWeighted = 0;

  // Per-kernel limit of the scratch space size.
  uint32_t scratchSpaceSizeLimit = 0;

  // Scratch size, in bytes, of the entire vISA stack of this function/kernel.
  // It covers both the spill size and the caller/callee save size.
  // For kernel/entry functions the value is the sum over the potential
  // callees within the same vISABuilder; recursive and indirect calls are not
  // taken into account.
  uint32_t spillMemUsed = 0;

  // Cycle count estimated by the scheduler, unweighted.
  uint32_t numCycles = 0;

  // Maximum register pressure before RA; used for GRF selection.
  uint32_t maxGRFPressurePreRA = 0;
  // Final maximum register pressure.
  uint32_t maxGRFPressure = 0;

  // The four fields below are currently used by IGC.
  // sendStallCycle and staticCycle are unweighted, i.e. a plain sum of the
  // estimated cycles of each basic block. loopNestedStallCycle and
  // loopNestedCycle are weighted by loop (16 iterations per loop).
  // These stats are only valid when post-RA scheduling is enabled.
  uint32_t sendStallCycle = 0;
  uint32_t staticCycle = 0;
  uint32_t loopNestedStallCycle = 0;
  uint32_t loopNestedCycle = 0;
};

// PERF_STATS_VERBOSE - the verbose vISA static performance stats.
// This set of stats is used/set only when the verbose stats are
// queried (vISA_DumpPerfStatsVerbose).
// Every member has a default initializer of zero, so a default-constructed
// instance never exposes indeterminate values.
// TODO: This set will be disabled completely in the Release build.
struct PERF_STATS_VERBOSE {
  // Number of GRF bank conflicts.
  uint32_t BCNum = 0;

  // Number of byte ALU read-modify-write instructions.
  uint32_t numByteRMWs = 0;

  // Number of ALU instructions.
  uint32_t numALUInst = 0;

  // Number of accumulator Def/Use operands generated by the accumulator
  // substitution pass. This does not include instructions that must use the
  // accumulator (e.g., addc, mach).
  uint32_t accSubDef = 0;
  uint32_t accSubUse = 0;

  // Number of (local) operands that may be replaced by accumulators. This
  // should be equal to accSubDef/accSubUse if we have an infinite number of ACC
  // registers.
  uint32_t accSubCandidateDef = 0;
  uint32_t accSubCandidateUse = 0;

  // Number of explicit SWSB sync instructions. This includes both
  // distance-based and token-based syncs.
  uint32_t syncInstCount = 0;
  // Number of times SWSB pass has to reuse a token before it is cleared by
  // instructions depending on it. A high value indicates that the kernel may
  // have high SWSB token pressure (i.e., too many active long-latency
  // instructions).
  uint32_t tokenReuseCount = 0;
  // Number of @1 SWSB operations (i.e., a stall on a single ALU pipeline).
  // It can be L@1, I@1, F@1 or @1 of TGL.
  uint32_t singlePipeAtOneDistNum = 0;
  // Number of A@1 SWSB operations (i.e., stall on all ALU pipes)
  uint32_t allAtOneDistNum = 0;
  // Number of $x.dst SWSB operations (i.e., waiting for a send/dpas to return
  // data in destination registers)
  uint32_t AfterWriteTokenDepCount = 0;
  // Number of $x.src SWSB operations (i.e., waiting for a send to leave the EU
  // so that we could overwrite its payload registers)
  uint32_t AfterReadTokenDepCount = 0;

  // Fields below are meaningful only when Graph-Coloring RA is executed.
  // Number of iterations taken by GCRA. Any variables that do not
  // get an assignment will be spilled and trigger another round of RA.
  uint32_t RAIterNum = 0;
  // Number of GRF declares.
  // FIXME: This should be reported for all RA types.
  uint32_t varNum = 0;
  // Number of global (i.e., referenced in more than one basic block) register
  // variables.
  // FIXME: This should be reported for all RA types.
  uint32_t globalVarNum = 0;
  // Max register pressure.
  // FIXME: This should be reported for all RA types.
  uint32_t maxRP = 0;

  // Max number of neighbors in the interference graph for GCRA.
  uint32_t maxNeighbors = 0;
  // Average number of neighbors in the interference graph for GCRA.
  float avgNeighbors = 0;
  // Number of SIMT interference edges.
  uint32_t normIntfNum = 0;
  // Number of SIMD interference edges.
  uint32_t augIntfNum = 0;

  // preRA scheduler counters.
  // Zero-initialized like every other counter in this struct; without the
  // initializers a default-constructed instance would leave these three
  // members indeterminate.
  uint32_t minRegClusterCount = 0;
  uint32_t minRegSUCount = 0;
  uint32_t minRegRestCount = 0;
};

// Send information, kept as three separate vectors of lengths.
// NOTE(review): presumably entry i of each vector describes the same send
// instruction -- confirm against the code that fills them.
struct PERF_SENDINFO {
  // src0 length
  std::vector<uint32_t> src0Vec;
  // src1 length
  std::vector<uint32_t> src1Vec;
  // dst length
  std::vector<uint32_t> destVec;
};

// FINALIZER_INFO - bundles the fields required by IGC/VC/Runtime together
// with the vISA stats structures (PERF_STATS, PERF_STATS_VERBOSE,
// PERF_SENDINFO). Every scalar and pointer member declared directly in this
// struct has a zero/false/nullptr default.
struct FINALIZER_INFO {
  // ----- Required by IGC/VC/Runtime ----- //
  // isSpill is deprecated. The field is retained for backward compatibility
  // only. Do not use it in any case.
  bool DO_NOT_USE_isSpill = false;

  // Debug info is callee allocated and populated only if the switch to emit
  // debug info is passed to JIT.
  void *genDebugInfo = nullptr;
  uint32_t genDebugInfoSize = 0;

  // Propagate information about barriers presence back to IGC. It's safer to
  // depend on vISA statistics as IGC is not able to detect barriers if they
  // are used as a part of Inline vISA code.
  // This information is used by legacy CMRT as well as OpenCL/L0 runtime.
  uint32_t numBarriers = 0;

  // Number of basic blocks in the kernel, used by IGC for stat reporting.
  uint32_t BBNum = 0;
  // TODO: this is no longer used; can it be removed without breaking stuff?
  VISA_BB_INFO *BBInfo = nullptr;

  // Whether kernel recompilation should be avoided. vISA hint for IGC.
  bool avoidRetry = false;

  // GTPin information
  void *freeGRFInfo = nullptr;
  uint32_t freeGRFInfoSize = 0;
  uint32_t numBytesScratchGtpin = 0;

  // Used by compiler output (zebin) for setting up "implicit_arg_buffer".
  // When set to true, runtime will allocate space for implicit_arg_buffer.
  bool hasStackcalls = false;

  // Offsets of the load-thread-payload prologs, required by the runtime
  // for skipping the prologs.
  uint32_t offsetToSkipPerThreadDataLoad = 0;
  uint32_t offsetToSkipCrossThreadDataLoad = 0;

  // When the two-entry prolog is added for setting FFID
  // for compute (GP or GP1), skip this offset to set FFID_GP1.
  // If the offset is not skipped, FFID_GP is set.
  uint32_t offsetToSkipSetFFIDGP = 0;
  uint32_t offsetToSkipSetFFIDGP1 = 0;

  // ----- vISA Stats ----- //
  PERF_STATS stats;
  PERF_STATS_VERBOSE statsVerbose;
  PERF_SENDINFO sendInfo;
};

// toJSON overloads for the two stats structs. Each takes the struct by const
// reference and returns an llvm::json::Value. They are only declared here;
// this header does not define them.
llvm::json::Value toJSON(const PERF_STATS &p);
llvm::json::Value toJSON(const PERF_STATS_VERBOSE &p);
} // namespace vISA
#endif // JITTERDATASTRUCT_