File: KernelCost.hpp

package info (click to toggle)
intel-graphics-compiler2 2.24.13-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 113,504 kB
  • sloc: cpp: 812,849; lisp: 288,219; ansic: 102,423; python: 4,010; yacc: 2,588; lex: 1,666; pascal: 318; sh: 162; makefile: 38
file content (189 lines) | stat: -rw-r--r-- 5,618 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#ifndef __KERNELCOSTANALYSIS_HPP__
#define __KERNELCOSTANALYSIS_HPP__

#include "G4_BB.hpp"
#include "KernelCostInfo.h"
#include "JitterDataStruct.h"
#include "LoopAnalysis.h"

#include <vector>
#include <algorithm>
#include <unordered_map>

namespace vISA {

class FlowGraph;
class G4_InstSend;

// Use fixed-point as probality type (better than float).
typedef uint32_t ProbType;

// Probability 1 is represented as 0x40000000 (bit 30 is 1).
//   Not using UINT_MAX as probability 1 is to make saturation of adding two
//   prob values easier (need satuation on the sum of prob values as prob
//   values are estimate and their sum could be more than 1).
// For example, prob value=10000 means probability is 10000/0x40000000.
constexpr uint32_t MAX_PROB_POINTS = (1u << 30);

struct BBCostInfo {
  uint32_t m_cycles; // total cycles taken by BB (taken from BB scheduler)
  ProbType m_prob;   // prob with which BB runs.
};

// Report of costs for code segments, such as loop/loop body/func/BB, etc.
// Cost value could be in symbolic form (based on kernel args).
class CostMetricsWrapper {
public:
  CostMetricsWrapper() : CM{0, 0, 0} {}

  uint32_t getCycles() const { return CM.cycles; }
  void setCycles(uint32_t v) { CM.cycles = v; }
  uint32_t getLoadBytes() const { return CM.loadBytes; }
  void setLoadBytes(uint32_t v) { CM.loadBytes = v; }
  uint32_t getStoreBytes() const { return CM.storeBytes; }
  void setStoreBytes(uint32_t v) { CM.storeBytes = v; }

  void add(CostMetricsWrapper& aCM, ProbType P = MAX_PROB_POINTS) {
    if (P != MAX_PROB_POINTS) {
      float factor = P / (float)MAX_PROB_POINTS;
      CM.cycles += (uint32_t)(aCM.getCycles() * factor);
      CM.loadBytes += (uint32_t)(aCM.getLoadBytes() * factor);
      CM.storeBytes += (uint32_t)(aCM.getStoreBytes() * factor);
    } else {
      CM.cycles += aCM.getCycles();
      CM.loadBytes += aCM.getLoadBytes();
      CM.storeBytes += aCM.getStoreBytes();
    }
  }

  void mul(uint32_t M) {
    CM.cycles *= M;
    CM.loadBytes *= M;
    CM.storeBytes *= M;
  }

  const CostMetrics &getCostMetrics() const { return CM; }

private:
  CostMetrics CM;
};

struct LoopCost;

// CostExprWrapper
//   cost for a kernel or a loop. It includes all its immediate
//   child loops (non-immediate nested loops are included in the
//   cost of its immediate child loops).
struct CostExprInternal {
  CostMetricsWrapper C;
  // One entry for each of all immediate child loops.
  std::vector<LoopCost *> LoopCosts;
};

// LoopCost: cost for a single loop
struct LoopCost {
  // loop id within a function (kernel, subroutine), starts from 0 for
  // each function and is in the increasing program order
  int m_loopId;
  // For matching loops b/w visa and igc
  int m_backedge_visaId;

  CostExprInternal m_loopBodyCost;
  // estimate cost (assuming LCE = 16)
  CostMetricsWrapper m_estimateCost;
};

struct FuncCost {
  CostExprInternal m_funcCost;
  CostMetricsWrapper m_estimateCost;

  // BB range for this func:  [m_startBB, m_endBB)
  FuncInfo *m_funcInfo;
  // All loops in this function, in program order.
  // m_loops[0] is the 1st loop and m_loops.back() is the last loop.
  std::vector<const Loop *> m_allLoopsInProgramOrder;
};

class KernelCost {
public:
  KernelCost(G4_Kernel *pK, std::vector<VISA_BB_INFO> &BBInfo);

  void run();

  FuncCost& getKernelCost() {
    if (m_metrics.empty()) {
      // sanity: create an empty FuncCost.
      m_metrics.push_back(FuncCost());
    }
    return m_metrics.back();
  }
  LoopCost& getLoopCost(const Loop *L) { return m_loopCosts[L]; }

private:
  G4_Kernel* m_kernel;
  LoopDetection& m_loops;
  std::unordered_map<G4_BB *, BBCostInfo> m_BBCostInfo;

  // Temporaries
  std::unordered_map<G4_BB *, int> visited;
  // Temporary : Reverse Post-Order traveral
  std::vector<G4_BB *> RPOT;

  // m_metrics.
  //   In reverse calling order. leaf subroutine appears first, kernel last.
  //   m_funcIndex[] is for mapping call site to its FuncCost.
  std::unordered_map<FuncInfo *, int> m_funcIndex;
  std::vector<FuncCost> m_metrics;

  // Metrics for all loops
  std::unordered_map<const Loop *, LoopCost> m_loopCosts;

  void updateBBProb(G4_BB *BB, ProbType P) {
    vISA_ASSERT(m_BBCostInfo.count(BB), "updateBBPro(): prob not set yet");
    BBCostInfo &BCI = m_BBCostInfo[BB];
    BCI.m_prob = std::min(MAX_PROB_POINTS, BCI.m_prob +  P);
  };

  ProbType getBBProb(G4_BB* BB) {
    vISA_ASSERT(m_BBCostInfo.count(BB), "getBBProb(): prob not set yet");
    BBCostInfo &BCI = m_BBCostInfo[BB];
    return BCI.m_prob;
  }

  void DFS_PO(G4_BB *BB);
  void doRPOT(G4_BB *EntryBB);

  // set up before calculating prob and collect metrics
  void init();

  void calculateProb();
  void propagateLoopProb(Loop *L, int RPOT_pos);
  void propagateBBProb(G4_BB* BB, Loop* L = nullptr);

  void getSuccEdgeProb(G4_BB *BB, std::vector<ProbType>& SuccEdgeProb);
  G4_INST* getBranchFlagLocalDef(G4_BB *BB, bool& DefByDst);

  void collectPerfMetrics();
  void calculateBBMetrics(CostMetricsWrapper &CM, G4_BB* BB);
  void collectLoopMetrics(Loop* L);
  void collectSendMetrics(G4_InstSend *SendI, uint32_t &ldBytes, uint32_t &stBytes);

  // helpers
  void print(std::ostream &OS);
  void printForLit(std::ostream &OS);
  void dump();
  void dump() const;
};

void collectKernelCostInfo(G4_Kernel* pK, std::vector<VISA_BB_INFO> &BBInfo);

} // namespace vISA
#endif