File: GenXVectorDecomposer.h

package info (click to toggle)
intel-graphics-compiler 1.0.12504.6-1%2Bdeb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 83,912 kB
  • sloc: cpp: 910,147; lisp: 202,655; ansic: 15,197; python: 4,025; yacc: 2,241; lex: 1,570; pascal: 244; sh: 104; makefile: 25
file content (181 lines) | stat: -rw-r--r-- 6,138 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/*========================== begin_copyright_notice ============================

Copyright (C) 2017-2022 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

//
/// GenXVectorDecomposer
/// --------------------
///
/// GenXVectorDecomposer is not a pass; instead it is a class that is called by
/// the GenXPostLegalization pass to perform vector decomposition.
///
/// For a vector written by wrregion and read by rdregion, it finds the way that
/// the vector can be divided into parts, with each part a range of one or more
/// GRFs, such that no rdregion or wrregion crosses a part boundary. Then it
/// decomposes the vector into those parts. A rdregion/wrregion that reads/writes
/// a whole part can be removed completely; a rdregion/wrregion that reads/writes
/// only some of the part is replaced to read/write just the applicable part.
///
/// In fact it does all this for a web of vectors linked by wrregion, phi nodes
/// and bitcasts.
///
/// The idea is that having lots of small vectors instead of one big vector
/// reduces register fragmentation in the finalizer's register allocator.
///
/// There is an option -limit-genx-vector-decomposer=N to aid debugging the code
/// changes made by the vector decomposer.
///
//===----------------------------------------------------------------------===//

#pragma once

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

#include <map>
#include <set>

// Forward declarations of the llvm-namespace types that the decomposer
// classes in this header refer to through their local `using` aliases.
namespace llvm {
class Constant;
class Instruction;
class PHINode;
class Type;
class Use;

// Declared inside namespace llvm; its definition is not part of this header.
class GenXSubtarget;
} // namespace llvm

namespace vc {
class Region;

// VectorDecomposer : decomposes vectors in a function
//
// Interface visible in this header: candidate wrregions are registered with
// addStartWrRegion(), run() processes the registered wrregions, and clear()
// drops all stored state. All other members are private implementation
// details that are defined out of line.
class VectorDecomposer {
  using Constant = llvm::Constant;
  using DataLayout = llvm::DataLayout;
  using Instruction = llvm::Instruction;
  using PHINode = llvm::PHINode;
  using Type = llvm::Type;
  using Twine = llvm::Twine;
  using Use = llvm::Use;
  using Value = llvm::Value;
  using VectorType = llvm::VectorType;

  using GenXSubtarget = llvm::GenXSubtarget;

  // Data layout in use; null by default. Presumably taken from the ArgDL
  // argument of run() -- confirm in the implementation file.
  const DataLayout *DL = nullptr;
  // Wrregions registered through addStartWrRegion().
  llvm::SmallVector<Instruction *, 16> StartWrRegions;
  // Instruction set that is reset by clear() but not by clearOne(), i.e. it is
  // kept across the processing of individual webs.
  std::set<Instruction *> Seen;
  // Instructions of the web currently being processed (reset by clearOne()).
  llvm::SmallVector<Instruction *, 16> Web;
  // Instruction list that is reset by clear() only. NOTE(review): presumably
  // consumed by removeDeadCode()/eraseInst() -- confirm.
  llvm::SmallVector<Instruction *, 16> ToDelete;
  // "Not decomposing" status. NOTE(review): presumably set through
  // setNotDecomposing(); the associated instruction is kept for reporting.
  bool NotDecomposing = false;
  Instruction *NotDecomposingReportInst = nullptr;
  // Per-web decomposition description (reset by clearOne()). NOTE(review):
  // presumably one entry per part, with Offsets holding the matching part
  // offsets -- confirm units in the implementation file.
  llvm::SmallVector<unsigned, 8> Decomposition;
  llvm::SmallVector<unsigned, 8> Offsets;
  // Per-web map from a phi node to a list of values (reset by clearOne()).
  std::map<PHINode *, llvm::SmallVector<Value *, 8>> PhiParts;
  // Per-web list of instructions (reset by clearOne()); going by the name,
  // these are the instructions created while decomposing.
  llvm::SmallVector<Instruction *, 8> NewInsts;
  // Counter that clear() resets to zero.
  unsigned DecomposedCount = 0;

public:
  // clear : clear anything stored
  void clear() {
    clearOne();
    StartWrRegions.clear();
    Seen.clear();
    ToDelete.clear();
    DecomposedCount = 0;
    // Also drop the "not decomposing" status, so that neither a stale flag nor
    // a pointer to an instruction that may no longer exist survives a full
    // reset. This matches SelectDecomposer::clear(), which resets its flag.
    NotDecomposing = false;
    NotDecomposingReportInst = nullptr;
  }
  // addStartWrRegion : add a wrregion with undef input to the list
  void addStartWrRegion(Instruction *Inst) { StartWrRegions.push_back(Inst); }
  // run : run the vector decomposer on the stored StartWrRegions
  // ArgDL is the data layout to use. The meaning of the bool result is
  // defined by the out-of-line implementation (presumably "code was
  // modified" -- confirm).
  bool run(const DataLayout &ArgDL);

private:
  // clearOne : clear from processing one web
  // Leaves StartWrRegions, Seen, ToDelete and DecomposedCount untouched.
  void clearOne() {
    Web.clear();
    Decomposition.clear();
    Offsets.clear();
    PhiParts.clear();
    NewInsts.clear();
  }

  // Web discovery and decomposition planning (defined out of line).
  bool processStartWrRegion(Instruction *Inst);
  bool determineDecomposition(Instruction *Inst);
  void addToWeb(Value *V, Instruction *User = nullptr);
  void adjustDecomposition(Instruction *Inst);
  void setNotDecomposing(Instruction *Inst, const char *Text);

  // Rewriting of the web into parts (defined out of line). The PartsIn/Parts
  // arguments are lists of values, presumably one per part.
  void decompose();
  void decomposeTree(Use *U, const llvm::SmallVectorImpl<Value *> *PartsIn);
  void decomposePhiIncoming(PHINode *Phi, unsigned OperandNum,
                            const llvm::SmallVectorImpl<Value *> *PartsIn);
  void decomposeRdRegion(Instruction *RdRegion,
                         const llvm::SmallVectorImpl<Value *> *PartsIn);
  void decomposeWrRegion(Instruction *WrRegion,
                         llvm::SmallVectorImpl<Value *> *Parts);
  void decomposeBitCast(Instruction *Inst,
                        llvm::SmallVectorImpl<Value *> *Parts);

  // Queries about a single part, addressed by region or by part index
  // (defined out of line).
  unsigned getPartIndex(Region *R);
  unsigned getPartOffset(unsigned PartIndex);
  unsigned getPartNumBytes(Type *WholeTy, unsigned PartIndex);
  unsigned getPartNumElements(Type *WholeTy, unsigned PartIndex);
  VectorType *getPartType(Type *WholeTy, unsigned PartIndex);
  Constant *getConstantPart(Constant *Whole, unsigned PartIndex);

  // Clean-up helpers (defined out of line).
  void removeDeadCode();
  void eraseInst(Instruction *Inst);

  // Diagnostic helper taking the instruction concerned and the message text
  // (defined out of line).
  void emitWarning(Instruction *Inst, const Twine &Msg);
};

// SelectDecomposer : decomposes the predicate computation sequences feeding
// a select, in order to reduce flag register pressure.
class SelectDecomposer {
  using Value = llvm::Value;
  using Instruction = llvm::Instruction;
  using GenXSubtarget = llvm::GenXSubtarget;

  // Subtarget handed to the constructor.
  const GenXSubtarget *ST;
  // Raised by setNotDecomposing(), lowered again by clear().
  bool NotDecomposing = false;
  // Selects registered through addStartSelect().
  llvm::SmallVector<Instruction *, 8> StartSelects;
  llvm::SmallVector<Instruction *, 16> Web;
  // Indexed by part: element count (Decomposition) and offset (Offsets).
  llvm::SmallVector<unsigned, 8> Decomposition;
  llvm::SmallVector<unsigned, 8> Offsets;
  std::set<Instruction *> Seen;

  // For each decomposed instruction, the values that form its parts.
  llvm::SmallDenseMap<Value *, llvm::SmallVector<Value *, 8>> DMap;

public:
  explicit SelectDecomposer(const GenXSubtarget *ST) : ST{ST} {}

  // addStartSelect : remember a select for a later run()
  void addStartSelect(Instruction *Inst) { StartSelects.push_back(Inst); }

  bool run();

private:
  // clear : reset the working state; StartSelects is left as it is
  void clear() {
    DMap.clear();
    Seen.clear();
    Offsets.clear();
    Decomposition.clear();
    Web.clear();
    NotDecomposing = false;
  }

  bool processStartSelect(Instruction *Inst);
  bool determineDecomposition(Instruction *Inst);

  // setNotDecomposing : raise the NotDecomposing flag
  void setNotDecomposing() { NotDecomposing = true; }

  void addToWeb(Value *V);

  void decompose(Instruction *Inst);
  void decomposeSelect(Instruction *Inst);
  void decomposeBinOp(Instruction *Inst);
  void decomposeCmp(Instruction *Inst);

  // getPartOffset : offset recorded for the given part
  unsigned getPartOffset(unsigned PartIndex) const {
    const unsigned PartOffset = Offsets[PartIndex];
    return PartOffset;
  }

  // getPartNumElements : number of elements recorded for the given part
  unsigned getPartNumElements(unsigned PartIndex) const {
    const unsigned NumElements = Decomposition[PartIndex];
    return NumElements;
  }

  Value *getPart(Value *Whole, unsigned PartIndex, Instruction *Inst) const;
};
} // namespace vc