File: kernel.h

package info (click to toggle)
pytorch 1.13.1+dfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (257 lines) | stat: -rw-r--r-- 7,582 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#pragma once

#include <c10/macros/Export.h>
#include <c10/util/Optional.h>

#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>

#include <array>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {

//! Summary of interesting facts about the kernel
//!
//! Plain aggregate populated during kernel analysis (see
//! Kernel::analyze()) and consumed by code generation / the runtime to
//! decide which resources (barriers, grid buffers, RNG state, ...) the
//! generated kernel needs.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
  //! Count of WAR (write-after-read) hazard barriers
  int war_hazard_syncs_count = 0;

  //! List of global buffers
  std::vector<const kir::Allocate*> global_allocations;

  //! List of dynamic shared memory buffers
  std::vector<const kir::Allocate*> dynamic_smem_allocations;

  //! List of static shared memory buffers
  std::vector<const kir::Allocate*> static_smem_allocations;

  //! Indicates the need to generate random numbers; -1 (the default)
  //! means no random number generation is required.
  int max_rng_offsets = -1;

  //! Do we have any block reductions?
  bool has_block_reductions = false;

  //! Do we have any grid reductions?
  bool has_grid_reductions = false;

  //! Do we have any grid reduction in a loop, or grid reductions dependent on
  //! grid reductions
  bool has_cooperative_grid_reduction = false;

  //! Do we have any block broadcasts?
  bool has_block_broadcasts = false;

  //! Do we have any grid broadcasts?
  bool has_grid_broadcasts = false;

  //! Do we have any welford op?
  bool has_welford = false;

  //! Do we have any block-level welford op?
  bool has_block_welford = false;

  //! Do we have any grid-level welford op?
  bool has_grid_welford = false;

  //! Largest shared memory buffer base type
  DataType largest_smem_data_type = DataType::Null;

  //! Do we have allocations of dynamic local memory?
  bool has_dynamic_local_memory_allocations = false;

  //! List of dynamic local memory buffers.
  //! Only used for debugging.
  std::vector<const kir::Allocate*> dynamic_lmem_allocations;

  //! ceilDiv extents that must be divisible
  std::vector<std::pair<const Val*, const Val*>> splits_to_validate;

  //! Effective ParallelTypes of broadcast ops
  std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
      broadcast_parallel_types;

  //! Track which tensor views are inputs or outputs of a vectorized operation
  //! and their maximum vectorized access size
  std::unordered_map<TensorView*, int> vectorized_accesses;

  // Sync map is needed to figure out if global memory buffers need to be marked
  // as volatile because they're used for communication.
  SyncMap sync_map;

  // Parallel dimension map needed to set the correct properties of grid buffers
  // (is a dim inactive)
  // NOTE(review): trailing underscore is unusual for a public struct member;
  // kept for source compatibility.
  ParallelDimensionMap parallel_dimension_map_;

  //! Track information on vectorized set operations for runtime validation
  std::vector<VectorizedSetInfo> vectorized_set_info;
};

//! Bookkeeping for per-expression kernel profiling.
//!
//! Maps each registered Expr to a row of a backing Nx2 integer tensor
//! (cycles, count) that the generated kernel writes into at runtime.
class TORCH_CUDA_CU_API KernelPerformanceProfile {
 public:
  //! Register an expression to profile
  void registerExpr(const Expr* expr);

  //! Query if an expression is profiled
  bool isProfiled(const Expr* expr) const;

  //! Get the number of profiled expressions
  int getNumberOfProfileEntries() const {
    return num_profile_entries_;
  }

  //! Set the backing buffer of profile.
  //! Does not take ownership; the TensorView is managed elsewhere.
  void setBuffer(TensorView* buffer) {
    buffer_ = buffer;
  }

  //! Get the backing buffer (may be nullptr if never set)
  TensorView* getBuffer() const {
    return buffer_;
  }

  //! Get the indices of the profile of an expression in the backing buffer
  std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;

  //! Render a human-readable report from a materialized profile buffer
  std::string toString(const at::Tensor& buffer) const;

 private:
  //! Get the new profile index
  int getNewIndex();

  //! Get the profile index, or nullopt if expr was never registered
  c10::optional<int> getIndex(const Expr* expr) const;

 private:
  //! Number of expressions registered so far; also the next free row index
  int num_profile_entries_ = 0;

  //! Backing buffer of Nx2 integer tensor, where N is the number of profiled
  //! regions. Each region has two integer values, one representing
  //! the cycles spent, and another the count.
  TensorView* buffer_ = nullptr;

  //! Map profiled expressions to profile entry offsets
  std::unordered_map<const Expr*, int> expr_entry_map_;

  // TODO: Allow profiling of ForLoops
  //! Map profiled ForLoop to profile entry offsets
  // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};

class KernelInternalProxy;

//! Container for a lowered Kernel IR
//!
//! Holds the top-level expressions of a fully lowered kernel together
//! with analysis results (KernelSummary), indexing configuration, and
//! optional performance-profiling state.
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
  friend KernelInternalProxy;

 public:
  // Kernel starts by grabbing all the nodes from the provided fusion.
  // Kernel is not SSA, if a definition is not set, we should update it, but
  // not remove previous definition if it is set. This is primarily because when
  // we do something like generate an initialization statement for a reduction
  // TV, we may want to continue to do fusion like analysis on the original
  // expression.
  // TODO: Assert index type is int or int32
  Kernel(Fusion* fusion, DataType index_type = DataType::Int)
      : Fusion(*fusion), index_type_(index_type) {}

  Kernel() = delete;

  // No move or copy semantics (deleting the copy operations also
  // suppresses the implicitly-declared move operations)
  Kernel(const Kernel&) = delete;
  Kernel& operator=(const Kernel&) = delete;

  //! Finalize a kernel definition
  //!
  //! At this point we have a complete kernel definition and we can
  //! run analysis passes to build a KernelSummary.
  void finalize(std::vector<Expr*> top_level_exprs);

  //! Top-level expressions as set by finalize()
  const std::vector<Expr*>& topLevelExprs() const {
    return top_level_exprs_;
  }

  //! Analysis results; meaningful only after finalize() has run
  const KernelSummary& summary() const {
    return summary_;
  }

  //! Index data type (int32 vs int64) this kernel was lowered with
  DataType indexType() const {
    return index_type_;
  }

  //! Checks if parallel type is padded
  //! Only TIDx is ever padded, and only when warp padding was requested.
  bool isParallelTypePadded(ParallelType ptype) const {
    return ptype == ParallelType::TIDx &&
        warp_padded_parallel_info_.is_tidx_padded;
  }

  const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
    return warp_padded_parallel_info_;
  }

  //! Performance-profile bookkeeping for this kernel
  const KernelPerformanceProfile& profile() const {
    return profile_;
  }

  //! Debug dump of the Kernel IR
  void print() const;

 protected:
  //! Register the Val with this fusion
  void registerVal(Val* val) override;

  //! Register expr with this fusion.
  //! When we register an expression, we want to update the dependency tracking
  //! of Vals. We add expr to our general expr_set_,
  void registerExpr(Expr* expr) override;

 private:
  // Analyze the kernel IR and caches the summary of interesting data
  void analyze();

  // Top level statements
  std::vector<Expr*> top_level_exprs_;

  // Summary of interesting kernel data
  KernelSummary summary_;

  // Is this kernel being compiled with int32 or int64 indexing. This
  // information is required to resolve DataType::Index
  DataType index_type_ = DataType::Int;

  WarpPaddedParallelInfo warp_padded_parallel_info_;

  KernelPerformanceProfile profile_;
};

//! A special debugging proxy for Kernel.
//!
//! Should not be used for other than testing and debugging.
//! Grants mutable access to Kernel internals via the friend declaration
//! in Kernel; does not own the wrapped Kernel.
class TORCH_CUDA_CU_API KernelInternalProxy {
 public:
  // NOTE(review): single-arg ctor is intentionally implicit-convertible?
  // Consider `explicit` if no caller relies on Kernel* -> proxy conversion.
  KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}

  //! Mutable view of the proxied kernel's top-level expressions
  std::vector<Expr*>& topLevelExprs();

 private:
  Kernel* kernel_ = nullptr;
};

} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch