#pragma once
#include <ATen/core/Tensor.h>
#include <c10/macros/Export.h>
#include <c10/util/Optional.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_base_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/lower_sync_information.h>
#include <torch/csrc/jit/codegen/cuda/lower_warp_reduce.h>
#include <torch/csrc/jit/codegen/cuda/parallel_dimension_map.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <torch/csrc/jit/codegen/cuda/vectorization_info.h>
#include <array>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
namespace kir {
//! Summary of interesting facts about the kernel
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
struct KernelSummary {
//! Count of WAR (write-after-read) hazard barriers
int war_hazard_syncs_count = 0;
//! List of global buffers
std::vector<const kir::Allocate*> global_allocations;
//! List of dynamic shared memory buffers
std::vector<const kir::Allocate*> dynamic_smem_allocations;
//! List of static shared memory buffers
std::vector<const kir::Allocate*> static_smem_allocations;
//! Largest RNG offset used; a non-negative value indicates the need
//! to generate random numbers (-1 means none are needed)
int max_rng_offsets = -1;
//! Do we have any block reductions?
bool has_block_reductions = false;
//! Do we have any grid reductions?
bool has_grid_reductions = false;
//! Do we have any grid reductions in a loop, or grid reductions dependent
//! on other grid reductions?
bool has_cooperative_grid_reduction = false;
//! Do we have any block broadcasts?
bool has_block_broadcasts = false;
//! Do we have any grid broadcasts?
bool has_grid_broadcasts = false;
//! Do we have any welford op?
bool has_welford = false;
//! Do we have any block welford ops?
bool has_block_welford = false;
//! Do we have any grid welford ops?
bool has_grid_welford = false;
//! Largest shared memory buffer base type
DataType largest_smem_data_type = DataType::Null;
//! Do we have allocations of dynamic local memory?
bool has_dynamic_local_memory_allocations = false;
//! List of dynamic local memory buffers.
//! Only used for debugging.
std::vector<const kir::Allocate*> dynamic_lmem_allocations;
//! Extents of ceilDiv splits that must be validated as divisible at runtime
std::vector<std::pair<const Val*, const Val*>> splits_to_validate;
//! Effective ParallelTypes of broadcast ops
std::unordered_map<const BroadcastOp*, ParallelTypeBitmap>
broadcast_parallel_types;
//! Track which tensor views are inputs or outputs of a vectorized operation
//! and their maximum vectorized access size
std::unordered_map<TensorView*, int> vectorized_accesses;
// Sync map is needed to figure out if global memory buffers need to be marked
// as volatile because they're used for communication.
SyncMap sync_map;
// Parallel dimension map needed to set the correct properties of grid buffers
// (is a dim inactive)
ParallelDimensionMap parallel_dimension_map_;
//! Track information on vectorized set operations for runtime validation
std::vector<VectorizedSetInfo> vectorized_set_info;
};
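// Illustrative sketch (not part of the original header): after lowering,
// codegen and launch logic can consult the summary, e.g.:
//
//   const KernelSummary& s = kernel->summary();
//   if (s.has_cooperative_grid_reduction) {
//     // requires a cooperative launch so all blocks are co-resident
//   }
//   for (auto alloc : s.dynamic_smem_allocations) {
//     // size the dynamic shared memory segment from these buffers
//   }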
class TORCH_CUDA_CU_API KernelPerformanceProfile {
public:
//! Register an expression to profile
void registerExpr(const Expr* expr);
//! Query if an expression is profiled
bool isProfiled(const Expr* expr) const;
//! Get the number of profiled expressions
int getNumberOfProfileEntries() const {
return num_profile_entries_;
}
//! Set the backing buffer of the profile.
void setBuffer(TensorView* buffer) {
buffer_ = buffer;
}
//! Get the backing buffer
TensorView* getBuffer() const {
return buffer_;
}
//! Get the indices of an expression's profile entry in the backing buffer
std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;
std::string toString(const at::Tensor& buffer) const;
private:
//! Get a new profile entry index
int getNewIndex();
//! Get the profile index
c10::optional<int> getIndex(const Expr* expr) const;
private:
int num_profile_entries_ = 0;
//! Backing buffer: an Nx2 integer tensor, where N is the number of
//! profiled regions. Each region has two integer values: one for the
//! cycles spent and one for the count.
TensorView* buffer_ = nullptr;
//! Map profiled expressions to profile entry offsets
std::unordered_map<const Expr*, int> expr_entry_map_;
// TODO: Allow profiling of ForLoops
//! Map profiled ForLoop to profile entry offsets
// std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};
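// Usage sketch (illustrative; assumes a lowering pass drives profiling and
// `expr` is an Expr* being lowered):
//
//   KernelPerformanceProfile profile;
//   profile.registerExpr(expr); // instrument expr with cycle/count counters
//   if (profile.isProfiled(expr)) {
//     // offsets of the (cycles, count) pair in the Nx2 backing buffer
//     std::array<int, 2> indices = profile.getIndicesInProfileBuffer(expr);
//   }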
class KernelInternalProxy;
//! Container for a lowered Kernel IR
//!
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class TORCH_CUDA_CU_API Kernel final : public Fusion {
friend KernelInternalProxy;
public:
// Kernel starts by grabbing all the nodes from the provided fusion.
// Kernel is not SSA: if a definition is not set, we should update it, but
// we should not remove a previous definition if one is set. This is
// primarily because, when we generate something like an initialization
// statement for a reduction TV, we may want to continue to do fusion-like
// analysis on the original expression.
// TODO: Assert index type is int or int32
Kernel(Fusion* fusion, DataType index_type = DataType::Int)
: Fusion(*fusion), index_type_(index_type) {}
Kernel() = delete;
// No move or copy semantics
Kernel(const Kernel&) = delete;
Kernel& operator=(const Kernel&) = delete;
//! Finalize a kernel definition
//!
//! At this point we have a complete kernel definition and we can
//! run analysis passes to build a KernelSummary.
void finalize(std::vector<Expr*> top_level_exprs);
const std::vector<Expr*>& topLevelExprs() const {
return top_level_exprs_;
}
const KernelSummary& summary() const {
return summary_;
}
DataType indexType() const {
return index_type_;
}
//! Checks if parallel type is padded
bool isParallelTypePadded(ParallelType ptype) const {
return ptype == ParallelType::TIDx &&
warp_padded_parallel_info_.is_tidx_padded;
}
const WarpPaddedParallelInfo& getWarpPaddedParallelInfo() const {
return warp_padded_parallel_info_;
}
const KernelPerformanceProfile& profile() const {
return profile_;
}
//! Debug dump of the Kernel IR
void print() const;
protected:
//! Register the Val with this fusion
void registerVal(Val* val) override;
//! Register expr with this fusion.
//! When we register an expression, we want to update the dependency
//! tracking of Vals. We add expr to our general expr_set_.
void registerExpr(Expr* expr) override;
private:
// Analyzes the kernel IR and caches a summary of interesting data
void analyze();
// Top level statements
std::vector<Expr*> top_level_exprs_;
// Summary of interesting kernel data
KernelSummary summary_;
// Whether this kernel is compiled with int32 or int64 indexing. This
// information is required to resolve DataType::Index.
DataType index_type_ = DataType::Int;
WarpPaddedParallelInfo warp_padded_parallel_info_;
KernelPerformanceProfile profile_;
};
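// Typical lifecycle (illustrative sketch; `fusion` and `exprs` are assumed
// to come from the lowering pipeline):
//
//   kir::Kernel kernel(fusion, DataType::Int32);
//   kernel.finalize(std::move(exprs)); // runs analysis, builds the summary
//   const KernelSummary& summary = kernel.summary();
//   kernel.print(); // debug dump of the lowered IR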
//! A special debugging proxy for Kernel.
//!
//! Should not be used for anything other than testing and debugging.
class TORCH_CUDA_CU_API KernelInternalProxy {
public:
KernelInternalProxy(Kernel* kernel) : kernel_(kernel) {}
std::vector<Expr*>& topLevelExprs();
private:
Kernel* kernel_ = nullptr;
};
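// Example (testing/debugging only, per the note above; `kernel` and
// `new_expr` are hypothetical):
//
//   KernelInternalProxy proxy(kernel);
//   proxy.topLevelExprs().push_back(new_expr); // mutate top-level IR directly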
} // namespace kir
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch