#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/manager.h>
#include <torch/csrc/jit/codegen/cuda/parser.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/type_inference.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/canonicalize.h>
#include <torch/csrc/jit/passes/cuda_graph_fuser.h>
#include <torch/csrc/jit/passes/shape_analysis.h>
#include <torch/csrc/jit/passes/symbolic_shape_analysis.h>
#include <torch/csrc/jit/runtime/graph_executor.h>
#include <torch/csrc/jit/runtime/interpreter.h>
#include <ATen/DimVector.h>
#include <c10/core/DeviceType.h>
#include <c10/util/irange.h>
#include <unordered_map>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! [ Note -- cache entry indexing ]
//!
//! CudaFusionManager holds the cache and handles interfacing with
//! CudaFusionGroup nodes, including selection, construction and execution of
//! FusionExecutors.
//!
//! CudaFusionManager bridges the PyTorch IR node CudaFusionGroup to GraphCache,
//! so we want to cache on the stringified graph. Because it is expensive to
//! stringify and hash a computational graph on every execution, we cache the
//! assigned id on the node via `attr::cache_id`.
//!
//! CudaFusionGroup node stores:
//! i. a PyTorch IR in `attr::Subgraph`
//! ii. an int in `attr::cache_id` (the id of the cache entry assigned
//! to `attr::Subgraph`)
//!
//! CudaFusionManager holds two unordered_maps:
//! std::unordered_map<std::string, int32_t> graph_cache_ids_;
//! std::unordered_map<int64_t, std::unique_ptr<GraphCache>> graph_cache_;
//!
//! The mapping from stringified graph to cache_id ensures that CudaFusionGroup
//! nodes with identical computational graphs are assigned the same cache_id,
//! allowing kernel reuse; the direct mapping from cache_id to GraphCache allows
//! efficient graph_cache indexing.
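//!
//! As a rough illustration of the indexing flow (a sketch only, not code in
//! this file; `registerNewEntry` is a hypothetical helper):
//!
//!     auto repr = Canonicalize(subgraph, false)->toString(false);
//!     int32_t cache_id = graph_cache_ids_.count(repr)
//!         ? graph_cache_ids_[repr]   // identical graph seen before: reuse id
//!         : registerNewEntry(repr);  // new id + new GraphCache entry
//!     fusion_node->i_(attr::cache_id, cache_id);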
namespace {
// TODO remove this (75983):
// We don't need this any more; I think we can use revertAliasCopyOps instead.
// A similar refactor should be done in the fallback graph used by the fusion
// guard, and the interpreter implementations of the xxxx_copy ops should then
// be removed.
//
// Mark alias-copy nodes with a string attribute so that their implementation
// is enabled in the fallback path.
void enableAliasCopyNodes(const std::shared_ptr<Graph>& graph, Block* block) {
static std::unordered_set<Symbol> alias_copy_op(
{prim::view_copy,
prim::reshape_copy,
prim::expand_copy,
prim::expand_as_copy,
prim::squeeze_copy,
prim::unsqueeze_copy});
for (Node* n : block->nodes()) {
for (Block* b : n->blocks()) {
enableAliasCopyNodes(graph, b);
}
if (alias_copy_op.find(n->kind()) != alias_copy_op.end()) {
n->s_(attr::name, "CudaFusionGroup");
}
}
}
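// Build the interpreter `Code` used on the fallback path: copy the fusion
// subgraph, erase its shape information, and re-enable the alias-copy ops so
// the interpreter can execute them.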
static std::unique_ptr<Code> createFallbackCode(const Node* fusion_node) {
auto copied_graph = fusion_node->g(attr::Subgraph)->copy();
EraseShapeInformation(copied_graph);
enableAliasCopyNodes(copied_graph, copied_graph->block());
auto code = std::make_unique<Code>(copied_graph, "fallback_cuda_fuser");
return code;
}
// CudaFusionManager is not thread safe!
// TODO: we should weigh the tradeoff of using a thread_local here instead of
// a global singleton;
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
class CudaFusionManager {
public:
static CudaFusionManager& getManager() {
static CudaFusionManager cuda_fusion_manager_;
return cuda_fusion_manager_;
};
// TODO: this assumes stride information is present in `graph->toString()`.
// We need to make sure stride information ends up in the final string, as we
// want to AVOID kernel reuse between different fusion nodes unless they have
// identical contiguity information. (Identical stride + shape is even more
// restrictive, in a good way.)
int32_t registerOrGetCacheId(std::shared_ptr<Graph>& graph) {
// prepare graph for lowering;
// We should not call `EraseShapeInformation(graph)`: the graph representation
// does not encode static sizes, only the rank of input tensors, which is
// exactly what we want.
auto canonical_graph = Canonicalize(graph, false);
auto repr = canonical_graph->toString(false);
std::lock_guard<std::mutex> guard(mutex_);
// create a new graph_cache_ids_ entry if none exists yet;
if (graph_cache_ids_.count(repr) == 0) {
int32_t kernel_id = getNextUniqueID();
graph_cache_ids_[repr] = kernel_id;
TORCH_CHECK(
graph_cache_.emplace(kernel_id, std::make_unique<GraphCache>(graph))
.second);
}
return graph_cache_ids_[repr];
};
// get fallback kernel id
int32_t getFallbackKernelId() {
std::lock_guard<std::mutex> guard(mutex_);
return getNextUniqueID();
}
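// Remove the cache entries associated with `graph`, e.g. when compilation of
// a freshly registered graph fails and its id should not be reused.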
void unregisterCacheId(std::shared_ptr<Graph>& graph) {
auto canonical_graph = Canonicalize(graph, false);
auto repr = canonical_graph->toString(false);
std::lock_guard<std::mutex> guard(mutex_);
// remove the graph_cache_ids_ / graph_cache_ entries if they exist;
if (graph_cache_ids_.count(repr) > 0) {
int32_t kernel_id = graph_cache_ids_[repr];
graph_cache_.erase(kernel_id);
graph_cache_ids_.erase(repr);
}
}
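// Execute the GraphCache entry registered under `kernel_id` on `inputs`;
// the entry must have been created earlier via registerOrGetCacheId.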
std::vector<at::Tensor> runFusionNode(
int32_t kernel_id,
const at::ArrayRef<IValue> inputs) {
std::lock_guard<std::mutex> guard(mutex_);
TORCH_INTERNAL_ASSERT(
graph_cache_.count(kernel_id) > 0, "graph cache miss at run time");
return graph_cache_[kernel_id]->runGraphWithInputs(inputs);
}
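// Whether fallback interpreter code has already been built for `kernel_id`
// (typically because a previous execution of this fusion failed).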
bool hasFallbackCode(int32_t kernel_id) {
std::lock_guard<std::mutex> guard(mutex_);
return fallback_cache_.count(kernel_id);
}
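// Return the cached fallback code for `kernel_id`, building it on first use.
// The code is constructed outside the lock; if another thread inserts first,
// the existing entry is returned and the freshly built copy is discarded.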
Code* getFallbackCode(int32_t kernel_id, const Node* fusion_node) {
{
std::lock_guard<std::mutex> guard(mutex_);
auto it = fallback_cache_.find(kernel_id);
if (it != fallback_cache_.end()) {
return it->second.get();
}
}
std::unique_ptr<Code> code = createFallbackCode(fusion_node);
std::lock_guard<std::mutex> guard(mutex_);
auto it = fallback_cache_.insert({kernel_id, std::move(code)}).first;
return it->second.get();
}
private:
// TODO: Dimension collapsing should be abstracted out and integrated into
// graph caching.
// Dimension collapsing is only applicable to the profiling executor at the
// moment.
bool graphHasReduction(const std::shared_ptr<Graph>& graph) {
for (const auto& n : graph->nodes()) {
if (isReductionNode(n)) {
return true;
}
}
return false;
}
private:
std::mutex mutex_;
void runCudaKernel(
int32_t key,
const std::vector<int>& contiguity_tag,
const c10::Device){};
int32_t getNextUniqueID() {
return next_unique_id_++;
};
std::unordered_map<std::string, int32_t> graph_cache_ids_;
std::unordered_map<int64_t, std::unique_ptr<GraphCache>> graph_cache_;
std::unordered_map<int64_t, std::unique_ptr<Code>> fallback_cache_;
int32_t next_unique_id_ = 0;
};
} // namespace
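// Compile a prim::CudaFusionGroup node: run shape and type propagation on a
// copy of its subgraph, register the graph with CudaFusionManager, and record
// the resulting cache_id on the node. If compilation throws and the fallback
// path is enabled, the registration is undone and a fallback kernel id is
// assigned instead.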
void compileCudaFusionGroup(Node* fusion_node) {
FUSER_PERF_SCOPE("nvFuser::Manager::compileCudaFusionGroup");
TORCH_CHECK(
fusion_node->kind() == prim::CudaFusionGroup,
"Only prim::CudaFusionGroup can be compiled");
if (fusion_node->hasAttribute(attr::cache_id)) {
TORCH_WARN("Double registration of CudaFusionGroup on CudaFusionManager");
}
// This is not a critical code path, it's OK to do graph copy here;
auto graph = fusion_node->g(attr::Subgraph)->copy();
auto compile_fusion = [&]() {
// Type propagation is needed, as the protocol only requires scalar types on
// input tensors.
// Note that even with the profiling executor, scalar types could still be
// missing, especially for the output tensors of a given node (since a
// profiling node only inserts meta information after itself).
PropagateShapesOnGraph(graph);
TypePropagate(graph);
int32_t fusion_cache_id =
CudaFusionManager::getManager().registerOrGetCacheId(graph);
fusion_node->i_(attr::cache_id, fusion_cache_id);
};
if (useFallback()) {
try {
compile_fusion();
} catch (...) {
TORCH_WARN(
"FALLBACK path has been taken inside: ",
__FUNCTION__,
". This is an indication that codegen failed for some reason.\n"
"To debug, try disabling the codegen fallback path by setting the env"
" variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n"
"To report the issue, try enabling logging by setting the env"
" variable `export PYTORCH_JIT_LOG_LEVEL=manager.cpp`\n");
GRAPH_DUMP("`compile_fusion` hits fallback on graph\n", graph);
CudaFusionManager::getManager().unregisterCacheId(graph);
}
} else {
compile_fusion();
}
// Assign a cache_id (if compilation did not already) to facilitate graph
// execution and fallback
if (!fusion_node->hasAttribute(attr::cache_id)) {
int32_t fusion_cache_id =
CudaFusionManager::getManager().getFallbackKernelId();
fusion_node->i_(attr::cache_id, fusion_cache_id);
}
}
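// Execute a compiled prim::CudaFusionGroup node on `stack`: run the fused
// kernel when possible, fall back to the interpreter code when the fusion
// fails (or has failed before), and optionally compare fused vs. fallback
// outputs via the registered comparison callback.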
void runCudaFusionGroup(const Node* fusion_node, Stack& stack) {
FUSER_PERF_SCOPE("nvFuser::Manager::runCudaFusionGroup");
TORCH_CHECK(
fusion_node->hasAttribute(attr::cache_id),
"node prim::CudaFusionGroup has not been compiled yet");
// Fallback to use if anything goes wrong
auto take_fallback = [&](Stack& stack) {
int32_t kernel_id = fusion_node->i(attr::cache_id);
Code* fallback_code =
CudaFusionManager::getManager().getFallbackCode(kernel_id, fusion_node);
InterpreterState{*fallback_code}.run(stack);
};
c10::optional<Stack> stack_copy;
auto compare_callback = getCudaFuserComparisonCallback();
if (compare_callback.run_fallback) {
// make a copy of the stack
int64_t inputs_size =
static_cast<int64_t>(fusion_node->g(attr::Subgraph)->inputs().size());
TORCH_INTERNAL_ASSERT(static_cast<int64_t>(stack.size()) >= inputs_size);
stack_copy = Stack();
stack_copy->insert(
stack_copy->end(), stack.begin(), stack.end() - inputs_size);
// deepcopy the last (inputs_size) stack items
std::transform(
stack.end() - inputs_size,
stack.end(),
std::back_inserter(*stack_copy),
[](const c10::IValue& ivalue) { return ivalue.deepcopy(); });
}
auto run_fusion = [&]() {
TORCH_CHECK(
fusion_node->kind() == prim::CudaFusionGroup,
"prim::CudaFusionGroup expected");
int32_t kernel_id = fusion_node->i(attr::cache_id);
// Currently we just construct I/O tensors for the static graph;
const auto nInputs = fusion_node->g(attr::Subgraph)->inputs().size();
at::ArrayRef<IValue> inputs = last(stack, nInputs);
auto outputs =
CudaFusionManager::getManager().runFusionNode(kernel_id, inputs);
drop(stack, inputs.size());
stack.insert(
stack.end(),
std::make_move_iterator(outputs.begin()),
std::make_move_iterator(outputs.end()));
};
if (useFallback()) {
try {
// If a fusion failed once, it is likely to fail again, and failures are
// slow; so once a fusion fails, record the failure and always use the
// fallback for it instead.
int32_t kernel_id = fusion_node->i(attr::cache_id);
bool force_fallback =
CudaFusionManager::getManager().hasFallbackCode(kernel_id);
if (force_fallback) {
take_fallback(stack);
} else {
run_fusion();
}
} catch (...) {
TORCH_WARN(
"FALLBACK path has been taken inside: ",
__FUNCTION__,
". This is an indication that codegen failed for some reason.\n"
"To debug, try disabling the codegen fallback path by setting the env"
" variable `export PYTORCH_NVFUSER_DISABLE=fallback`\n");
take_fallback(stack);
}
} else {
run_fusion();
}
if (compare_callback.callback != nullptr) {
Stack fused_outputs;
Stack fallback_outputs;
int64_t output_count =
static_cast<int64_t>(fusion_node->g(attr::Subgraph)->outputs().size());
TORCH_CHECK(
output_count <= static_cast<int64_t>(stack.size()),
"Expected ",
output_count,
" outputs but found only ",
stack.size(),
" items on the stack");
fused_outputs.insert(
fused_outputs.begin(), stack.end() - output_count, stack.end());
if (stack_copy) {
take_fallback(*stack_copy);
TORCH_CHECK(
stack_copy->size() == stack.size(),
"Fused graph returns stack with ",
stack.size(),
" items, compared to ",
stack_copy->size(),
" from unfused graph");
fallback_outputs.insert(
fallback_outputs.begin(),
stack_copy->end() - output_count,
stack_copy->end());
}
auto graph_str = fusion_node->g(attr::Subgraph)->toString();
compare_callback.callback(fused_outputs, fallback_outputs, graph_str);
}
}
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch