1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
|
#pragma once
#include <c10/macros/Export.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Insert syncs at the end of for-loops to prevent write-after-read (WAR)
//! race conditions.
//!
//! A WAR race condition occurs when the next iteration of a loop overwrites a
//! shared memory value before a previous operation has finished reading it.
//!
//! \param exprs Expressions to process. The vector is taken by const
//!   reference, so the caller's container itself is not modified; the
//!   pointed-to Expr nodes are non-const.
//! \return A new vector of expressions. NOTE(review): presumably the input
//!   expressions with the synchronization inserted -- confirm against the
//!   definition, which is not visible in this header.
std::vector<Expr*> insertWarThreadSynchronization(
const std::vector<Expr*>& exprs);
//! Insert syncs between a write to shared memory and a subsequent read of it,
//! i.e. guard against read-after-write (RAW) hazards.
//!
//! The RAW pass is run before indexing, unrolling (loop duplication), memory
//! aliasing, and index (grid/block bcast/reduction).
//!
//! \param exprs Expressions to process. The vector is taken by const
//!   reference, so the caller's container itself is not modified; the
//!   pointed-to Expr nodes are non-const.
//! \return A new vector of expressions. NOTE(review): presumably the input
//!   expressions with the synchronization inserted -- confirm against the
//!   definition, which is not visible in this header.
std::vector<Expr*> insertRawThreadSynchronization(
const std::vector<Expr*>& exprs);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch
|