1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
|
#pragma once
#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/codegen/cuda/dispatch.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <vector>
namespace torch {
namespace jit {
namespace fuser {
// Insert sync at end of for-loops to prevent write-after-read (WAR) race
// conditions.
//
// A WAR race condition occurs when the next iteration of the loop overwrites
// a shared memory value before a previous operation has finished reading it.
//
// WAR Race Check:
//   - Track all output shared memory TVs before the first sync.
//   - Track all input shared memory TVs after the last sync.
//   - If the intersection is non-empty, then there is a WAR race condition.
//   - Recursively check each nested for-loop.
//
// Parent-Child For-Loop Recursive Relationship
//
// Notation:
//   None - Zero Syncs
//   1+   - One or more Syncs
//   End  - Sync is last op in for-loop to prevent WAR race condition
//
// Default: Track all shared memory inputs and outputs
//
// Parent - None
//   Child - None => Append All Child Outputs to Parent Initial
//   Child - 1+ => Parent first sync => Inherit Child Initial + Final
//   Child - End => Parent first sync => Keep Child Initial / Clear Parent Final
//
// Parent - 1+
//   Child - None => Append All Child to Parent Last
//   Child - 1+ => Child Final to Parent Final / Discard Child Initial
//   Child - End => Clear Parent Last / Discard Child Initial
//
// If Child - End and Parent has zero remaining operations, then
// Parent inherits Child End.
//
// NOTE(review): the definition is not visible from this header, so the
// parameter/return notes below are inferred from the signature and the
// description above -- confirm against the implementation.
//   fusion  - presumably the Fusion that the expressions in `exprs` belong to.
//   exprs   - presumably the ordered list of expressions to scan for WAR
//             hazards; taken by const reference.
//   returns - a std::vector<Expr*>, presumably the expression list with the
//             required syncs inserted.
std::vector<Expr*> insertThreadSynchronization(
Fusion* fusion,
const std::vector<Expr*>& exprs);
} // namespace fuser
} // namespace jit
} // namespace torch
|