#pragma once

#include <ATen/ATen.h>
#include <c10/util/Exception.h>

#include <torch/csrc/jit/codegen/cuda/type.h>
#include <torch/csrc/jit/ir/ir.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

void debugPrint(const c10::TensorTypePtr& type);
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
bool is_cpu_scalar(const at::Tensor& tensor);
bool is_cpu_scalar(const c10::TensorType& tensor_type);
// TODO: merge these two
// Check if the inputs are compatible with 32-bit index mode
int getCommonDeviceCUDA(const at::ArrayRef<IValue>& inputs);
KernelIndexMode collectIndexMode(const at::ArrayRef<at::IValue>& inputs);
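
//! Example usage (an illustrative sketch only; `fusion_inputs` is a
//! hypothetical at::ArrayRef<c10::IValue> of runtime inputs, and
//! KernelIndexMode::INT32 is assumed to be one of the enumerators
//! declared in type.h):
//!
//!   int device = getCommonDeviceCUDA(fusion_inputs);
//!   if (collectIndexMode(fusion_inputs) == KernelIndexMode::INT32) {
//!     // all inputs are addressable with 32-bit indices
//!   }
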
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  FusionIrPresched, //!< Dump the Fusion IR before scheduling
  KernelIr, //!< Dump the compiler Kernel IR
  ComputeAtMap, //!< Dump the computeAt map
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump the generated CUDA code to files
  DebugInfo, //!< Embed line info and debug info in the compiled kernel, and
             //!< dump the full CUDA C++ code
  LaunchParam, //!< Dump the launch parameters of the kernel
  FusionSegments, //!< Dump the segmented fusion graph
  FusionSegmenterLog, //!< Dump detailed segmenter logging
  FusionArgs, //!< Print the runtime fusion arguments
  KernelArgs, //!< Print the runtime kernel arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print the effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump a drawing of the segmented fusion graph
  PrintPtxasLog, //!< Print the verbose ptxas log, including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer reuse
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo, //!< Dump halo information of tensors
  PerfDebugVerbose, //!< When running kernels, print verbose information
                    //!< associated with what is running
  PythonDefinition, //!< Dump the Python frontend fusion definition
  PythonFrontendDebug, //!< Dump Python frontend debug information
  TransformPropagator, //!< When running TransformPropagator, print the
                       //!< propagation path and replay result
  InlinePropagator, //!< When running InlinePropagator, print the propagation
                    //!< path and inlining result
  Cubin, //!< Dump the compiled CUBIN
  Ptx //!< Dump the compiled PTX
};
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
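
//! Example usage (an illustrative sketch; `kernel_code` is a hypothetical
//! string holding the generated kernel source):
//!
//!   if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
//!     std::cout << kernel_code << std::endl;
//!   }
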
//! Types of features to disable
//!
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
  ArchCheck, //!< Disable hardware-specific checks to enable cross-arch
             //!< debugging
  Fallback, //!< Disable the fallback to eager mode
  Fma, //!< Disable FMA instructions
  IndexHoist, //!< Disable index hoisting
  Nvtx, //!< Disable NVTX instrumentation
  PredicateElimination //!< Disable predicate elimination
};
TORCH_CUDA_CU_API bool isOptionDisabled(DisableOption option);
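
//! Example usage (an illustrative sketch):
//!
//!   if (!isOptionDisabled(DisableOption::IndexHoist)) {
//!     // hoist common index computations out of the kernel body
//!   }
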
//! Types of features to enable
//!
//! These can be set through the `PYTORCH_NVFUSER_ENABLE` environment variable
//!
enum class EnableOption {
  Complex, //!< Enable complex support in Python
  KernelProfile, //!< Enable intra-kernel performance profiling
  LinearDecomposition, //!< Enable linear-bias decomposition
  ConvDecomposition, //!< Enable conv-bias decomposition
  TransposeScheduler //!< Enable the experimental transpose scheduler
};
TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
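
//! Example usage (an illustrative sketch):
//!
//!   if (isOptionEnabled(EnableOption::KernelProfile)) {
//!     // instrument the generated kernel with profiling counters
//!   }
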
// Check if the fallback path should be used; it dispatches to eager mode if
// any errors are encountered. Helpful for debugging.
bool useFallback();
//! Ceil integer division
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
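
//! Example (rounds up, assuming a non-negative numerator and a positive
//! divisor): ceilDiv(10, 4) == 3, whereas 10 / 4 == 2. A typical use is
//! computing a launch dimension, e.g.
//!
//!   int64_t num_blocks = ceilDiv(num_elements, threads_per_block);
//!
//! where `num_elements` and `threads_per_block` are hypothetical names.
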
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!   class Foo : public NonCopyable {
//!     ...
//!   };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  // Replacement for static_cast<T*>(ptr): ptr->as<T>()
  // (checked in DEBUG builds)
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<T*>(this);
#else
    auto downcast_ptr = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<const T*>(this);
#else
    auto downcast_ptr = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }
  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Prefer
  //!
  //!   if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! over
  //!
  //!   if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
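
//! Example (an illustrative sketch; `UnaryOp` stands in for any class
//! derived from PolymorphicBase, and `node` for a PolymorphicBase*):
//!
//!   // Checked downcast when the dynamic type is already known:
//!   UnaryOp* uop = node->as<UnaryOp>();
//!
//!   // Conditional cast, as recommended in the note above:
//!   if (auto* unary = dynamic_cast<UnaryOp*>(node)) {
//!     // ...
//!   }
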
//! Pack two values of the same enum type into a single integer key, so that
//! a pair of enums can be dispatched in one switch statement.
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  constexpr unsigned int _WORD_SHIFT = 16;
  return ((unsigned int)t1 << _WORD_SHIFT) + (unsigned int)t2;
}
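
//! Example (an illustrative sketch; DataType is the enum from type.h, with
//! Float and Half assumed to be among its enumerators):
//!
//!   switch (switch_pair(lhs_type, rhs_type)) {
//!     case switch_pair(DataType::Float, DataType::Half):
//!       // handle the (Float, Half) combination
//!       break;
//!     default:
//!       break;
//!   }
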
//! Query the sizes of a tensor type, one entry per dimension
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch