File: utils.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (204 lines) | stat: -rw-r--r-- 7,615 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#pragma once

#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <ATen/cuda/CUDAContext.h>
#include <torch/torch.h>

#include <cuda_runtime.h>

using namespace torch::jit::fuser::cuda;

// Tensor factory helpers. Only the declarations are visible in this header;
// the definitions live elsewhere. Every helper returns a raw TensorView* and
// takes the element type as `dtype`, which defaults to DataType::Float.
// NOTE(review): ownership of the returned TensorView is not visible from
// here -- confirm against the definitions.

// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
// but unknown sizes
//   ndims: number of dimensions of the tensor
//   dtype: element data type
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float);

// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
// but unknown sizes. Taken from test_gpu.cpp
//   ndims: number of dimensions of the tensor
//   dtype: element data type
TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float);

// Make a non-contiguous tensor of compile-time known sizes
//   shape: one extent per dimension (taken by value)
//   dtype: element data type
TensorView* makeConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float);

// Make a contiguous tensor of compile-time known sizes
//   shape: one extent per dimension (taken by value)
//   dtype: element data type
TensorView* makeContigConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float);

// Overloads that return a std::string for each of the parameter types below
// (reduction, pointwise, transpose, generic heuristic and launch parameters).
// Only the declarations are visible here; the exact text produced is decided
// by the definitions elsewhere.
std::string toString(const ReductionParams& rparams);
std::string toString(const PointwiseParams& params);
std::string toString(const TransposeParams& params);
std::string toString(const std::shared_ptr<HeuristicParams>& params);
std::string toString(LaunchParams lparams);

// Run benchmark iterations with provided inputs. If not segmented, report
// kernel time from the runtime, as well as heuristic parameters. If segmented
// use timers. Make sure to clear L2 between iterations.
//
//   benchmark_state:       google-benchmark state of the running benchmark
//   fusion_executor_cache: executor cache used to run the fusion
//                          (presumably non-owning -- confirm in the definition)
//   aten_inputs:           inputs handed to the fusion, taken by non-const
//                          reference
void runBenchmarkIterations(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    std::vector<c10::IValue>& aten_inputs);

// NOTE(review): declared here, defined elsewhere. Going by the name and the
// comment on runBenchmarkIterations, this is the helper used to clear the GPU
// L2 cache between iterations -- confirm against the definition.
void clearL2Cache();

//! Measures elapsed GPU time between two CUDA events.
//!
//! Construction creates both events and records the start event. restart()
//! re-records the start event. elapsed() records the finish event, waits for
//! both events, and returns the time between them in milliseconds as reported
//! by cudaEventElapsedTime. All events are recorded without an explicit
//! stream argument, i.e. using cudaEventRecord's default stream parameter.
//!
//! The object owns its two cudaEvent_t handles and destroys them in the
//! destructor. It is therefore neither copyable nor movable: a copy would
//! hold the same handles and destroy them a second time.
class CudaKernelTimer {
 public:
  //! Creates the start/finish events and immediately records the start event.
  CudaKernelTimer() {
    C10_CUDA_CHECK(cudaEventCreate(&start_event));
    C10_CUDA_CHECK(cudaEventCreate(&finish_event));
    C10_CUDA_CHECK(cudaEventRecord(start_event));
  }

  //! Destroys both events. Errors are deliberately ignored because a
  //! destructor must not throw.
  ~CudaKernelTimer() {
    C10_CUDA_IGNORE_ERROR(cudaEventDestroy(start_event));
    C10_CUDA_IGNORE_ERROR(cudaEventDestroy(finish_event));
  }

  // Owning handles: forbid copies (this also suppresses the implicit moves).
  CudaKernelTimer(const CudaKernelTimer&) = delete;
  CudaKernelTimer& operator=(const CudaKernelTimer&) = delete;

  //! Re-records the start event so the next elapsed() measures from here.
  void restart() {
    C10_CUDA_CHECK(cudaEventRecord(start_event));
  }

  //! Records the finish event, blocks until both events have completed and
  //! returns the time between start and finish in milliseconds.
  float elapsed() {
    C10_CUDA_CHECK(cudaEventRecord(finish_event));
    C10_CUDA_CHECK(cudaEventSynchronize(start_event));
    C10_CUDA_CHECK(cudaEventSynchronize(finish_event));
    C10_CUDA_CHECK(
        cudaEventElapsedTime(&kernel_time_ms_, start_event, finish_event));
    return kernel_time_ms_;
  }

 private:
  // Result of the most recent elapsed() call, in milliseconds.
  float kernel_time_ms_ = 0;
  // Event handles; created in the constructor, destroyed in the destructor.
  cudaEvent_t start_event = {};
  cudaEvent_t finish_event = {};
};

// Types and accessor for the map of FusionExecutorCache instances keyed by
// string. BenchmarkGraph (below) looks entries up by graphName().
namespace executorCache {
// Owning pointer to one FusionExecutorCache.
using ExecutorPtr = std::unique_ptr<FusionExecutorCache>;
// Map from a string key to the cache it owns.
using ExecutorMap = std::unordered_map<std::string, ExecutorPtr>;
// Returns a reference to the map; only declared here, defined elsewhere.
// NOTE(review): presumably a single process-wide instance -- confirm against
// the definition.
ExecutorMap& getGlobalMap();
} // namespace executorCache

//! Benchmark fixture that manages FusionExecutorCache instances for
//!  all defined benchmarks.
//!
//!  Each concrete fixture supplies a name (graphName) and a function that
//!  builds its fusion (setupFusion). SetUp() builds the fusion and stores a
//!  FusionExecutorCache for it in the map returned by
//!  executorCache::getGlobalMap(), keyed by graphName(), unless an entry
//!  already exists. getExecutorCache() hands that entry back.
class BenchmarkGraph : public benchmark::Fixture {
 public:
  using SetupFusionFunction = std::function<void(Fusion*)>;
  using SetupFusionMap = std::unordered_map<std::string, SetupFusionFunction>;

  //! Key under which this fixture's FusionExecutorCache is stored.
  virtual std::string graphName() = 0;
  //! Returns the callable that populates a freshly created Fusion.
  virtual SetupFusionFunction setupFusion() = 0;

  //! Returns the cache stored under graphName(). Asserts that the entry is
  //! non-null, i.e. that SetUp() has already created it.
  FusionExecutorCache* getExecutorCache() {
    auto& executor_ = getExecutorCacheMap()[graphName()];
    TORCH_INTERNAL_ASSERT(executor_);
    return executor_.get();
  }

  //! Creates the FusionExecutorCache for this graph on first use; later calls
  //! find the existing entry and do nothing.
  void SetUp(const ::benchmark::State& state) {
    auto& executor_ = getExecutorCacheMap()[graphName()];
    // Makes sure same graph hasn't been compiled before
    if (!executor_) {
      auto fusion_ptr = std::make_unique<Fusion>();
      {
        // The guard has to be a named local: an unnamed
        // `FusionGuard(fusion_ptr.get());` temporary is destroyed at the end
        // of its own statement, so it would not be alive during the call
        // below. NOTE(review): FusionGuard is expected to make the fusion the
        // active one for its lifetime -- confirm in fusion.h.
        FusionGuard fusion_guard(fusion_ptr.get());
        setupFusion()(fusion_ptr.get());
      }
      // executor_ refers to the map slot obtained above, so assign through it
      // rather than looking the key up a second time.
      executor_ = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
    }
  }

  //! Nothing to release per run: the caches stay in the map so they can be
  //! reused.
  void TearDown(const ::benchmark::State& state) {}

 protected:
  //! Map shared by every fixture, obtained from executorCache::getGlobalMap().
  static executorCache::ExecutorMap& getExecutorCacheMap() {
    return executorCache::getGlobalMap();
  }
};

// Two-level stringification. NVFUSER_TO_STRING lets its argument be
// macro-expanded first and then forwards it to NVFUSER_TO_STRING_HELPER,
// which applies `#` to the tokens it receives and wraps the resulting string
// literal in a std::string.
#define NVFUSER_TO_STRING_HELPER(token) std::string(#token)
#define NVFUSER_TO_STRING(token) NVFUSER_TO_STRING_HELPER(token)

//! NVFUSER_BENCHMARK_RUN utility usage:
//!  This utility helps create and manage FusionExecutorCaches and tries to use
//!  the caching mechanism in NVFuser to avoid re-compilation.
//!
//!  There are two macros in this utility: NVFUSER_BENCHMARK_DEFINE and
//!  NVFUSER_BENCHMARK_RUN. The user needs to supply two functions,
//!  SETUP_FUSION and RUN_FUSION, with the following signatures:
//!
//!  SETUP_FUSION(Fusion* , args...);
//!  RUN_FUSION(benchmark::State&, FusionExecutorCache* , args...);
//!
//!  where args... are additional arguments, and they need to be the same for
//!  SETUP_FUSION and RUN_FUSION.
//!
//!  SETUP_FUSION is called once in each definition of benchmark to build the
//!  fusionIR graph
//!
//!  RUN_FUSION is just like the normal benchmark instance, except that a
//!  FusionExecutorCache will be provided for scheduling, running and timing
//!  the fusion runs. It is called once in each benchmark instance. For
//!  example:
//!   NVFUSER_BENCHMARK_RUN(my_benchmark)
//!    ->RangeMultiplier(2)
//!    ->Ranges({{1, 4}});
//!  calls RUN_FUSION 3 times.
//!
//!  To register a benchmark, the API is:
//!
//!  NVFUSER_BENCHMARK_DEFINE(my_benchmark,SETUP_FUSION,RUN_FUSION,args...);
//!
//!    where my_benchmark is any unique name given for this benchmark,
//!      SETUP_FUSION, RUN_FUSION as described above,
//!      args... is the arg list supplied to both setup_fusion and run_fusion
//!
//!  each NVFUSER_BENCHMARK_DEFINE registers a benchmark with a single
//!  FusionExecutorCache, i.e. a single fusion graph, and multiple benchmark
//!  data points can be registered like:
//!
//!  NVFUSER_BENCHMARK_RUN(my_benchmark)
//!    ->Ranges({{1,2}});
//!
//!  NVFUSER_BENCHMARK_RUN(my_benchmark)
//!    ->Ranges({{3,4}});
//!
//!  All datapoints will use the same FusionExecutorCache so recompilation is
//!  avoided as much as possible.

//! Expands to a fixture class named <BENCHMARK_NAME>___GRAPH, derived from
//! BenchmarkGraph, followed by the google-benchmark body for BENCHMARK_NAME:
//!  - graphName() returns the stringified fixture class name;
//!  - setupFusion() returns a lambda that calls SETUP_FUSION(fusion, args...);
//!  - the benchmark body calls
//!    RUN_FUSION(benchmark_state, getExecutorCache(), args...).
//! Both overrides are marked `override` so the compiler checks them against
//! the virtuals declared in BenchmarkGraph.
//! At least one extra argument must be supplied: with an empty args... list
//! the expansion leaves a trailing comma in both calls.
#define NVFUSER_BENCHMARK_DEFINE(                                       \
    BENCHMARK_NAME, SETUP_FUSION, RUN_FUSION, ...)                      \
  class BENCHMARK_NAME##___GRAPH : public BenchmarkGraph {              \
   public:                                                              \
    std::string graphName() override {                                  \
      return NVFUSER_TO_STRING(BENCHMARK_NAME##___GRAPH);               \
    }                                                                   \
    SetupFusionFunction setupFusion() override {                        \
      return [](Fusion* fusion) { SETUP_FUSION(fusion, __VA_ARGS__); }; \
    }                                                                   \
  };                                                                    \
  BENCHMARK_DEFINE_F(BENCHMARK_NAME##___GRAPH, BENCHMARK_NAME)          \
  (benchmark::State & benchmark_state) {                                \
    RUN_FUSION(                                                         \
        benchmark_state,                                                \
        BENCHMARK_NAME##___GRAPH::getExecutorCache(),                   \
        __VA_ARGS__);                                                   \
  }

//! Forwards to BENCHMARK_REGISTER_F for the fixture/benchmark pair that
//! NVFUSER_BENCHMARK_DEFINE(NAME, ...) creates: the fixture is NAME___GRAPH
//! and the benchmark is NAME. Further configuration calls can be chained onto
//! the result with `->`.
#define NVFUSER_BENCHMARK_RUN(NAME) \
  BENCHMARK_REGISTER_F(NAME##___GRAPH, NAME)