File: executor.h

package info (click to toggle)
pytorch 1.7.1-7
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 80,340 kB
  • sloc: cpp: 670,830; python: 343,991; ansic: 67,845; asm: 5,503; sh: 2,924; java: 2,888; xml: 266; makefile: 244; ruby: 148; yacc: 144; objc: 51; lex: 44
file content (150 lines) | stat: -rw-r--r-- 4,624 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#pragma once
#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
#include <torch/csrc/jit/codegen/cuda/executor_utils.h>
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_cloner.h>
#include <torch/csrc/jit/codegen/cuda/ir_printer.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/utils.h>

#include <c10/core/DeviceType.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

// TODO: Should this actually be in launch params?
struct TORCH_CUDA_API CompileOptions {
  c10::Device device = c10::Device(c10::DeviceType::CUDA, 0);
};

class TORCH_CUDA_API FusionExecutor : public NonCopyable {
 public:
  // Unsafe compilation that's useful for debugging kernels, iterating over
  // slight modifications of a generated kernel
  void debugCompileFusionFromStr(
      Fusion* fusion,
      const std::string& code,
      const std::string& name,
      int id,
      CompileOptions options = CompileOptions());

  void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions());

  std::vector<at::Tensor> runFusion(
      const at::ArrayRef<IValue>& inputs,
      const std::vector<at::Tensor>& outputs,
      const LaunchParams& launch_constraints = LaunchParams(),
      const c10::optional<size_t>& opt_code = c10::nullopt);

  std::vector<at::Tensor> runFusion(
      const at::ArrayRef<IValue>& inputs,
      const LaunchParams& launch_constraints = LaunchParams(),
      const c10::optional<size_t>& opt_code = c10::nullopt) {
    return runFusion(inputs, {}, launch_constraints, opt_code);
  }

  // function to query whether a `FusionExecutor` has a compiled kernel to
  // execute
  bool compiled() const {
    return fusion_id_ != -1;
  };

  void evictCache(size_t cache_id) {
    executor_entry_lookup_.erase(cache_id);
  }

  // TODO: strides would also be important when we handle permutations in
  //       codegen.
  // struct used to hold necessary information to launch compiled kernel on a
  // given input set.
  struct ExecutorEntry {
    bool init = false;
    LaunchParams launch_params;
    std::vector<std::vector<int64_t>> output_sizes;
    std::vector<at::ScalarType> output_types;
    std::vector<std::vector<int64_t>> empty_buffer_sizes;
    std::vector<at::ScalarType> empty_buffer_types;
    std::vector<std::vector<int64_t>> zero_buffer_sizes;
    std::vector<at::ScalarType> zero_buffer_types;
    uint64_t rand_offset;
  };

  Kernel* kernel() const {
    return lowered_.kernel();
  }

 private:
  struct GlobalBuffers {
    std::vector<at::Tensor> empty_buffers;
    std::vector<at::Tensor> zero_buffers;
  };

  std::string kernelName() const {
    std::stringstream ss;
    ss << "kernel" << fusion_id_;
    return ss.str();
  }

  static std::string kernelNamespace() {
    return "CudaCodeGen";
  }

  // Add preamble and wrap in namespace
  std::string getStructuredCode(const std::string& kernel);

  LaunchParams computeLaunchParams(
      const LaunchParams& launch_constraints,
      StatefulExpressionEvaluator& see);

  uint64_t computeSharedMemory(
      StatefulExpressionEvaluator& see,
      const std::vector<kir::Allocate*>& buffers,
      bool align_padding = false,
      uint64_t total = 0);

  // return a pair of vector of tensors, where tensors in the first vector are
  // not initialized, while the second vector contains zero-initiliazed tensors
  GlobalBuffers allocGlobalVals(StatefulExpressionEvaluator& see);

  std::vector<at::Tensor> allocOutputs(StatefulExpressionEvaluator& see);

  void setUsedTVs();

  const std::vector<TensorView*>& getUsedTVs() const {
    return used_tvs_;
  };

 private:
  Fusion fusion_;

  // TODO(kir): caching the values here is no longer needed
  bool has_block_reductions = false;
  bool has_grid_reductions = false;
  bool has_block_broadcasts = false;

  CompileOptions options_;
  size_t max_device_smem = std::numeric_limits<size_t>().max();
  executor_utils::NvrtcFunction compiled_kernel_;

  // TensorViews actually used in the kernel.
  std::vector<TensorView*> used_tvs_;

  // Counter to be used for kernel name.
  int fusion_id_ = -1;
  static int fusion_id_counter_;

  GpuLower lowered_;

  // lookup table to take short cut to retrieve recorded information in order to
  // launch kernels without re-inference parameters.
  std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch