File: lower_index.h (pytorch 1.13.1+dfsg-4)

#pragma once

#include <c10/macros/Export.h>

#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>

#include <vector>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

// TODO: Replace with a mutator, as IndexLowering replaces expressions with
// versions that perform indexing.
class TORCH_CUDA_CU_API IndexLowering : private OptOutConstDispatch {
 public:
  static std::vector<Expr*> getIndexedExprs(std::vector<Expr*> incoming_exprs) {
    FUSER_PERF_SCOPE("GpuLower::Lower::IndexLowering::getIndexedExprs");
    IndexLowering il;
    il.generate(incoming_exprs);
    return il.lowered_exprs_;
  }
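  // Illustrative usage (a sketch only; the actual call site lives in the GPU
  // lowering pipeline, which this header does not show):
  //
  //   std::vector<Expr*> exprs = /* flattened kernel IR */;
  //   exprs = IndexLowering::getIndexedExprs(exprs);
  //   // exprs now holds expressions rewritten to use explicit indexing.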

 private:
  IndexLowering() = default;

  // Append an expression to the currently active scope, or to lowered_exprs_
  // when no scope is active.
  void pushBack(Expr*);

  // Return the most recently inserted expression in the current active scope
  // or in the global scope.
  Expr* back() const;

  // Insert an expression before the current top-level expression.
  void insertAtTopLevel(Expr* expr);

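  // Expression handlers (OptOutConstDispatch overrides). Each is expected to
  // emit an indexed version of the visited expression via pushBack().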
  void handle(const ARangeOp*) final;
  void handle(const ViewAsScalar*) final;
  void handle(const UnaryOp*) final;

  void handle(const BinaryOp*) final;
  void handle(const TernaryOp*) final;
  void handle(const RNGOp*) final;
  void handle(const ReductionOp*) final;
  void handle(const GroupedReductionOp*) final;
  void handle(const WelfordOp*) final;
  void handle(const GroupedWelfordOp*) final;
  void handle(const LoadStoreOp*) final;
  void handle(const MmaOp*) final;
  void handle(const BroadcastOp*) final;

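  // Kernel IR scope, allocation, and synchronization nodes. ForLoop and
  // IfThenElse are assumed to recurse into their bodies while maintaining
  // active_scope_ and for_loops_ (see below).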
  void handle(const kir::ForLoop*) final;
  void handle(const kir::IfThenElse*) final;
  void handle(const kir::Allocate*) final;
  void handle(const kir::BlockSync*) final;
  void handle(const kir::GridSync*) final;
  void handle(const kir::CpAsyncWait*) final;
  void handle(const kir::CpAsyncCommit*) final;

  // Dispatch over all incoming expressions, populating lowered_exprs_.
  void generate(const std::vector<Expr*>& exprs);

  // Generate the indexed form of a producer value (val) as read by the
  // consumer value dst.
  Val* lowerSrcIndex(Val* val, Val* dst) const;

  // Generate the indexed form of an output value (dst) as written.
  Val* lowerDstIndex(Val* dst) const;

  void handleBlockReduction(const ReductionOp* rop, Val* out, Val* in);
  void handleGridReduction(const ReductionOp* rop, Val* out, Val* in);

  void handleBlockReduction(
      const GroupedReductionOp* rop,
      const std::vector<Val*>& outputs,
      const std::vector<Val*>& inputs);
  void handleGridReduction(
      const GroupedReductionOp* rop,
      const std::vector<Val*>& outputs,
      const std::vector<Val*>& inputs);

  void handleGridWelford(WelfordOp* new_wop);

  void handleGroupedBlockWelford(
      const GroupedWelfordOp* wop,
      const std::vector<WelfordTriplet>& output_vals,
      const std::vector<WelfordTriplet>& input_vals,
      const std::vector<WelfordTriplet>& init_vals);
  void handleGroupedGridWelford(
      const GroupedWelfordOp* wop,
      const std::vector<WelfordTriplet>& output_vals,
      const std::vector<WelfordTriplet>& input_vals,
      const std::vector<WelfordTriplet>& init_vals);

  // Allocate a unique buffer for grid reductions and broadcasts. One buffer
  // is allocated per output tensor of an expression.
  kir::Allocate* allocateUniqueBuffer(
      Val* buffer_size,
      DataType dtype,
      bool zero_init,
      TensorView* out_tv,
      std::unordered_map<TensorView*, kir::Allocate*>& alloc_map);
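  // Sketch of the assumed call pattern (argument values are illustrative):
  //
  //   kir::Allocate* work_buf = allocateUniqueBuffer(
  //       buffer_size, out_tv->dtype(), /*zero_init=*/false, out_tv,
  //       work_buffer_map_);
  //
  // Requesting a buffer again for the same out_tv is expected to return the
  // allocation already recorded in alloc_map rather than creating a new one.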

  std::vector<kir::Allocate*> allocateWelfordWorkBuffer(
      const std::vector<WelfordTriplet>& triplets,
      WelfordTriplet::ValName name,
      Val* buffer_size);

  // Allocate a fused reduction object uniquely for a given
  // TensorView. Parameter expr is the expression corresponding to the
  // fused reduction.
  void allocateUniqueFusedReduction(Expr* expr, TensorView* out_tv);

 private:
  std::vector<Expr*> lowered_exprs_;

  // This is a slight workaround: "scope" has two meanings here. There is the
  // kir::Scope held by ForLoop/IfThenElse, which is essentially a wrapper
  // around std::vector<Expr*>, and there is the ForLoop/IfThenElse node
  // itself. We need to carry both around: an expression pushed back into a
  // scope may go into either the then-body or the else-body of an IfThenElse,
  // but we also want to understand the nesting of the IfThenElse/ForLoop
  // nodes themselves.
  kir::Scope* active_scope_ = nullptr;
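  // Assumed handling pattern for nested scopes (illustrative sketch only;
  // lowered_loop is a hypothetical local naming the lowered copy of the loop):
  //
  //   // in handle(const kir::ForLoop* for_loop):
  //   auto* prev_scope = active_scope_;
  //   active_scope_ = &lowered_loop->body();  // pushBack() now targets the copy
  //   for (auto* expr : for_loop->body().exprs())
  //     OptOutConstDispatch::handle(expr);
  //   active_scope_ = prev_scope;
  //   pushBack(lowered_loop);                 // emit into the enclosing scope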

  // Track the for-loops to send to indexing. Similar to what is done in
  // kir::IrVisitor.
  std::vector<kir::ForLoop*> for_loops_;

  // Maps to keep track of allocated buffers and objects that must be
  // allocated only once
  std::unordered_map<TensorView*, kir::Allocate*> sync_buffer_map_;
  std::unordered_map<TensorView*, kir::Allocate*> work_buffer_map_;
  std::unordered_map<TensorView*, kir::AllocateFusedReduction*>
      fused_reduction_map_;
};

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch