File: partition.cpp

#include <torch/csrc/jit/codegen/cuda/partition.h>
#include <ATen/core/jit_type.h>
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/parser.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

namespace {

// Check that all outputs are:
//   1. TensorType;
//   2. on the same device.
// TODO: update this when codegen can output scalars.
static c10::optional<c10::Device> getDevice(const Value* value) {
  if (!value->type()->isSubtypeOf(TensorType::get())) {
    // not a tensor type; return nullopt (codegen cannot output scalars yet).
    return c10::nullopt;
  }
  return value->type()->expect<TensorType>()->device();
}

static c10::optional<c10::Device> getDevice(const Node* node) {
  auto outputs = node->outputs();
  for (auto output : outputs) {
    auto device = getDevice(output);
    if (device.has_value()) {
      return device;
    }
  }
  return c10::nullopt;
}

static bool isFusableDevice(const Node* node, const c10::Device device) {
  for (auto value : node->outputs()) {
    auto output_device = getDevice(value);
    if (output_device.has_value() && output_device.value() != device) {
      return false;
    }
  }
  return true;
}

// TODO: we need to check input type when we handle `to()`
static bool isFusableDevice(const Node* node) {
  auto device = getDevice(node);
  if (!device.has_value()) {
    return true;
  }
  return device->is_cuda();
}

inline bool isFusableNode(const Node* node) {
  // checks if node is compatible with parser:
  // 1. if we have a parsing rule; or 2. if the node is already a fusion group.
  return (isNodeParsible(node) || node->kind() == prim::CudaFusionGroup);
}

bool hasReductionOperation(const Node* node) {
  if (isReductionNode(node)) {
    return true;
  }
  if (node->kind() == prim::CudaFusionGroup) {
    for (auto n : node->g(attr::Subgraph)->nodes()) {
      if (hasReductionOperation(n)) {
        return true;
      }
    }
  }
  return false;
}

// Utility function to check whether the node implies a broadcast on a given
// shape (assumed to be the shape of an input tensor).
// Limitations:
//   1. we rely on shape information to judge this, so the output shape must be
//      available;
//   2. we basically compare the given shape to the shape of the only output of
//      the node and return true if it implies a broadcast from the former to
//      the latter.
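//
// Illustrative example (not part of the original comment): given an input
// shape of [4, 1, 8] and a node whose single output has shape [4, 5, 8], the
// ranks match and the input contains a size-1 dimension, so this returns true;
// if the output shape were [4, 8] instead (lower rank), this would return
// false.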
bool maybeBroadcastOnShape(
    const Node* n,
    const std::vector<c10::optional<int64_t>>& shape) {
  TORCH_INTERNAL_ASSERT(
      n->outputs().size() == 1,
      "not expecting multiple outputs from a node, graph partitioning logic needs to be updated");
  // assumes that if output is not a tensor type, it's not broadcasting
  if (auto out_type = n->output(0)->type()->cast<TensorType>()) {
    if (out_type->dim()) {
      if (out_type->dim().value() < shape.size()) {
        // no broadcast for reduction operation;
        return false;
      } else if (out_type->dim().value() > shape.size()) {
        // increased rank means the input is broadcast to a higher rank;
        return true;
      } else {
        // same rank, we need to iterate through sizes and check if size-1
        // exists in input `shape`
        for (const auto& opt_size : shape) {
          // TODO: not sure if we need to check for output size != 1, since we
          // are currently marking all size-1 dimensions as broadcast in
          // codegen.
          if (opt_size.has_value() && opt_size.value() == 1) {
            return true;
          }
        }
      }
    }
  }
  return false;
}

// [ Note - tricky broadcasting ]
//
// github issue # 190
//
// To extend the issue further, we consider two broadcasting cases that are
// difficult to schedule naively:
//   scenario 1: single tensor with multiple broadcasting semantics;
//               ```
//                   %t = op(...)
//                   %t0_o = op0(%t, %t0)
//                   %t1_o = op1(%t, %t1)
//               ```
//               It's hard to check/validate whether `%t0` and `%t1` imply
//               identical broadcasting for `%t`, so that we could simply
//               broadcast it to their common shape and use the broadcasted
//               tensor view in both `op0` and `op1`; if `%t0` and `%t1` have
//               different shapes, we would need a differently broadcasted `%t`
//               for each of the two ops.
//               Even with this condition sorted out, scheduling is challenging:
//               we cannot easily inline the computation of `%t` into the
//               downstream consumers of `%t0_o` and `%t1_o`, because
//               `computeAt` could propagate contradicting transformations onto
//               the common ancestor `%t`.
//               See footnote*.
//   scenario 2: output tensor_view which is broadcasted later;
//               ```
//                   %t = op(...)
//                   %t0_o = op0(%t, %t0)
//                   return (%t, %t0_o)
//               ```
//               Similarly, if we need to broadcast `%t` to `%t0` for `op0` and
//               also use `%t` as an output, it complicates scheduling as well.
//
// Currently we just avoid the two cases in our graph partitioning.
//
// We bake this check into our partitioning, where we merge nodes from producer
// to consumer. In the example below, we list all "types" of edges among the
// producer, the consumer, and the outside world.
//
//   %input_t0, %input_t1, %input_t2 # inputs from outside world feeding
//                                   # producer/consumer pair
//   %p_out_t0, %p_out_t1 = producer(%input_t0, %input_t1)
//   %c_out_t, ... = consumer(%input_t0, %input_t2, %p_out_t0)
//
// producer/consumer : the nodes that we are trying to merge; each node could be
//                     a parsible real operation or a `CudaFusionGroup`.
// %input_t0         : inputs shared by both producer & consumer
// %input_t1         : inputs fed only to the producer, not to the consumer
// %input_t2         : inputs fed only to the consumer, not to the producer
// %p_out_t0         : output of the producer that is fed to the consumer
// %p_out_t1         : output of the producer that is not fed to the consumer
// %c_out_t          : output of the consumer
//
// We can see that after merging consumer & producer, we will have:
//   %input_t0, %input_t1, %input_t2 # inputs from outside world feeding
//                                   # producer/consumer pair
//   %p_out_t, %c_out_t = group(%input_t0, %input_t1, %input_t2)
//
// We assume that any existing `CudaFusionGroup` does not already violate the
// broadcasting constraints mentioned above.
//
// If we examine the `group`, new cases of scenario 1 (multiple broadcasts)
// could only be created by the edges newly merged into the `group`, that is:
//   case 1. `%input_t0`, shared by `producer` and `consumer`
//   case 2. `%p_out_t0`, produced by `producer` and fed to `consumer`
//
// New cases of scenario 2 (an output that is broadcast later) could only be
// added via:
//   case 3. `%p_out_t0`, produced by `producer` and fed to `consumer`, which
//           could be broadcasted in the consumer subgraph.
//
// footnote*:
// We only disable multiple broadcasts directly on the tensor, instead of
// tracing every broadcast further down.
// We don't think we need to worry about broadcasting further down the
// dependency chain, as those broadcasts create new IterDomains, which do not
// have the problem of conflicting broadcasting.
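//
// Illustrative example (not part of the original note): suppose `%t` has shape
// [4, 1], `%t0` has shape [4, 8], and `%t1` has shape [4, 3]. Then `op0` needs
// `%t` broadcast to [4, 8] while `op1` needs `%t` broadcast to [4, 3], i.e.
// two conflicting broadcasting schemes on the same tensor view; this is the
// kind of pattern `createTrickyBroadcast` rejects below.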
bool createTrickyBroadcast(const Node* consumer, const Node* producer) {
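  // Helper: counts how many uses of the value at input `offset` of `node`
  // imply a broadcast of the given `shape`; when `node` is a CudaFusionGroup,
  // it inspects the uses of the corresponding subgraph input instead.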
  auto count_broadcasting_in_node =
      [](const Node* node,
         const std::vector<c10::optional<int64_t>>& shape,
         size_t offset) {
        int num_broadcasting = 0;
        if (node->kind() == prim::CudaFusionGroup) {
          // Be careful here: `subgraph_input`, as its name suggests, lives in
          // a different graph from `node`.
          const auto& subgraph_input =
              node->g(attr::Subgraph)->inputs()[offset];
          for (const auto& use : subgraph_input->uses()) {
            if (maybeBroadcastOnShape(use.user, shape)) {
              num_broadcasting++;
            }
          }
        } else {
          if (maybeBroadcastOnShape(node, shape)) {
            num_broadcasting++;
          }
        }
        return num_broadcasting;
      };

  // case 1. We check shared inputs to `producer` & `consumer`;
  for (int i = 0; i < static_cast<int>(producer->inputs().size()); i++) {
    auto n_input = producer->input(i);
    auto n_input_type = n_input->type()->cast<TensorType>();
    if (n_input_type != nullptr && n_input_type->sizes().sizes()) {
      std::vector<c10::optional<int64_t>> n_input_shape =
          n_input_type->sizes().sizes().value();
      int num_broadcasting = 0;

      // check broadcasting for the n_input inside `consumer`;
      for (const auto& use : n_input->uses()) {
        if (use.user == consumer) {
          num_broadcasting +=
              count_broadcasting_in_node(consumer, n_input_shape, use.offset);
        }
      }

      // if no broadcasting happened in the consumer, there's no point checking
      // for multiple broadcasts in the producer alone;
      if (num_broadcasting == 0) {
        continue;
      }

      // check broadcasting for n_input inside `producer`;
      num_broadcasting +=
          count_broadcasting_in_node(producer, n_input_shape, i);

      // encountered multiple broadcasting schemes for a single TV; we will not
      // be able to schedule this, so prevent the fusion (case 1);
      if (num_broadcasting > 1) {
        return true;
      }
    }
  }

  // case 2. We check inputs to `consumer` that are also outputs from
  // `producer`.
  for (int i = 0; i < static_cast<int>(producer->outputs().size()); i++) {
    auto n_output = producer->output(i);
    auto n_output_type = n_output->type()->cast<TensorType>();
    if (n_output_type != nullptr && n_output_type->sizes().sizes()) {
      std::vector<c10::optional<int64_t>> n_output_shape =
          n_output_type->sizes().sizes().value();
      int num_broadcasting = 0;
      // If we only looked at case 1 & case 2, we would need to check broadcasts
      // of `n_output` inside `producer` when it is a `prim::CudaFusionGroup`.
      // This is actually not necessary once we consider case 3, as we already
      // avoid broadcasting on outputs;

      // TODO: merge this code with case 1.
      // check broadcasting for the n_output inside `consumer`;
      bool use_as_output = false;
      for (const auto& use : n_output->uses()) {
        if (use.user == consumer) {
          num_broadcasting +=
              count_broadcasting_in_node(consumer, n_output_shape, use.offset);
        } else {
          // case 3. the output is used by nodes other than the consumer; no
          //         broadcasting is allowed;
          use_as_output = true;
        }
      }

      // encountered multiple broadcasting schemes for a single TV; we will not
      // be able to schedule this, so prevent the fusion (case 2).
      // Alternatively, if use_as_output is true, we do not permit any broadcast
      // at all (case 3).
      if (num_broadcasting > (use_as_output ? 0 : 1)) {
        return true;
      }
    }
  }

  return false;
}

} // namespace

bool isFusableCudaFusionGroup(const Node* node) {
  FUSER_PERF_SCOPE("isFusableCudaFusionGroup");

  if (isFusableNode(node)) {
    return isFusableDevice(node);
  }
  return false;
}

bool isFusableCudaFusionGroup(const Node* fusion, const Node* node) {
  FUSER_PERF_SCOPE("isFusableCudaFusionGroup");

  // TODO: lift the restriction of not fusing a producer that contains a
  //       reduction once we have proper scheduling.
  if (isFusableCudaFusionGroup(node) && !hasReductionOperation(node) &&
      !createTrickyBroadcast(fusion, node)) {
    // ensure that if the node has a designated device, it's on the same device
    // as the fusion.
    // TODO: is there a danger of us fusing operations that are supposed to run
    //       on separate GPUs? And is that necessarily bad?
    auto device = getDevice(fusion);
    return (!device.has_value() || isFusableDevice(node, device.value()));
  }
  return false;
}
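
// Illustrative usage (an assumption, not part of this file): a partitioning
// pass would typically use the single-argument overload to decide whether a
// node can start a new CudaFusionGroup, and the two-argument overload to
// decide whether `node` can be merged into an existing `fusion` group, e.g.:
//
//   if (isFusableCudaFusionGroup(fusion_group, candidate)) {
//     // merge `candidate` into `fusion_group`
//   }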

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch