File: operator_fallback_ideep.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (190 lines) | stat: -rw-r--r-- 7,776 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#pragma once

#include <caffe2/core/common.h>
#include <caffe2/core/context.h>
#include <caffe2/core/operator.h>
#include <caffe2/ideep/ideep_utils.h>
#include <caffe2/proto/caffe2_pb.h>

namespace caffe2 {

/**
 * @brief A templated class to allow one to wrap a CPU operator as an IDEEP
 * operator.
 *
 * This class can be used when one does not have the IDEEP implementation ready
 * yet for an operator. Essentially, what this op does is to automatically
 * deal with data copy for you. Plausibly, this causes a lot of overhead and
 * is not optimal, so you should use this operator mostly for quick prototyping
 * purpose.
 *
 * All the input and output of the original operator should be TensorCPU.
 *
 * Example usage: if you have a class MyMagicOp that is CPU based, and you use
 * the registration code
 *     REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
 * to register the CPU side, you can create its corresponding IDEEP operator
 * (with performance hits of course) via
 *     REGISTER_IDEEP_OPERATOR(MyMagic,
 *                            IDEEPFallbackOp<MyMagicOp>);
 *
 * Advanced usage: if you want to have some specific outputs never copied, you
 * can use the SkipOutputCopy template argument to do that. For example, if
 * MyMagic produces two outputs and the first output is always going to live on
 * the CPU, you can do
 *     REGISTER_IDEEP_OPERATOR(MyMagic,
 *                            IDEEPFallbackOp<MyMagicOp, SkipIndices<0>>);
 */
template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
class IDEEPFallbackOp final : public IDEEPOperator {
 public:
  USE_IDEEP_DEF_ALIASES();
  USE_IDEEP_OPERATOR_FUNCTIONS();

  IDEEPFallbackOp(const OperatorDef& def, Workspace* ws)
      : IDEEPOperator(def, ws) {
    CAFFE_ENFORCE_EQ(def.device_option().device_type(), PROTO_IDEEP);
    base_def_.CopyFrom(def);
    // base_def_ runs on CPU, so we will set its device option to CPU.
    // Copy to allow random_seed to be correctly propagated.
    base_def_.mutable_device_option()->CopyFrom(def.device_option());
    base_def_.mutable_device_option()->set_device_type(PROTO_CPU);
    // Create output blobs in parent workspace,
    // then forward output blobs to local workspace.
    std::unordered_map<string, string> forwarded_output_blobs;
    for (const auto i : c10::irange(base_def_.output_size())) {
      // For in-place case, the in/output tensor for local_ws must be
      // re-created, instead of forwarding from current workspace.
      string parent_name(base_def_.output(i));
      if (!SkipOutputCopy::Contains(i)) {
        parent_name += "_cpu_output_blob_" + base_def_.type();
      }
      local_output_blobs_.push_back(ws->CreateBlob(parent_name));
      TORCH_CHECK_NOTNULL(local_output_blobs_.back());
      forwarded_output_blobs[base_def_.output(i)] = parent_name;
      output_inplace_.push_back(false);
      for (const string &input_name : base_def_.input()) {
        if (input_name == base_def_.output(i)) {
          output_inplace_[i] = true;
          break;
        }
      }
    }
    local_ws_.reset(new Workspace(ws, forwarded_output_blobs));
    // Set up the symbols for the local workspace.
    for (const string& name : base_def_.input()) {
      local_input_blobs_.push_back(local_ws_->CreateBlob(name));
      TORCH_CHECK_NOTNULL(local_input_blobs_.back());
    }
    input_share_.resize(local_input_blobs_.size(), false);
    base_op_.reset(new CPUOp(base_def_, local_ws_.get()));
  }

  bool RunOnDevice() override {
    for (const auto i : c10::irange(InputSize())) {
      if (InputIsType<itensor>(i)
          && (Input(i).has_scale()
            || Input(i).get_data_type() == idtype::f32)) {
        auto& input = Input(i);
        if (input_share_[i]) {
          local_input_blobs_[i]->Reset();
          input_share_[i] = false;
        }
        auto dtensor = BlobGetMutableTensor(local_input_blobs_[i], CPU);
        dtensor->Resize(input.get_dims());
        // If fallback from INT8, the public format of original input is nhwc.
        // While the required format is nchw, need to reorder to nchw.
        if (input.get_desc().is_nhwc()) {
          itensor temp_ten ({input.get_dims(), idtype::f32, iformat::nchw},
              dtensor->template mutable_data<float>());
          temp_ten.feed_from(input);
        } else if (!input.need_reorder()) {
          CAFFE_ENFORCE(!input.has_scale(),
              "Incorrect invocation of get_data_handle");
          dtensor->ShareExternalPointer(
              static_cast<float*>(input.get_data_handle()));
        } else {
          input.to_public(dtensor->template mutable_data<float>());
        }
      } else {
        VLOG(1) << "Input " << i << " is not ideep::tensor. Skipping copy.";
        if (OperatorBase::Inputs()[i]->GetRaw() != local_input_blobs_[i]->GetRaw()) {
          // Note(jiayq): This removes a const but conceptually
          // local_input_blobs will only be used as const blob input for the
          // base op so we are still fine.
          local_input_blobs_[i]->ShareExternal(
              const_cast<void *>(OperatorBase::Inputs()[i]->GetRaw()),
              OperatorBase::Inputs()[i]->meta());
        }
        input_share_[i] = true;
      }
    }

    // Some CPU ops inherited from OperatorBase directly might need this default
    // input argument '0' like 'PrefetchOperator'.
    if (!base_op_->Run(0)) {
      LOG(ERROR) << "Base op run failed in IDEEPFallbackOp. Def: "
                 << ProtoDebugString(this->debug_def());
      return false;
    }

    for (const auto i : c10::irange(OutputSize())) {
      if (SkipOutputCopy::Contains(i)) {
        VLOG(1) << "Copy output: index " << i << " skipped.";
        continue;
      }
      CAFFE_ENFORCE(
          BlobIsTensorType(*local_output_blobs_[i], CPU),
          "IDEEP fallback op currently does not support non-TensorCPU "
          "output type who needs copying.");
      const auto& src = local_output_blobs_[i]->template Get<TensorCPU>();
      auto src_dims = src.sizes().vec();
      if (src.template IsType<float>() && src.dim() != 0 && base_op_->type() != "Python") {
        Blob* dst = OperatorBase::OutputBlob(i);
        // The output tensor must be ideep tensor with public format.
        // If reusing ideep tensor with non-public format, the tensor buffer
        // will be interpreted incorrectly.
        if (!dst->template IsType<itensor>() ||
            !dst->template Get<itensor>().is_public_format()) {
          dst->Reset(new itensor());
        }

        itensor::dims dst_dims (src_dims.begin(), src_dims.end());
        auto dtensor = dst->template GetMutable<itensor>();
        if (dtensor->get_dims() != dst_dims) {
          dtensor->resize(dst_dims, idtype::f32);
        }
        if (output_inplace_[i]) {
          dtensor->feed_from(dst_dims, idtype::f32,
              const_cast<void*>(src.raw_data()));
        } else {
          CAFFE_ENFORCE(!dtensor->has_scale(),
              "Incorrect invocation of set_data_handle");
          dtensor->set_data_handle(const_cast<void *>(src.raw_data()));
        }
      } else {
        VLOG(2) << "Output " << base_def_.output(i) << " as CPUTensor";
        Blob* dst = OperatorBase::OutputBlob(i);
        if (output_inplace_[i]) {
          auto dtensor = BlobGetMutableTensor(dst, CPU);
          dtensor->CopyFrom(src);
        } else {
          dst->Reset(new Tensor(CPU));
          BlobSetTensor(dst, src.Alias());
        }
      }
    }
    return true;
  }

 protected:
  vector<Blob*> local_input_blobs_;
  vector<Blob*> local_output_blobs_;
  vector<bool> output_inplace_;
  vector<bool> input_share_;
  std::unique_ptr<CPUOp> base_op_;
  std::unique_ptr<Workspace> local_ws_;
  OperatorDef base_def_;
};

} // namespace caffe2