File: tensorrt_op_trt.cc

Package: pytorch 1.13.1+dfsg-4 (Debian bookworm)
#include "caffe2/contrib/tensorrt/tensorrt_op_trt.h"

#include <c10/util/accumulate.h>
#include "caffe2/contrib/tensorrt/tensorrt_tranformer.h"
#include "caffe2/core/logging.h"
#include "onnx/onnx_pb.h"

#include <algorithm>
#include <numeric>
#include <unordered_map>
#include <unordered_set>

namespace caffe2 {

namespace {
// Note that a TRT tensor's dims are CHW only, while our C2 tensor is NCHW;
// the batch dimension is implicit on the TRT side.
// Throws if there is a dimension mismatch between the C2 tensor and the TRT
// tensor. Otherwise, returns the product of the CHW dimensions.
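// For example, nv_dims = {3, 224, 224} matches a C2 tensor of shape
// {N, 3, 224, 224} for any batch size N, and the function returns
// 3 * 224 * 224 = 150528.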
int64_t CheckDims(
    const nvinfer1::Dims& nv_dims,
    at::ArrayRef<int64_t> c2_dims) {
  if (static_cast<size_t>(nv_dims.nbDims) + 1 != c2_dims.size()) {
    CAFFE_THROW(
        "Mismatched dimensions between TRT input (",
        nv_dims.nbDims + 1,
        ") and C2 input (",
        c2_dims.size(),
        ")");
  }
  int64_t chw = 1;
  for (int i = 0; i < nv_dims.nbDims; ++i) {
    if (nv_dims.d[i] != c2_dims[i + 1]) {
      CAFFE_THROW(
          "Mismatched value at dimension ",
          i,
          "  between TRT input (",
          nv_dims.d[i],
          ") and C2 input (",
          c2_dims[i + 1],
          ")");
    }
    chw *= nv_dims.d[i];
  }
  return chw;
}

} // namespace

// Upon construction, we build the inference engine by deserializing it from a
// protobuf string. Since we know the input/output blobs, we can also do the
// binding here.
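// If no serialized engine is supplied via the "backend_buffer" argument, we
// instead build an engine on the fly from the ONNX model embedded in the
// "onnx_model" argument.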
TensorRTOp::TensorRTOp(const OperatorDef& operator_def, Workspace* ws)
    : Operator<CUDAContext>(operator_def, ws),
      logger_(static_cast<nvinfer1::ILogger::Severity>(
          OperatorBase::GetSingleArgument<int>(
              "log_verbosity",
              FLAGS_caffe2_log_level))),
      max_batch_size_(
          OperatorBase::GetSingleArgument<int>("max_batch_size", 1)) {
  {
    auto engine_string =
        OperatorBase::GetSingleArgument<std::string>("backend_buffer", "");
    if (!engine_string.empty()) {
      auto trt_runtime =
          tensorrt::TrtObject(nvinfer1::createInferRuntime(logger_));
      // TODO(support trt plugin factory)
      trt_engine_ = tensorrt::TrtObject(trt_runtime->deserializeCudaEngine(
          engine_string.data(), engine_string.size(), nullptr));
    } else {
      auto onnx_model_str =
          OperatorBase::GetSingleArgument<std::string>("onnx_model", "");
      CAFFE_ENFORCE(!onnx_model_str.empty(), "onnx_model cannot be empty");
      auto debug_builder =
          OperatorBase::GetSingleArgument<int>("debug_builder", 0);
      auto max_workspace_size = OperatorBase::GetSingleArgument<int>(
          "max_workspace_size", 1024 * 1024 * 2);

      // Pull the weights from the workspace and assemble them back into the
      // onnx model; note that since we may have rewritten the net, we need to
      // map the weight names
      auto initializers =
          OperatorBase::GetRepeatedArgument<std::string>("initializers");
      CAFFE_ENFORCE_EQ(
          initializers.size() % 2, 0, "initializers should come in pairs");
      std::unordered_set<std::string> initializer_set;
      std::unordered_map<std::string, std::string> input_mapping;
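      // `initializers` is a flat list of name pairs; each iteration consumes
      // two entries (one via `it++` in the body, one via `++it` in the loop
      // header), mapping the first name of each pair to the second.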
      for (auto it = initializers.begin(); it != initializers.end(); ++it) {
        auto key = *it++;
        input_mapping.emplace(key, *it);
        initializer_set.emplace(key);
      }
      Workspace mapped_ws(ws, input_mapping);
      ::ONNX_NAMESPACE::ModelProto onnx_model;
      ParseProtoFromLargeString(onnx_model_str, &onnx_model);
      BuildInitializationList(&mapped_ws, onnx_model.mutable_graph(), &initializer_set);
      onnx_model_str.clear();
      onnx_model.SerializeToString(&onnx_model_str);

      // Build the trt engine
      trt_engine_ = tensorrt::BuildTrtEngine(
          onnx_model_str,
          &logger_,
          max_batch_size_,
          max_workspace_size,
          debug_builder);
    }
  }

  CAFFE_ENFORCE(trt_engine_, "Cannot build TensorRT engine!");

  // match and bind the input/output
  const int num_bindings = trt_engine_->getNbBindings();
  int output_idx = 0;
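  // Bindings are enumerated in the engine's own order, with inputs and
  // outputs interleaved. We record each binding's dims and role here so that
  // RunOnDevice can fill the bindings array in the same order.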
  for (int b = 0; b < num_bindings; ++b) {
    nv_dims_.push_back(trt_engine_->getBindingDimensions(b));
    bool is_input = trt_engine_->bindingIsInput(b);
    is_input_.push_back(is_input);
    if (!is_input) {
      // For output, we try to get its output size hint
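      // (e.g., a repeated-int argument "output_size_hint_0" supplies a shape
      // hint for output 0)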
      const std::string key = c10::str("output_size_hint_", output_idx);
      auto output_size_hint = OperatorBase::GetRepeatedArgument<int>(key);
      if (!output_size_hint.empty()) {
        std::vector<int64_t> dims;
        for (const auto v : output_size_hint) {
          dims.push_back(v);
        }
        output_size_hints_.emplace(output_idx, std::move(dims));
      }
      ++output_idx;
    }
  }

  trt_executor_ = tensorrt::TrtObject(trt_engine_->createExecutionContext());
}

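// If the user supplied an output_size_hint for this output index, overwrite
// the TRT-derived shape with the hinted one. The total number of elements
// must match; only the shape is reinterpreted.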
void TensorRTOp::MaybeAdjustOutputShape(
    int output_idx,
    std::vector<int64_t>* dims) {
  const auto it = output_size_hints_.find(output_idx);
  if (it != output_size_hints_.end()) {
    const auto& dims_hint = it->second;
    const auto total_trt = c10::multiply_integers(*dims);
    const auto total_c2 = c10::multiply_integers(dims_hint);
    CAFFE_ENFORCE_EQ(
        total_trt,
        total_c2,
        "The total size of TensorRT op output and hint don't match: ",
        total_trt,
        " vs ",
        total_c2);

    // We conform to the output shape hints. NB: We might need an explicit
    // reshape op for this
    *dims = dims_hint;
  }
}

bool TensorRTOp::RunOnDevice() {
  CAFFE_ENFORCE(trt_executor_);
  // Decide input batch size
  size_t N = 0;
  for (int i = 0; i < InputSize(); ++i) {
    const auto& input_tensor = Input(i);
    const auto tensor_dims = input_tensor.sizes();
    CAFFE_ENFORCE(!tensor_dims.empty(), "Input tensor cannot be empty");
    if (i == 0) {
      N = tensor_dims.front();
    } else {
      CAFFE_ENFORCE_EQ(
          N, tensor_dims.front(), "Mismatched batch size in input tensors");
    }
  }
  if (N > static_cast<size_t>(max_batch_size_) && !batch_warning_issued_) {
    LOG(WARNING) << "Batch size (" << N << ") is larger than max_batch_size ("
                 << max_batch_size_ << ") optimized for TensorRT operator. "
                 << "Performance may be sub-optimal.";
    batch_warning_issued_ = true;
  }

  // We need to do the binding at RunOnDevice time because we only know the
  // exact shapes of the tensors now. In addition, since the TensorRT engine
  // has a max_batch_size, we need to run it multiple times if the input batch
  // size exceeds this limit.
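  // For example, with N = 10 and max_batch_size_ = 4, the engine is executed
  // three times, with batch sizes 4, 4, and 2.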
  CAFFE_ENFORCE_EQ(is_input_.size(), nv_dims_.size());
  std::vector<void*> bindings;
  bindings.reserve(is_input_.size());
  auto batch_size = max_batch_size_;
  for (size_t offset = 0; offset < N; offset += batch_size) {
    bindings.clear();
    batch_size = std::min<size_t>(N - offset, max_batch_size_);
    VLOG(2) << "Offset: " << offset << ", batch_size: " << batch_size
            << ", N: " << N;
    int input_idx = 0;
    int output_idx = 0;
    for (size_t i = 0; i < is_input_.size(); ++i) {
      const auto& dims = nv_dims_[i];
      if (is_input_[i]) {
        // input, check input dimensions
        const auto& input_tensor = Input(input_idx++);
        const float* input_data = input_tensor.data<float>();
        const auto tensor_dims = input_tensor.sizes();
        auto chw = CheckDims(dims, tensor_dims);
        bindings.push_back((void*)(input_data + offset * chw));
      } else {
        // output, we need to allocate the output tensor at first batch run
        auto* output_tensor = Output(output_idx);
        std::vector<int64_t> tensor_dims;
        tensor_dims.push_back(N);
        int64_t chw = 1;
        for (int d = 0; d < dims.nbDims; ++d) {
          tensor_dims.push_back(dims.d[d]);
          chw *= dims.d[d];
        }

        if (offset == 0) {
          MaybeAdjustOutputShape(output_idx, &tensor_dims);
          output_tensor->Resize(tensor_dims);
        }
        ++output_idx;
        float* output_data = output_tensor->mutable_data<float>();
        bindings.push_back((void*)(output_data + offset * chw));
      }
    }

    CAFFE_ENFORCE_EQ(bindings.size(), InputSize() + OutputSize());
    if (!trt_executor_->execute(batch_size, bindings.data())) {
      CAFFE_THROW("Error running the TensorRT executor");
    }
  }
  return true;
}

OPERATOR_SCHEMA(TensorRT)
    .NumInputs(0, INT_MAX)
    .NumOutputs(0, INT_MAX)
    .SetDoc(R"DOC(
The TensorRT operator is a black-box operator that wraps a prebuilt, serialized
TensorRT engine. It takes the inputs, runs the TensorRT inference engine, and
produces the outputs.

This is a GPU-only operator.
)DOC")
    .Arg(
        "log_verbosity",
        "(int default 0) verbosity of the TensorRt engine log.")
    .Arg(
        "backend_buffer",
        "(string default=\"\" blob for serialized TensorRT engine."
        "Note that serialized engine is not compatible across platform and "
        "different TensorRT version.")
    .Arg(
        "max_batch_size",
        "(int default 0) Batch size set by the TensorRT engine builder."
        "It must be no larger than the max_batch_size of the engine builder so "
        "it is better not to edit this manually.");

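// A minimal usage sketch (illustrative, not part of this file), assuming
// `engine_bytes` holds a serialized TensorRT engine and `ws` is a
// caffe2::Workspace whose input blob "X" already lives on the GPU:
//
//   caffe2::OperatorDef def;
//   def.set_type("TensorRT");
//   def.add_input("X");
//   def.add_output("Y");
//   auto* buf = def.add_arg();
//   buf->set_name("backend_buffer");
//   buf->set_s(engine_bytes);
//   auto* bs = def.add_arg();
//   bs->set_name("max_batch_size");
//   bs->set_i(32);  // should match the batch size the engine was built with
//   ws.RunOperatorOnce(def);
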
REGISTER_CUDA_OPERATOR(TensorRT, TensorRTOp);
} // namespace caffe2