File: nnpack_ops.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (352 lines) | stat: -rw-r--r-- 12,656 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#include "caffe2/core/common.h"

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/leaky_relu_op.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/math.h"
#include "nnpack.h"

// Thread count passed to pthreadpool_create() when the shared NNPACK thread
// pool is first built (see nnpack_threadpool() below).
C10_DEFINE_int(
    caffe2_nnpack_num_threads,
    1,
    "The number of nnpack pthreadpool threads.");
// When true and CAFFE2_USE_MKL is defined, the pool size is taken from
// mkl_get_max_threads() instead of caffe2_nnpack_num_threads. Without MKL the
// flag has no effect beyond a VLOG(1) message.
C10_DEFINE_bool(
    caffe2_nnpack_use_mkl_num_threads,
    true,
    "If MKL is built, this sets nnpack to use the same number of threads as "
    "MKL does. This overrides caffe2_nnpack_num_threads if set.");

namespace caffe2 {
////////////////////////////////////////////////////////////////////////////////
// Helper Functions
////////////////////////////////////////////////////////////////////////////////

namespace {

// Reports whether NNPACK is usable in this process, i.e. whether
// nnp_initialize() returns nnp_status_success.
bool has_nnpack() {
  // Repeated initialization is harmless: every call after the first one is a
  // no-op, so the result does not need to be cached here.
  return nnp_initialize() == nnp_status_success;
}

// Translates the textual "algo" operator argument into the matching NNPACK
// convolution algorithm. Recognised names are "AUTO", "WINOGRAD", "FT16" and
// "FT8" (case-sensitive); any other string selects
// nnp_convolution_algorithm_auto.
nnp_convolution_algorithm get_nnp_convolution_algorithm(
    const std::string& algo) {
  struct NamedAlgorithm {
    const char* name;
    nnp_convolution_algorithm value;
  };
  static const NamedAlgorithm kAlgorithms[] = {
      {"AUTO", nnp_convolution_algorithm_auto},
      {"WINOGRAD", nnp_convolution_algorithm_wt8x8},
      {"FT16", nnp_convolution_algorithm_ft16x16},
      {"FT8", nnp_convolution_algorithm_ft8x8},
  };
  for (const auto& candidate : kAlgorithms) {
    if (algo == candidate.name) {
      return candidate.value;
    }
  }
  // Unknown names fall back to letting NNPACK choose.
  return nnp_convolution_algorithm_auto;
}

// Translates the textual "kts" operator argument into an NNPACK kernel
// transform strategy: "TUPLE" selects the tuple-based strategy, while "BLOCK"
// and every unrecognised string select the block-based one.
nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy(
    const std::string& kts) {
  // "BLOCK" and the fallback share a result, so only "TUPLE" needs testing.
  return kts == "TUPLE" ? nnp_convolution_transform_strategy_tuple_based
                        : nnp_convolution_transform_strategy_block_based;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Pool
////////////////////////////////////////////////////////////////////////////////

// Returns the pthreadpool shared by every NNPACK operator in this file,
// creating it on the first call.
//
// The pool is built inside the initializer of a function-local static, which
// the language guarantees to run once even if several operators reach this
// function concurrently. If NNPACK fails to initialize, CAFFE_ENFORCE throws
// out of the initializer; the static then stays uninitialized and the next
// call tries again.
//
// Pool size: FLAGS_caffe2_nnpack_num_threads, unless
// FLAGS_caffe2_nnpack_use_mkl_num_threads is set and the build defines
// CAFFE2_USE_MKL, in which case mkl_get_max_threads() is used instead.
//
// NOTE(review): whatever pthreadpool_create() returns is cached, including a
// null handle. NNPACK's header documents a NULL threadpool as "run on the
// caller thread" -- confirm against the NNPACK version in use.
pthreadpool_t nnpack_threadpool() {
  static const pthreadpool_t threadpool = []() -> pthreadpool_t {
    const enum nnp_status nnpack_status = nnp_initialize();
    CAFFE_ENFORCE(
        nnpack_status == nnp_status_success, "NNPack is not supported here!");
    int num_threads = FLAGS_caffe2_nnpack_num_threads;
    if (FLAGS_caffe2_nnpack_use_mkl_num_threads) {
#ifdef CAFFE2_USE_MKL
      num_threads = mkl_get_max_threads();
#else
      VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this "
                 "Caffe2 is not built with MKL. Skipping.";
#endif
    }
    return pthreadpool_create(num_threads);
  }();
  return threadpool;
}
}

////////////////////////////////////////////////////////////////////////////////
// NNPACK Ops
////////////////////////////////////////////////////////////////////////////////

// Caffe2 "Conv" operator for the NNPACK engine: runs a float, NCHW, 2-D
// convolution through NNPACK.
//
// Inputs:  0 = X (4-D, N x C x H x W)
//          1 = filter (4-D, M x C/group x kernel_h x kernel_w)
//          2 = bias (M elements)
// Output:  0 = Y, sized by ConvPoolOpBase::SetOutputSize with M channels.
// Args:    "algo" (AUTO | WINOGRAD | FT16 | FT8, default "AUTO") and
//          "kts" (BLOCK | TUPLE, default "TUPLE").
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  // Reads the "algo" / "kts" arguments and rejects configurations this
  // implementation cannot handle (non-NCHW order, dilation != 1, or NNPACK
  // failing to initialize on this machine).
  NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws),
        algo_(get_nnp_convolution_algorithm(
            OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
        kts_(get_nnp_convolution_transform_strategy(
            OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The NNPack convolution does not support dilation yet.");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Validates the inputs, sizes Y, and dispatches to NNPACK once per group
  // (and, for grouped batches, once per image). Returns true on success; a
  // failed check or a non-success NNPACK status is raised via CAFFE_ENFORCE.
  // An empty batch (N == 0) makes no NNPACK call and returns true.
  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    auto& bias = Input(2);
    auto* Y = Output(0);

    // Check the ranks first so the dim32() reads below are known to be valid.
    // The filter rank is compared explicitly: using the rank itself as the
    // condition would accept any non-zero rank.
    CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
    CAFFE_ENFORCE(filter.dim() == 4, "Filter dim should be 4");

    const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
    const int M = filter.dim32(0);
    const int G = this->group_;

    CAFFE_ENFORCE(C % G == 0, "Input channels must be divisible by group");
    CAFFE_ENFORCE(M % G == 0, "Output channels must be divisible by group");
    CAFFE_ENFORCE(
        filter.dim32(1) == C / G,
        "Filter channels must equal input channels / group");
    CAFFE_ENFORCE(
        filter.dim32(2) == this->kernel_h(),
        "Filter height must match kernel_h");
    CAFFE_ENFORCE(
        filter.dim32(3) == this->kernel_w(),
        "Filter width must match kernel_w");
    CAFFE_ENFORCE(
        bias.numel() == M, "Bias must have one element per output channel");

    ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, M);
    const int oH = Y->dim32(2), oW = Y->dim32(3);

    if (N > 1) {
      // The batched call below has no subsampling parameter.
      CAFFE_ENFORCE_EQ(
          this->stride_h(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
      CAFFE_ENFORCE_EQ(
          this->stride_w(),
          1,
          "NNPack only supports stride = 1 when doing batch feedforward");
    }

    const nnp_size input_size = {.width = static_cast<size_t>(W),
                                 .height = static_cast<size_t>(H)};
    // filter is MCHW
    const nnp_size kernel_size = {
        .width = static_cast<size_t>(filter.dim32(3)),
        .height = static_cast<size_t>(filter.dim32(2))};
    const nnp_padding padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size output_subsample = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};

    // Per-group sizes; element offsets are computed in size_t so large
    // tensors do not overflow int arithmetic.
    const size_t group_in_channels = static_cast<size_t>(C / G);
    const size_t group_out_channels = static_cast<size_t>(M / G);
    const size_t input_group_stride = group_in_channels * H * W;
    const size_t output_group_stride = group_out_channels * oH * oW;
    const size_t filter_group_stride = static_cast<size_t>(filter.numel() / G);
    const size_t bias_group_stride = static_cast<size_t>(bias.numel() / G);

    const float* X_data = X.template data<float>();
    const float* filter_data = filter.template data<float>();
    const float* bias_data = bias.template data<float>();
    float* Y_data = Y->template mutable_data<float>();

    if (N == 1) {
      VLOG(1) << "Running inference mode";
      for (int g = 0; g < G; ++g) {
        const auto status = nnp_convolution_inference(
            algo_,
            kts_,
            group_in_channels,
            group_out_channels,
            input_size,
            padding,
            kernel_size,
            output_subsample,
            X_data + g * input_group_stride,
            filter_data + g * filter_group_stride,
            bias_data + g * bias_group_stride,
            Y_data + g * output_group_stride,
            nnpack_threadpool(),
            nullptr);
        CAFFE_ENFORCE(
            nnp_status_success == status, "nnp_convolution_inference failed");
      }
    } else {
      VLOG(1) << "Running batched mode";
      // In NCHW memory the channels of one group are contiguous only within
      // a single image. Without groups the whole batch is one contiguous
      // [N][C][H][W] block and is handed over in a single call; with groups
      // each image is processed separately so every call sees a contiguous
      // [1][C/G][H][W] slice.
      const int images_per_call = (G == 1) ? N : 1;
      const size_t input_image_stride = static_cast<size_t>(C) * H * W;
      const size_t output_image_stride = static_cast<size_t>(M) * oH * oW;
      for (int n = 0; n < N; n += images_per_call) {
        for (int g = 0; g < G; ++g) {
          const auto status = nnp_convolution_output(
              algo_,
              static_cast<size_t>(images_per_call),
              group_in_channels,
              group_out_channels,
              input_size,
              padding,
              kernel_size,
              X_data + n * input_image_stride + g * input_group_stride,
              filter_data + g * filter_group_stride,
              bias_data + g * bias_group_stride,
              Y_data + n * output_image_stride + g * output_group_stride,
              nnpack_threadpool(),
              nullptr);
          CAFFE_ENFORCE(
              nnp_status_success == status, "nnp_convolution_output failed");
        }
      }
    }
    return true;
  }

 private:
  // NNPACK algorithm selected by the "algo" argument.
  const nnp_convolution_algorithm algo_;
  // Kernel transform strategy selected by the "kts" argument; only passed to
  // the single-image (inference) call.
  const nnp_convolution_transform_strategy kts_;
};

// Caffe2 "MaxPool" operator for the NNPACK engine: float, NCHW max pooling
// restricted to a 2x2 kernel with stride 2 and no padding.
//
// Input:  0 = X (4-D, N x C x H x W)
// Output: 0 = Y, sized by ConvPoolOpBase::SetOutputSize with C channels.
class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
 public:
  // Rejects every configuration other than NCHW / kernel 2x2 / stride 2x2 /
  // zero padding, and machines on which NNPACK fails to initialize.
  NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "NNPack only supports NCHW order. Please consider adding "
        "TransposeOp with axes=[0, 3, 1, 2] before NNPack MaxPool.");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_t() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_l() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_r() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    OPERATOR_NEEDS_FEATURE(
        this->pad_b() == 0,
        "NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Sizes Y and pools the whole batch with one nnp_max_pooling_output call.
  // Returns true on success; a non-4-D input or a non-success NNPACK status
  // is raised via CAFFE_ENFORCE.
  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
    ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));

    // Input X is in NCHW order
    const size_t batch_size = X.dim32(0);
    const size_t input_channels = X.dim32(1);
    const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
                                 .height = static_cast<size_t>(X.dim32(2))};
    const nnp_size pooling_size = {
        .width = static_cast<size_t>(this->kernel_w()),
        .height = static_cast<size_t>(this->kernel_h())};
    const nnp_padding padding = {
        .top = static_cast<size_t>(this->pad_t()),
        .right = static_cast<size_t>(this->pad_r()),
        .bottom = static_cast<size_t>(this->pad_b()),
        .left = static_cast<size_t>(this->pad_l())};
    const nnp_size pooling_stride = {
        .width = static_cast<size_t>(this->stride_w()),
        .height = static_cast<size_t>(this->stride_h())};

    const auto status = nnp_max_pooling_output(
        batch_size,
        input_channels,
        input_size,
        padding,
        pooling_size,
        pooling_stride,
        X.template data<float>(),
        Y->template mutable_data<float>(),
        nnpack_threadpool());
    CAFFE_ENFORCE(
        nnp_status_success == status, "nnp_max_pooling_output failed");
    return true;
  }
};

// Caffe2 "Relu" operator for the NNPACK engine: applies ReLU element-wise to
// a float tensor via nnp_relu_output with a negative slope of 0.
//
// Input:  0 = X (any shape, float)
// Output: 0 = Y, resized to X's shape.
class NNPACKReluOp final : public Operator<CPUContext> {
 public:
  NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Returns true on success; a non-success NNPACK status is raised via
  // CAFFE_ENFORCE.
  bool RunOnDevice() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    // NNPACK writes X.numel() floats into Y, so Y must have X's shape before
    // its buffer is requested; otherwise a fresh (non in-place) output would
    // be too small.
    Y->ResizeLike(X);
    // The tensor is treated as a single batch entry with numel() channels.
    const auto status = nnp_relu_output(
        1,
        X.numel(),
        X.template data<float>(),
        Y->template mutable_data<float>(),
        0.0f,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == status, "nnp_relu_output failed");
    return true;
  }
};

// Caffe2 "LeakyRelu" operator for the NNPACK engine: applies leaky ReLU
// element-wise to a float tensor via nnp_relu_output, using alpha_ as the
// negative slope.
//
// Input:  0 = X (any shape, float)
// Output: 0 = Y, resized to X's shape.
class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> {
 public:
  NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
      : LeakyReluOp<float, CPUContext>(operator_def, ws) {
    // NNPACK can be built with avx2 support only and might not be able to run
    // on a given machine.
    OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
  }

  // Returns true on success; a non-success NNPACK status is raised via
  // CAFFE_ENFORCE.
  bool RunOnDevice() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    // NNPACK writes X.numel() floats into Y, so Y must have X's shape before
    // its buffer is requested; otherwise a fresh (non in-place) output would
    // be too small.
    Y->ResizeLike(X);
    // alpha_ is not declared in this file; presumably it is the slope member
    // inherited from LeakyReluOp -- verify against leaky_relu_op.h.
    const auto status = nnp_relu_output(
        1,
        X.numel(),
        X.template data<float>(),
        Y->template mutable_data<float>(),
        alpha_,
        nnpack_threadpool());
    CAFFE_ENFORCE(nnp_status_success == status, "nnp_relu_output failed");
    return true;
  }
};

// Register the classes above as the "NNPACK" engine implementations of the
// Conv, MaxPool, Relu and LeakyRelu CPU operators.
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp);

} // namespace caffe2