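// DNNLowP (quantized) implementation of the ChannelShuffle operator for the
// NCHW and NHWC storage orders.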
#include "caffe2/quantization/server/channel_shuffle_dnnlowp_op.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/transpose.h"
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {

template <typename T>
ChannelShuffleDNNLowPOp<T>::ChannelShuffleDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws),
      order_(StringToStorageOrder(
          this->template GetSingleArgument<std::string>("order", "NCHW"))),
      OP_SINGLE_ARG(int, "group", group_, 1) {
  CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN);
}

template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDevice() {
  return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW()
                                      : RunOnDeviceWithOrderNHWC();
}

template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  // Choose quantization params
  TensorQuantizationParams in_qparams =
      GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

  const auto& X = InputTensorCPU_(0);
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);

  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int G = group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  const int HxW = X.size_from_dim(2);
  const int stride = C * HxW;

  const T* X_data = X.template data<T>();
  T* Y_data = Y->template mutable_data<T>();
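  // Each image is viewed as a column-major (K * HxW) x G matrix: column g
  // holds the K channels of group g. Copying the HxW-row block at row j * HxW
  // of every column to output offset j * G * HxW interleaves the groups,
  // mapping input channel g * K + j to output channel j * G + g.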
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    ConstEigenMatrixMap<T> X_mat(X_data + i * stride, K * HxW, G);
    for (int j = 0; j < K; ++j) {
      EigenMatrixMap<T>(Y_data + i * stride + j * G * HxW, HxW, G) =
          X_mat.block(j * HxW, 0, HxW, G);
    }
  }
  // Even if there are pre-chosen quantization parameters for the output, they
  // are ignored because the channel shuffle output quantization should be the
  // same as the input's.
  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);
  return true;
}

template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  // Choose quantization params
  TensorQuantizationParams in_qparams =
      GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

  const auto& X = InputTensorCPU_(0);
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);

  const auto C = X.dim32(X.ndim() - 1);
  const auto G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0, "Channel count must be divisible by group");
  const auto K = C / G;
  std::array<int, 2> dims = {G, K};
  std::array<int, 2> axes = {1, 0};

  const T* X_data = X.template data<T>();
  T* Y_data = Y->template mutable_data<T>();
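  // In NHWC the C channels of each pixel are contiguous, so shuffling is a
  // G x K -> K x G transpose of each length-C block. With G == 4, uint8 data,
  // and AVX2 available, use fbgemm's specialized 4-row transpose; otherwise
  // fall back to the generic math::Transpose.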
  if (G == 4 && std::is_same<T, std::uint8_t>::value && GetCpuId().avx2()) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (std::int64_t i = 0; i < X.numel(); i += C) {
      // Transpose each C = G x K matrix
      fbgemm::transpose_4rows(
          K,
          reinterpret_cast<const std::uint8_t*>(X_data + i),
          reinterpret_cast<std::uint8_t*>(Y_data + i));
    }
  } else {
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (std::int64_t i = 0; i < X.numel(); i += C) {
      // Transpose each C = G x K matrix
      math::Transpose(
          2, dims.data(), axes.data(), X_data + i, Y_data + i, &context_);
    }
  }
  // Even if there are pre-chosen quantization parameters for the output, they
  // are ignored because the channel shuffle output quantization should be the
  // same as the input's.
  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);
  return true;
}
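
// The DNNLOWP engine handles both the fp32-named ChannelShuffle operator and
// the explicitly quantized Int8ChannelShuffle; a 16-bit variant is registered
// under DNNLOWP_16.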
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    ChannelShuffle,
    DNNLOWP,
    ChannelShuffleDNNLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ChannelShuffle,
    DNNLOWP,
    ChannelShuffleDNNLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    ChannelShuffle,
    DNNLOWP_16,
    ChannelShuffleDNNLowPOp<uint16_t>);

} // namespace caffe2