#pragma once

#include "caffe2/quantization/server/fully_connected_dnnlowp_op.h"

namespace caffe2 {

/**
 * Quantized FC operator with 16-bit accumulation.
 * Accumulating in 16 bits can saturate, but it is faster on Intel CPUs
 * than 32-bit accumulation.
 */
class FullyConnectedDNNLowPAcc16Op final
    : public FullyConnectedDNNLowPOp<std::uint8_t> {
 public:
  FullyConnectedDNNLowPAcc16Op(const OperatorDef& operator_def, Workspace* ws);
  bool RunOnDevice() override;

  USE_OPERATOR_FUNCTIONS(CPUContext);
  using BaseType = FullyConnectedDNNLowPOp<std::uint8_t>;

  using BaseType::dequantize_output_;
  using BaseType::in_qparams_;
  using BaseType::InputTensorCPU_;
  using BaseType::out_qparams_;
  using BaseType::OutputTensorCPU_;
  using BaseType::W_quantized_;

 private:
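  // Quantized weight matrix pre-packed for fbgemm GEMM kernels that
  // accumulate into 16-bit integers (int8 weights, int16 accumulation).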
  std::shared_ptr<fbgemm::PackBMatrix<std::int8_t, std::int16_t>>
      Wq_acc16_packed_;

  // Wq outlier in CSC format
  std::shared_ptr<fbgemm::CompressedSparseColumn> Wq_outlier_;
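  // Weight values that fit in nbits_in_non_outlier_ bits go through the
  // dense 16-bit-accumulation path; the remaining "outlier" values are kept
  // in Wq_outlier_ and handled separately with 32-bit accumulation.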
  int nbits_in_non_outlier_;
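  // How often 16-bit partial sums are copied into 32-bit accumulators to
  // limit saturation (a performance/accuracy knob of the acc16 kernels).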
  int copy_to_32bit_frequency_;
}; // class FullyConnectedDNNLowPAcc16Op
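
// Typically this operator is registered in the corresponding .cc file for
// the FC op under the DNNLOWP_ACC16 engine, e.g. (assumed registration, not
// part of this header):
//   REGISTER_CPU_OPERATOR_WITH_ENGINE(
//       FC, DNNLOWP_ACC16, FullyConnectedDNNLowPAcc16Op);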

} // namespace caffe2