#include "caffe2/operators/gelu_op.h"
#include <algorithm>
#include <cmath>
#include <functional>
#include <numeric>
#include <string>
#include <vector>
#ifdef _MSC_VER
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif
#include <math.h>
#endif // _MSC_VER
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
template <>
template <typename T>
bool GeluFunctor<CPUContext>::
operator()(const int N, const T* X, T* Y, CPUContext* context) const {
  if (fast_gelu) {
    // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
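    // kAlpha = M_2_SQRTPI * M_SQRT1_2
    //        = (2 / sqrt(Pi)) * (1 / sqrt(2)) = sqrt(2 / Pi).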
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2;
    ConstEigenVectorArrayMap<T> X_arr(X, N);
    EigenVectorArrayMap<T> Y_arr(Y, N);
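    // Evaluate the whole approximation as one vectorized Eigen expression.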
    Y_arr = X_arr *
        (((X_arr + X_arr.cube() * gelu_utils::kFastCoeff) * kAlpha).tanh() +
         T(1)) *
        static_cast<T>(0.5);
  } else {
    // y = x * P(X <= x) where X ~ N(0, 1)
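    // P(X <= x) is the standard normal CDF, Phi(x) = 0.5 * (1 + erf(x / sqrt(2))).
    // math::CdfNorm writes Phi(x) into Y; math::Mul then forms x * Phi(x) in place.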
    math::CdfNorm<T, CPUContext>(N, X, Y, context);
    math::Mul<T, CPUContext>(N, X, Y, Y, context);
  }
  return true;
}

template <>
template <typename T>
bool GeluGradientFunctor<CPUContext>::Forward(
    const std::vector<int>& dY_dims,
    const std::vector<int>& /* X_dims */,
    const T* dY,
    const T* X,
    T* dX,
    CPUContext* context) const {
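  // Total element count: the product of the gradient's dimensions.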
  const int N = std::accumulate(
      // NOLINTNEXTLINE(modernize-use-transparent-functors)
      dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies<int>());
  ConstEigenVectorArrayMap<T> dY_arr(dY, N);
  ConstEigenVectorArrayMap<T> X_arr(X, N);
  EigenVectorArrayMap<T> dX_arr(dX, N);
  if (fast_gelu) {
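    // Gradient of the tanh approximation: with u = kAlpha * (x + c * x^3) and
    // c = kFastCoeff, y = 0.5 * x * (1 + tanh(u)), so
    //   dy/dx = 0.5 * (1 + tanh(u))
    //         + 0.5 * x * (1 - tanh(u)^2) * kAlpha * (1 + 3 * c * x^2).
    // tanh(u) is computed once into dX_arr and reused; kBeta = 3 * c * kAlpha
    // folds the constants of the inner derivative together.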
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2;
    constexpr T kBeta = kAlpha * gelu_utils::kFastCoeff * T(3);
    dX_arr = ((X_arr + X_arr.cube() * gelu_utils::kFastCoeff) * kAlpha).tanh();
    dX_arr =
        (T(1) + dX_arr +
         X_arr * (T(1) - dX_arr.square()) * (kBeta * X_arr.square() + kAlpha)) *
        dY_arr * static_cast<T>(0.5);
  } else {
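    // Exact gradient: y = x * Phi(x) gives dy/dx = Phi(x) + x * phi(x), where
    // phi(x) = exp(-x^2 / 2) / sqrt(2 * Pi) is the standard normal density.
    // Here kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5 = 1 / sqrt(2 * Pi).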
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2 * T(0.5);
    math::CdfNorm<T, CPUContext>(N, X, dX, context);
    dX_arr = (dX_arr +
              X_arr * (-X_arr.square() * static_cast<T>(0.5)).exp() * kAlpha) *
        dY_arr;
  }
  return true;
}

REGISTER_CPU_OPERATOR(Gelu, GeluOp<CPUContext>);
REGISTER_CPU_OPERATOR(GeluGradient, GeluGradientOp<CPUContext>);

namespace {

OpSchema::Cost CostInferenceForGelu(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
  struct OpSchema::Cost cost = PointwiseCostInference<2>(def, in);
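  // PointwiseCostInference<2> budgets two FLOPs per output element; Gelu has
  // no learned parameters, so no parameter bytes are read.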
  cost.params_bytes = 0;
  return cost;
}

} // namespace

// Input: X, output: Y
OPERATOR_SCHEMA(Gelu)
    .NumInputs(1)
    .NumOutputs(1)
    .Arg(
        "fast_gelu",
        "If true, use y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))).")
    .CostInferenceFunction(CostInferenceForGelu)
    .IdenticalTypeAndShape()
.SetDoc(R"DOC(
Relu takes one input data (Tensor) and produces one output data
(Tensor) where the rectified linear function, y = xP(X <= x) where X ~ N(0, 1),
is applied to the tensor elementwise.
)DOC")
.Input(0, "X", "1D input tensor")
.Output(0, "Y", "1D input tensor");
OPERATOR_SCHEMA(GeluGradient)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInput(1);

namespace {

class GetGeluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
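    // The gradient op consumes the output gradient GO(0) = dY and the forward
    // input I(0) = X, and produces the input gradient GI(0) = dX.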
    return SingleGradientDef(
        "GeluGradient",
        "",
        std::vector<std::string>{GO(0), I(0)},
        std::vector<std::string>{GI(0)});
  }
};

} // namespace

REGISTER_GRADIENT(Gelu, GetGeluGradient);

} // namespace caffe2

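// Expose the CPU implementation to the c10/PyTorch dispatcher as _caffe2::Gelu.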
C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    Gelu,
    "_caffe2::Gelu(Tensor input, bool fast_gelu = False) -> (Tensor output)",
    caffe2::GeluOp<caffe2::CPUContext>);