File: decay_adagrad_op.h

package info (click to toggle)

pytorch 1.13.1%2Bdfsg-4

links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44

file content (95 lines) | stat: -rw-r--r-- 3,215 bytes

#pragma once

#include "caffe2/core/operator.h"
#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {

template <typename Context>
void decay_adagrad_compute(
    int N,
    const float* w,
    const float* g,
    const float* m,
    const float* v,
    float* nw,
    float* nm,
    float* nv,
    float beta1,
    float beta2,
    float eps_hat,
    float weight_decay,
    float c,
    const float* lr,
    Context* /*context*/) {
    ConstEigenVectorArrayMap<float> w_arr(w, N);
    ConstEigenVectorArrayMap<float> g_arr(g, N);
    ConstEigenVectorArrayMap<float> m_arr(m, N);
    ConstEigenVectorArrayMap<float> v_arr(v, N);
    EigenVectorArrayMap<float> nw_arr(nw, N);
    EigenVectorArrayMap<float> nm_arr(nm, N);
    EigenVectorArrayMap<float> nv_arr(nv, N);
    nm_arr = m_arr * beta1 + g_arr * (1.0f - beta1);
    nv_arr = v_arr + g_arr.square();
    nw_arr = w_arr + *lr * (nm_arr / c / (nv_arr.sqrt() + eps_hat) + weight_decay * w_arr);
}

template <typename T, class Context>
class DecayAdagradOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  DecayAdagradOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        beta1_(this->template GetSingleArgument<float>("beta1", 0.9f)),
        beta2_(this->template GetSingleArgument<float>("beta2", 0.999f)),
        epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5f)),
        weight_decay_(this->template GetSingleArgument<float>("weight_decay", 0.0f)),
        bias_correction_first_(this->template GetSingleArgument<bool>("bias_correction_first", true)) {}

  bool RunOnDevice() override {
    // Iter live on the CPU
    CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
    CAFFE_ENFORCE(Input(LR).numel() == 1);
    CAFFE_ENFORCE(Input(GRAD).numel() == Input(PARAM).numel());
    CAFFE_ENFORCE(Input(GRAD).numel() == Input(MOMENT_1).numel());
    CAFFE_ENFORCE(Input(GRAD).numel() == Input(MOMENT_2).numel());
    Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
    Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
    Output(OUTPUT_MOMENT_2)->ResizeLike(Input(MOMENT_2));

    const auto iter =
        OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];

    const auto t = iter + 1;
    const auto c = (bias_correction_first_)? (T(1.) - std::pow(beta1_, t)) : 1.0;
    decay_adagrad_compute<Context>(
        Input(GRAD).numel(),
        Input(PARAM).template data<T>(),
        Input(GRAD).template data<T>(),
        Input(MOMENT_1).template data<T>(),
        Input(MOMENT_2).template data<T>(),
        Output(OUTPUT_PARAM)->template mutable_data<T>(),
        Output(OUTPUT_MOMENT_1)->template mutable_data<T>(),
        Output(OUTPUT_MOMENT_2)->template mutable_data<T>(),
        beta1_,
        beta2_,
        epsilon_,
        weight_decay_,
        c,
        Input(LR).template data<T>(),
        &context_);

    return true;
  }

 protected:
  T beta1_{0.9};
  T beta2_{0.999};
  T epsilon_{1e-8};
  T weight_decay_{0.0};
  bool bias_correction_first_{true};
  INPUT_TAGS(PARAM, MOMENT_1, MOMENT_2, GRAD, LR, ITER);
  OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, OUTPUT_MOMENT_2);
};

} // namespace caffe2