File: adagrad_op.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (574 lines) | stat: -rw-r--r-- 18,035 bytes
#pragma once

#include "caffe2/core/operator.h"
#include "caffe2/perfkernels/adagrad.h"
#if defined(USE_FBGEMM) && !defined(__NVCC__)
#include "fbgemm/FbgemmEmbedding.h"
#endif

namespace caffe2 {

// Context-tagged convenience overload of adagrad_update.
//
// Reads the scalar learning rate from lr[0] and forwards every other argument
// unchanged to the adagrad_update overload that takes the learning rate by
// value (declared outside this block; presumably in
// caffe2/perfkernels/adagrad.h -- confirm). The Context pointer is accepted
// for signature uniformity with the other helpers in this file and is never
// used here.
template <typename Context>
void adagrad_update(
    int N,
    const float* w,
    const float* g,
    const float* h,
    float* nw,
    float* nh,
    float epsilon,
    float decay,
    const float* lr,
    Context* /*context*/,
    float weight_decay = 0.f) {
  const float learning_rate = lr[0];
  adagrad_update(
      N, w, g, h, nw, nh, epsilon, decay, learning_rate, weight_decay);
}

// Element-wise Adagrad step that additionally reports the per-element
// effective learning rate.
//
// For every k in [0, N):
//   g                 = weight_decay * paramIn[k] + gradIn[k]
//   momentOut[k]      = decay * momentIn[k] + g * g
//   effectiveLROut[k] = lr[0] / (sqrt(momentOut[k]) + epsilon)
//   paramOut[k]       = paramIn[k] + effectiveLROut[k] * g
//
// The step is added to the parameter, not subtracted.
// NOTE(review): callers presumably pass a negative learning rate -- confirm.
// The Context pointer is unused.
template <typename Context>
void adagrad_update_output_effective_lr(
    int N,
    const float* paramIn,
    const float* gradIn,
    const float* momentIn,
    float* paramOut,
    float* momentOut,
    float* effectiveLROut,
    float epsilon,
    float decay,
    const float* lr,
    Context* /*context*/,
    float weight_decay = 0.f) {
  for (const auto k : c10::irange(N)) {
    // Gradient with the weight-decay term folded in.
    const float g = std::fma(weight_decay, paramIn[k], gradIn[k]);

    // Accumulated squared gradient.
    const float h = decay * momentIn[k] + g * g;
    momentOut[k] = h;

    const float step = lr[0] / (std::sqrt(h) + epsilon);
    effectiveLROut[k] = step;

    paramOut[k] = paramIn[k] + step * g;
  }
}

// Element-wise Adagrad step that additionally reports both the per-element
// effective learning rate and the applied update.
//
// For every k in [0, N):
//   g                 = weight_decay * paramIn[k] + gradIn[k]
//   momentOut[k]      = decay * momentIn[k] + g * g
//   effectiveLROut[k] = lr[0] / (sqrt(momentOut[k]) + epsilon)
//   updateOut[k]      = effectiveLROut[k] * g
//   paramOut[k]       = paramIn[k] + updateOut[k]
//
// The update is added to the parameter, not subtracted.
// NOTE(review): callers presumably pass a negative learning rate -- confirm.
// The Context pointer is unused.
template <typename Context>
void adagrad_update_output_effective_lr_and_update(
    int N,
    const float* paramIn,
    const float* gradIn,
    const float* momentIn,
    float* paramOut,
    float* momentOut,
    float* effectiveLROut,
    float* updateOut,
    float epsilon,
    float decay,
    const float* lr,
    Context* /*context*/,
    float weight_decay = 0.f) {
  for (const auto k : c10::irange(N)) {
    // Gradient with the weight-decay term folded in.
    const float g = std::fma(weight_decay, paramIn[k], gradIn[k]);

    // Accumulated squared gradient.
    const float h = decay * momentIn[k] + g * g;
    momentOut[k] = h;

    const float step = lr[0] / (std::sqrt(h) + epsilon);
    effectiveLROut[k] = step;

    const float delta = step * g;
    updateOut[k] = delta;

    paramOut[k] = paramIn[k] + delta;
  }
}

// Dense Adagrad operator.
//
// Inputs:  PARAM, MOMENT_1, GRAD, LR (only LR[0] is consumed).
// Outputs: OUTPUT_PARAM, OUTPUT_MOMENT_1 and, depending on how many outputs
//          the operator was given, OUTPUT_EFFECTIVE_LR and OUTPUT_UPDATE.
// Arguments: "epsilon" (default 1e-5), "decay" (default 1.0),
//            "weight_decay" (default 0).
//
// The number of outputs selects the kernel:
//   2 outputs -> adagrad_update
//   3 outputs -> adagrad_update_output_effective_lr
//   otherwise -> adagrad_update_output_effective_lr_and_update
template <class Context>
class AdagradOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  AdagradOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5f)),
        decay_(this->template GetSingleArgument<float>("decay", 1.0f)),
        weight_decay_(
            this->template GetSingleArgument<float>("weight_decay", 0.f)) {
    VLOG(1) << "gradient optimization operator in use: "
            << "AdagradOp"
            << " weight_decay_=" << weight_decay_;
  }

  // Validates the input sizes, resizes the outputs and runs the kernel that
  // matches the number of outputs. Always returns true; invalid inputs are
  // reported through the CAFFE_ENFORCE macros.
  bool RunOnDevice() override {
    CAFFE_ENFORCE_EQ(
        Input(GRAD).numel(),
        Input(MOMENT_1).numel(),
        "PARAM size: ",
        Input(PARAM).numel(),
        ", GRAD size: ",
        Input(GRAD).numel(),
        ", MOMENT_1 size: ",
        Input(MOMENT_1).numel(),
        ", LR size: ",
        Input(LR).numel());

    CAFFE_ENFORCE_EQ(Input(GRAD).numel(), Input(PARAM).numel());

    // Every kernel below dereferences lr[0]; reject an empty LR tensor instead
    // of reading past the end of its buffer.
    CAFFE_ENFORCE_GE(
        Input(LR).numel(),
        1,
        "LR must contain at least one element, LR size: ",
        Input(LR).numel());

    Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM));
    Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1));
    if (OutputSize() == 2) {
      // Parameter and moment only.
      adagrad_update<Context>(
          Input(GRAD).numel(),
          Input(PARAM).template data<float>(),
          Input(GRAD).template data<float>(),
          Input(MOMENT_1).template data<float>(),
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
          epsilon_,
          decay_,
          Input(LR).template data<float>(),
          &context_,
          weight_decay_);
    } else if (OutputSize() == 3) {
      // Additionally emit the per-element effective learning rate.
      Output(OUTPUT_EFFECTIVE_LR)->ResizeLike(Input(GRAD));
      adagrad_update_output_effective_lr<Context>(
          Input(GRAD).numel(),
          Input(PARAM).template data<float>(),
          Input(GRAD).template data<float>(),
          Input(MOMENT_1).template data<float>(),
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
          Output(OUTPUT_EFFECTIVE_LR)->template mutable_data<float>(),
          epsilon_,
          decay_,
          Input(LR).template data<float>(),
          &context_,
          weight_decay_);
    } else {
      // Additionally emit the effective learning rate and the applied update.
      Output(OUTPUT_EFFECTIVE_LR)->ResizeLike(Input(GRAD));
      Output(OUTPUT_UPDATE)->ResizeLike(Input(GRAD));
      adagrad_update_output_effective_lr_and_update<Context>(
          Input(GRAD).numel(),
          Input(PARAM).template data<float>(),
          Input(GRAD).template data<float>(),
          Input(MOMENT_1).template data<float>(),
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENT_1)->template mutable_data<float>(),
          Output(OUTPUT_EFFECTIVE_LR)->template mutable_data<float>(),
          Output(OUTPUT_UPDATE)->template mutable_data<float>(),
          epsilon_,
          decay_,
          Input(LR).template data<float>(),
          &context_,
          weight_decay_);
    }

    return true;
  }

 protected:
  float epsilon_; // added to sqrt(moment) in the denominator
  float decay_; // multiplier applied to the previous moment
  float weight_decay_; // coefficient of the param term folded into the gradient
  INPUT_TAGS(PARAM, MOMENT_1, GRAD, LR);
  OUTPUT_TAGS(
      OUTPUT_PARAM,
      OUTPUT_MOMENT_1,
      OUTPUT_EFFECTIVE_LR,
      OUTPUT_UPDATE);
};

// CPU sparse Adagrad operator: applies an Adagrad step to the rows of PARAM /
// MOMENT_1 selected by INDICES, using one GRAD block per index.
//
// Inputs:  PARAM, MOMENT_1, INDICES (int32 or int64), GRAD, LR (one element).
// Outputs: OUTPUT_PARAM, OUTPUT_MOMENT_1.
// Arguments: "epsilon" (default 1e-5), "weight_decay" (default 0), "decay"
//            (must be 1.0; any other value is rejected in the constructor).
//
// When built with USE_FBGEMM (and not under NVCC) a JIT-generated FBGEMM
// kernel is used; otherwise a portable loop runs.
// NOTE(review): the FBGEMM path hands only the output buffers to the kernel,
// so it presumably relies on the operator being run in place -- confirm.
class SparseAdagradOp final : public Operator<CPUContext> {
 public:
  SparseAdagradOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws),
        epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5f)),
        weight_decay_(
            this->template GetSingleArgument<float>("weight_decay", 0.f)) {
    VLOG(1) << "gradient optimization operator in use: "
            << "SparseAdagradOp"
            << " weight_decay_=" << weight_decay_;
    const float decay = this->template GetSingleArgument<float>("decay", 1.0);
    CAFFE_ENFORCE_EQ(
        decay, 1.0, "Decay is not supported for SparseSimdAdagradOp");
  }

  // Checks the input shapes and dispatches on the element type of INDICES.
  bool RunOnDevice() override {
    // Enforce shapes
    // input(embedding/momentum) == outputs(embedding/momentum)
    CAFFE_ENFORCE_EQ(
        Input(PARAM).numel(),
        Input(MOMENT_1).numel(),
        "Input Param size: ",
        Input(PARAM).numel(),
        " Input Moment size: ",
        Input(MOMENT_1).numel());
    CAFFE_ENFORCE_EQ(Input(LR).numel(), 1);
    CAFFE_ENFORCE_EQ(
        Input(PARAM).size_from_dim(1),
        Input(GRAD).size_from_dim(Input(INDICES).dim()));

    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, Input(INDICES));
  }

  // Runs the update for indices of type SIndex.
  // Returns true on success (including the empty-INDICES case). An
  // out-of-range index raises through CAFFE_ENFORCE; the FBGEMM path returns
  // false if its kernel stopped early but the follow-up check did not raise.
  template <typename SIndex>
  bool DoRunWithType() {
    const auto* lr = Input(LR).template data<float>();

    auto n = Input(INDICES).numel();

    const auto* indices = Input(INDICES).template data<SIndex>();
    const auto* gradIn = Input(GRAD).template data<float>();
    auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<float>();
    auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<float>();

    if (n == 0) {
      return true;
    }
    // Number of gradient elements per index.
    auto block_size = Input(GRAD).numel() / n;

    // input(grad) is compatible with size of indexes
    CAFFE_ENFORCE_EQ(
        Input(GRAD).numel() % n,
        0,
        "Incorrect gradient size:",
        Input(GRAD).numel(),
        " size of indexes:",
        n);

#if defined(USE_FBGEMM) && !defined(__NVCC__)
    VLOG(1) << "using fbgemm::GenerateSparseAdaGrad in SparseAdagradOp";

    // A generated kernel is specialised for one block size, and
    // last_block_size_ is shared by both index types. On a block-size change
    // drop BOTH cached kernels so that neither a stale kernel (built for the
    // old block size) nor an empty one can be invoked later.
    if (block_size != last_block_size_) {
      last_block_size_ = block_size;
      kernel_i32_ = nullptr;
      kernel_i64_ = nullptr;
    }
    // Lazily (re)generate the kernel for the index type of this call.
    if (std::is_same<SIndex, std::int32_t>::value) {
      if (!kernel_i32_) {
        kernel_i32_ = fbgemm::GenerateSparseAdaGrad<std::int32_t>(
            block_size,
            /*rowwise=*/false,
            /*prefetch=*/16,
            weight_decay_ != 0.0f);
      }
    } else {
      CAFFE_ENFORCE((std::is_same<SIndex, std::int64_t>::value));
      if (!kernel_i64_) {
        kernel_i64_ = fbgemm::GenerateSparseAdaGrad<std::int64_t>(
            block_size,
            /*rowwise=*/false,
            /*prefetch=*/16,
            weight_decay_ != 0.0f);
      }
    }

    int num_rows_processed;
    if (std::is_same<SIndex, std::int32_t>::value) {
      num_rows_processed = kernel_i32_(
          n,
          Input(PARAM).numel(),
          paramOut,
          gradIn,
          momentOut,
          reinterpret_cast<const std::int32_t*>(indices),
          epsilon_,
          lr[0],
          weight_decay_,
          /*counter=*/nullptr,
          /*counter_halflife=*/0);
    } else {
      num_rows_processed = kernel_i64_(
          n,
          Input(PARAM).numel(),
          paramOut,
          gradIn,
          momentOut,
          reinterpret_cast<const std::int64_t*>(indices),
          epsilon_,
          lr[0],
          weight_decay_,
          /*counter=*/nullptr,
          /*counter_halflife=*/0);
    }
    // Fewer rows than requested: report the index the kernel stopped at.
    if (num_rows_processed < n) {
      CAFFE_ENFORCE_GE(
          Input(PARAM).numel(),
          (indices[num_rows_processed] + 1) * block_size,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          indices[num_rows_processed],
          " for input i:",
          num_rows_processed,
          " and block_size:",
          block_size,
          " max size:",
          Input(PARAM).numel());
      return false;
    } else {
      return true;
    }
#else
    VLOG(1)
        << "using internal::adagrad_update_prefetch_inlined in SparseAdagradOp";

    const auto* paramIn = Input(PARAM).template data<float>();
    const auto* momentIn = Input(MOMENT_1).template data<float>();

    for (const auto i : c10::irange(n)) {
      auto idx = indices[i];
      auto offsetI = i * block_size;
      auto offsetIdx = idx * block_size;

      // Enforce:
      // access within range
      // gradient access within range
      // A negative index would trivially satisfy the upper-bound check below
      // and then address memory before the start of the buffers.
      CAFFE_ENFORCE_GE(
          idx,
          0,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          idx,
          " for input i:",
          i);
      CAFFE_ENFORCE_GE(
          Input(PARAM).numel(),
          block_size + offsetIdx,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          idx,
          " for input i:",
          i,
          " and block size:",
          block_size,
          " max size:",
          Input(PARAM).numel());

      if (block_size == 1) {
        // Scalar rows: apply the update inline.
        float gi = std::fma(weight_decay_, paramIn[idx], gradIn[i]);
        float hi = momentOut[idx] = momentIn[idx] + gi * gi;
        paramOut[idx] = paramIn[idx] + lr[0] * gi / (std::sqrt(hi) + epsilon_);
      } else {
        // prefetching
        const int prefdist_T0 = 16;
        // Kept 64-bit: narrowing to int could wrap to a negative subscript
        // into `indices` for very large n.
        const std::int64_t i_pref =
            (i < n - prefdist_T0) ? i + prefdist_T0 : i;
        // The look-ahead index is not validated; the addresses built from it
        // are presumably used only as prefetch hints by the kernel -- confirm.
        std::size_t idx_pref = indices[i_pref];

        internal::adagrad_update_prefetch_inlined(
            block_size,
            paramIn + offsetIdx,
            &paramIn[idx_pref * block_size],
            gradIn + offsetI,
            momentIn + offsetIdx,
            &momentIn[idx_pref * block_size],
            paramOut + offsetIdx,
            &paramOut[idx_pref * block_size],
            momentOut + offsetIdx,
            &momentOut[idx_pref * block_size],
            epsilon_,
            lr[0],
            weight_decay_);
      }
    }
    return true;
#endif // !USE_FBGEMM
  }

 protected:
  float epsilon_; // added to sqrt(moment) in the denominator
  const float weight_decay_; // coefficient of the param term in the gradient
#if defined(USE_FBGEMM) && !defined(__NVCC__)
  // Cached JIT kernels, one per index type, valid for last_block_size_ only.
  fbgemm::SparseAdaGradSignature<std::int32_t>::Type kernel_i32_;
  fbgemm::SparseAdaGradSignature<std::int64_t>::Type kernel_i64_;
  std::int64_t last_block_size_{-1};
#endif

  INPUT_TAGS(PARAM, MOMENT_1, INDICES, GRAD, LR);
  OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1);
};

// Row-wise sparse Adagrad operator: like sparse Adagrad, but keeps a single
// moment value per parameter row (MOMENT_1 has one element per row of PARAM).
//
// Inputs:  PARAM, MOMENT_1, INDICES (int32 or int64), GRAD, LR (one element)
//          and COUNTER (doubles; read only when counter_halflife != -1).
// Outputs: OUTPUT_PARAM, OUTPUT_MOMENT_1. Both code paths read and write
//          through the output buffers only.
//          NOTE(review): this presumably relies on the operator being run in
//          place -- confirm.
// Arguments: "epsilon" (default 1e-5), "weight_decay" (default 0),
//            "counter_halflife" (default -1).
//
// When counter_halflife > 0 and the row's counter is positive, the portable
// path scales weight decay for that row by counter_halflife / counter[row].
template <class Context>
class RowWiseSparseAdagradOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  RowWiseSparseAdagradOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5f)),
        weight_decay_(
            this->template GetSingleArgument<float>("weight_decay", 0.f)),
        counter_halflife_(
            this->template GetSingleArgument<int64_t>("counter_halflife", -1)) {
    VLOG(1) << "gradient optimization operator in use: "
            << "RowWiseSparseAdagradOp"
            << " weight_decay_=" << weight_decay_
            << " counter_halflife=" << counter_halflife_;
  }

  // Checks the input shapes and dispatches on the element type of INDICES.
  bool RunOnDevice() override {
    // Enforce shapes
    CAFFE_ENFORCE_EQ(Input(PARAM).sizes()[0], Input(MOMENT_1).numel());
    CAFFE_ENFORCE_EQ(Input(LR).numel(), 1);
    CAFFE_ENFORCE_EQ(
        Input(PARAM).size_from_dim(1),
        Input(GRAD).size_from_dim(Input(INDICES).dim()));

    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, Input(INDICES));
  }

  // Runs the update for indices of type SIndex.
  // Returns true on success (including the empty-INDICES case). An
  // out-of-range index raises through CAFFE_ENFORCE; the FBGEMM path returns
  // false if its kernel stopped early but the follow-up check did not raise.
  template <typename SIndex>
  bool DoRunWithType() {
    const auto* lr = Input(LR).template data<float>();
    auto* param = Output(OUTPUT_PARAM)->template mutable_data<float>();
    auto* moment = Output(OUTPUT_MOMENT_1)->template mutable_data<float>();

    const auto* indices = Input(INDICES).template data<SIndex>();
    const auto* gradIn = Input(GRAD).template data<float>();
    // COUNTER is only touched when a half-life was configured.
    const auto* count = counter_halflife_ == -1
        ? nullptr
        : Input(COUNTER).template data<double>();

    auto n = Input(INDICES).numel();
    if (n == 0) {
      return true;
    }

    // Number of gradient elements per index.
    auto block_size = Input(GRAD).numel() / n;

    // input(grad) is compatible with size of indexes
    // Checked before block_size is used as a divisor below.
    CAFFE_ENFORCE_EQ(
        Input(GRAD).numel() % n,
        0,
        "Incorrect gradient size:",
        Input(GRAD).numel(),
        " size of indexes:",
        n);
    // block_size is 0 when GRAD is empty; dividing by it would fault.
    CAFFE_ENFORCE_GE(
        block_size,
        1,
        "Incorrect gradient size:",
        Input(GRAD).numel(),
        " size of indexes:",
        n);

    // Enforce:
    // Input(embedding/momentum) == outputs(embedding/momentum)
    CAFFE_ENFORCE_EQ(
        Input(PARAM).numel() / block_size,
        Input(MOMENT_1).numel(),
        "Input Param size: ",
        Input(PARAM).numel(),
        " Block size: ",
        block_size,
        " Input Moment size: ",
        Input(MOMENT_1).numel());

#if defined(USE_FBGEMM) && !defined(__NVCC__)
    VLOG(1) << "using fbgemm::GenerateSparseAdaGrad in RowWiseSparseAdagradOp";

    // A generated kernel is specialised for one block size, and
    // last_block_size_ is shared by both index types. On a block-size change
    // drop BOTH cached kernels so that neither a stale kernel (built for the
    // old block size) nor an empty one can be invoked later.
    if (block_size != last_block_size_) {
      last_block_size_ = block_size;
      kernel_i32_ = nullptr;
      kernel_i64_ = nullptr;
    }
    // Lazily (re)generate the kernel for the index type of this call.
    if (std::is_same<SIndex, std::int32_t>::value) {
      if (!kernel_i32_) {
        kernel_i32_ = fbgemm::GenerateSparseAdaGrad<std::int32_t>(
            block_size,
            /*rowwise=*/true,
            /*prefetch=*/16,
            weight_decay_ != 0.0f);
      }
    } else {
      CAFFE_ENFORCE((std::is_same<SIndex, std::int64_t>::value));
      if (!kernel_i64_) {
        kernel_i64_ = fbgemm::GenerateSparseAdaGrad<std::int64_t>(
            block_size,
            /*rowwise=*/true,
            /*prefetch=*/16,
            weight_decay_ != 0.0f);
      }
    }

    int num_rows_processed;
    if (std::is_same<SIndex, std::int32_t>::value) {
      num_rows_processed = kernel_i32_(
          n,
          Input(PARAM).numel(),
          param,
          gradIn,
          moment,
          reinterpret_cast<const std::int32_t*>(indices),
          epsilon_,
          lr[0],
          weight_decay_,
          (counter_halflife_ > 0) ? count : nullptr,
          counter_halflife_);
    } else {
      num_rows_processed = kernel_i64_(
          n,
          Input(PARAM).numel(),
          param,
          gradIn,
          moment,
          reinterpret_cast<const std::int64_t*>(indices),
          epsilon_,
          lr[0],
          weight_decay_,
          (counter_halflife_ > 0) ? count : nullptr,
          counter_halflife_);
    }

    // Fewer rows than requested: report the index the kernel stopped at.
    if (num_rows_processed < n) {
      // Enforce:
      // access within range
      CAFFE_ENFORCE_GE(
          Input(PARAM).numel(),
          (indices[num_rows_processed] + 1) * block_size,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          indices[num_rows_processed],
          " for input i:",
          num_rows_processed,
          " and block size:",
          block_size,
          " max size:",
          Input(PARAM).numel());
      return false;
    } else {
      return true;
    }
#else
    VLOG(1) << "using plain adagrad updates in RowWiseSparseAdagradOp";

    for (const auto i : c10::irange(n)) {
      auto idx = indices[i];
      auto offsetIdx = idx * block_size;

      // Validate the row index in every build type and before the first
      // access: count, param and moment are all addressed through idx below.
      CAFFE_ENFORCE_GE(
          idx,
          0,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          idx,
          " for input i:",
          i);
      CAFFE_ENFORCE_GE(
          Input(PARAM).numel(),
          block_size + offsetIdx,
          this->debug_def().input(PARAM),
          ", out of bound,  idx:",
          idx,
          " for input i:",
          i,
          " and block size:",
          block_size);

      // Per-row weight-decay multiplier.
      // NOTE(review): the size of COUNTER is not checked anywhere in this
      // class; it is assumed to have at least one entry per parameter row.
      float freq = (counter_halflife_ > 0 && count[idx] > 0)
          ? counter_halflife_ / count[idx]
          : 1.0;
      if (block_size == 1) {
        float gi = std::fma(weight_decay_ * freq, param[idx], gradIn[i]);
        float hi = moment[idx] = moment[idx] + gi * gi;
        param[idx] = param[idx] + lr[0] * gi / (std::sqrt(hi) + epsilon_);
      } else {
        auto offsetI = i * block_size;

#ifndef NDEBUG
        CAFFE_ENFORCE_GE(
            Input(GRAD).numel(),
            block_size + offsetI,
            this->debug_def().input(GRAD),
            ", out of bound idx, idx:",
            idx,
            " for input i:",
            i);
#endif

        float* w = param + offsetIdx;
        const float* g = gradIn + offsetI;
        float* h = moment + idx;
        // First pass: sum of squared (decayed) gradients over the row.
        float hs = 0.;
        for (const auto j : c10::irange(block_size)) {
          float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
          hs += gj * gj;
        }
        // The row's moment accumulates the mean squared gradient.
        float hi = h[0] = h[0] + hs / block_size;
        float step = lr[0] / (std::sqrt(hi) + epsilon_);
        // Second pass: apply the shared step to every element of the row.
        for (const auto j : c10::irange(block_size)) {
          float gj = std::fma(weight_decay_ * freq, w[j], g[j]);
          w[j] = w[j] + gj * step;
        }
      }
    }
    return true;
#endif // !USE_FBGEMM
  }

 protected:
  float epsilon_; // added to sqrt(moment) in the denominator
  const float weight_decay_; // coefficient of the param term in the gradient
  const int64_t counter_halflife_; // -1 means COUNTER is never read
#if defined(USE_FBGEMM) && !defined(__NVCC__)
  // Cached JIT kernels, one per index type, valid for last_block_size_ only.
  fbgemm::SparseAdaGradSignature<std::int32_t>::Type kernel_i32_;
  fbgemm::SparseAdaGradSignature<std::int64_t>::Type kernel_i64_;
  std::int64_t last_block_size_{-1};
#endif

  INPUT_TAGS(PARAM, MOMENT_1, INDICES, GRAD, LR, COUNTER);
  OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1);
};
} // namespace caffe2