#include "caffe2/sgd/adadelta_op.h"

// Standard headers used by the illustrative sketches further down this file.
#include <cmath>
#include <cstdint>
#include <vector>

namespace caffe2 {

REGISTER_CPU_OPERATOR(Adadelta, AdadeltaOp<CPUContext>);
OPERATOR_SCHEMA(Adadelta)
    .NumInputs(5)
    .NumOutputs(3)
    .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
    .SetDoc(R"DOC(
Computes the AdaDelta update (https://arxiv.org/abs/1212.5701) for an input
gradient and accumulated history of squared gradients. Concretely, given
inputs (param, moment, moment_delta, grad, learning_rate), computes:

    new_moment = moment * decay + square(grad) * (1 - decay)
    new_grad = sqrt(moment_delta + epsilon) / sqrt(new_moment + epsilon) * grad
    new_param = param + learning_rate * new_grad
    new_moment_delta = moment_delta * decay + square(new_grad) * (1 - decay)

and returns (new_param, new_moment, new_moment_delta).
)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Average of squared gradients")
    .Input(2, "moment_delta", "Average of squared parameter updates")
    .Input(3, "grad", "Gradient computed")
    .Input(4, "lr", "Learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated average squared gradient")
    .Output(
        2,
        "output_moment_delta",
        "Updated average of squared parameter updates")
    .Arg(
        "epsilon",
        "Default 1e-5; added inside the square roots for numerical stability")
    .Arg(
        "decay",
        "Default 0.95; the running averages of the squared gradient and "
        "the squared update are decayed by this factor.");
REGISTER_CPU_OPERATOR(SparseAdadelta, SparseAdadeltaOp<CPUContext>);
OPERATOR_SCHEMA(SparseAdadelta)
    .NumInputs(6)
    .NumOutputs(3)
    .EnforceOneToOneInplace()
    .SetDoc(R"DOC(
Given inputs (param, moment, moment_delta, indices, grad, lr), runs the dense
AdaDelta update on (param[indices], moment[indices], moment_delta[indices],
grad, lr), and returns (new_param, new_moment, new_moment_delta) as in the
dense case.
)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Average of squared gradients")
    .Input(2, "moment_delta", "Average of squared parameter updates")
    .Input(3, "indices", "Sparse indices")
    .Input(4, "grad", "Gradient computed")
    .Input(5, "lr", "Learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated average squared gradient")
    .Output(
        2,
        "output_moment_delta",
        "Updated average of squared parameter updates")
    .Arg(
        "epsilon",
        "Default 1e-5; added inside the square roots for numerical stability")
    .Arg(
        "decay",
        "Default 0.95; the running averages of the squared gradient and "
        "the squared update are decayed by this factor.");
SHOULD_NOT_DO_GRADIENT(Adadelta);
SHOULD_NOT_DO_GRADIENT(SparseAdadelta);
} // namespace caffe2