#include "storm_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(Storm, StormOp<CPUContext>);
OPERATOR_SCHEMA(Storm)
    .NumInputs(5)
    .NumOutputs(3)
    .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
.SetDoc(R"DOC(
Computes the STORM (https://arxiv.org/abs/1905.10018) update for an input
gradient and accumulated history of gradients. Concretely, given inputs
(param, moment, grad_sq_sum, grad, lr), computes:
new_grad_sq_sum = grad_sq_sum + norm(grad)^2
effective_lr = lr / (beta + new_grad_sq_sum)^1/3
alpha = momentum * square(effective_lr)
new_moment = grad + (1 - alpha) * (moment - grad)
new_param = param + effective_lr * new_moment
and returns (new_param, new_moment, new_grad_sq_sum).
Note that due to caffe2 limitation, it is difficult to re-calculate gradient
in the previous iteration using the current example. We simplied calculation
for new_moment by using the gradient from the current iteration.
)DOC")
.Input(0, "param", "Parameters to be updated.")
.Input(1, "moment", "Moment history.")
.Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
.Input(3, "grad", "Gradients computed.")
.Input(4, "lr", "Learning rate, k in the original paper.")
.Output(0, "output_param", "Updated parameters.")
.Output(1, "output_moment", "Updated moment.")
.Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
.Arg("momentum", "Momentum hyperparameter, c in the original paper.")
.Arg(
"beta",
"denominator in adaptive learning rate, w in the original paper.");
REGISTER_CPU_OPERATOR(SparseStorm, SparseStormOp<CPUContext>);
OPERATOR_SCHEMA(SparseStorm)
    .NumInputs(6)
    .NumOutputs(3)
    .EnforceOneToOneInplace()
.SetDoc(R"DOC(
This operator implement the STORM (https://arxiv.org/abs/1905.10018)
optimization algorithm. Given inputs (param, moment, grad_sq_sum, grad,
indices, lr), computes the dense STORM update on (param, moment[indices],
grad_sq_sum, grad, lr), and returns (new_param, new_moment, new_grad_sq_sum)
as in the dense case.
)DOC")
.Input(0, "param", "Parameters to be updated.")
.Input(1, "moment", "Moment history.")
.Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
.Input(3, "grad", "Gradients computed.")
.Input(4, "indices", "Sparse indices.")
.Input(5, "lr", "Learning rate, k in the original paper.")
.Output(0, "output_param", "Updated parameters.")
.Output(1, "output_moment", "Updated moment.")
.Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
.Arg("momentum", "Momentum hyperparameter, c in the original paper.")
.Arg(
"beta",
"denominator in adaptive learning rate, w in the original paper.");
SHOULD_NOT_DO_GRADIENT(Storm);
SHOULD_NOT_DO_GRADIENT(SparseStorm);
} // namespace caffe2