File: storm_op.cc

Package: pytorch 1.13.1+dfsg-4 (Debian bookworm)
#include "storm_op.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(Storm, StormOp<CPUContext>);
OPERATOR_SCHEMA(Storm)
    .NumInputs(5)
    .NumOutputs(3)
    .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
    .SetDoc(R"DOC(

Computes the STORM (https://arxiv.org/abs/1905.10018) update for an input
gradient and accumulated history of gradients. Concretely, given inputs
(param, moment, grad_sq_sum, grad, lr), computes:

    new_grad_sq_sum = grad_sq_sum + norm(grad)^2
    effective_lr = lr / (beta + new_grad_sq_sum)^(1/3)
    alpha = momentum * square(effective_lr)
    new_moment = grad + (1 - alpha) * (moment - grad)
    new_param = param + effective_lr * new_moment

and returns (new_param, new_moment, new_grad_sq_sum).

Note that, due to a Caffe2 limitation, it is difficult to re-calculate the
gradient from the previous iteration using the current example. The calculation
of new_moment is therefore simplified by using the gradient from the current
iteration.

)DOC")
    .Input(0, "param", "Parameters to be updated.")
    .Input(1, "moment", "Moment history.")
    .Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
    .Input(3, "grad", "Gradients computed.")
    .Input(4, "lr", "Learning rate, k in the original paper.")
    .Output(0, "output_param", "Updated parameters.")
    .Output(1, "output_moment", "Updated moment.")
    .Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
    .Arg("momentum", "Momentum hyperparameter, c in the original paper.")
    .Arg(
        "beta",
        "denominator in adaptive learning rate, w in the original paper.");

REGISTER_CPU_OPERATOR(SparseStorm, SparseStormOp<CPUContext>);
OPERATOR_SCHEMA(SparseStorm)
    .NumInputs(6)
    .NumOutputs(3)
    .EnforceOneToOneInplace()
    .SetDoc(R"DOC(

This operator implements the STORM (https://arxiv.org/abs/1905.10018)
optimization algorithm. Given inputs (param, moment, grad_sq_sum, grad,
indices, lr), computes the dense STORM update on (param, moment[indices],
grad_sq_sum, grad, lr), and returns (new_param, new_moment, new_grad_sq_sum)
as in the dense case.
)DOC")
    .Input(0, "param", "Parameters to be updated.")
    .Input(1, "moment", "Moment history.")
    .Input(2, "grad_sq_sum", "Sum of observed squared gradients.")
    .Input(3, "grad", "Gradients computed.")
    .Input(4, "indices", "Sparse indices.")
    .Input(5, "lr", "Learning rate, k in the original paper.")
    .Output(0, "output_param", "Updated parameters.")
    .Output(1, "output_moment", "Updated moment.")
    .Output(2, "output_grad_sq_sum", "Updated sum of squared gradients.")
    .Arg("momentum", "Momentum hyperparameter, c in the original paper.")
    .Arg(
        "beta",
        "denominator in adaptive learning rate, w in the original paper.");

SHOULD_NOT_DO_GRADIENT(Storm);
SHOULD_NOT_DO_GRADIENT(SparseStorm);
} // namespace caffe2