File: adadelta_op.cc

#include "caffe2/sgd/adadelta_op.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(Adadelta, AdadeltaOp<CPUContext>);
OPERATOR_SCHEMA(Adadelta)
    .NumInputs(5)
    .NumOutputs(3)
    .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
    .SetDoc(R"DOC(

Computes the AdaDelta update (https://arxiv.org/abs/1212.5701) for an input
gradient, using running averages of both the squared gradients and the
squared parameter updates. Concretely, given inputs (param, moment,
moment_delta, grad, lr), computes:

    new_moment = moment * decay + square(grad) * (1 - decay)
    new_grad = sqrt(moment_delta + epsilon) / sqrt(new_moment + epsilon) * grad
    new_param = param + lr * new_grad
    new_moment_delta = moment_delta * decay + square(new_grad) * (1 - decay)

and returns (new_param, new_moment, new_moment_delta).

)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Average of squared gradients")
    .Input(2, "moment_delta", "Average of squared parameter updates")
    .Input(3, "grad", "Gradient computed")
    .Input(4, "lr", "Learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated average squared gradient")
    .Output(
        2,
        "output_moment_delta",
        "Updated average of squared parameter updates")
    .Arg("epsilon", "Default 1e-5")
    .Arg(
        "decay",
        "Default 0.95, the squared gradient sum is decayed by this factor.");

REGISTER_CPU_OPERATOR(SparseAdadelta, SparseAdadeltaOp<CPUContext>);
OPERATOR_SCHEMA(SparseAdadelta)
    .NumInputs(6)
    .NumOutputs(3)
    .EnforceOneToOneInplace()
    .SetDoc(R"DOC(

Given inputs (param, moment, moment_delta, indices, grad, lr), runs the
dense AdaDelta update on (param[indices], moment[indices],
moment_delta[indices], grad, lr), and returns (new_param, new_moment,
new_moment_delta) as in the dense case. Rows not referenced by indices are
left unchanged.

)DOC")
    .Input(0, "param", "Parameters to be updated")
    .Input(1, "moment", "Average of squared gradients")
    .Input(2, "moment_delta", "Average of squared parameter updates")
    .Input(3, "indices", "Sparse indices")
    .Input(4, "grad", "Gradient computed")
    .Input(5, "lr", "learning rate")
    .Output(0, "output_param", "Updated parameters")
    .Output(1, "output_moment", "Updated average squared gradient")
    .Output(
        2,
        "output_moment_delta",
        "Updated average of squared parameter updates")
    .Arg("epsilon", "Default 1e-5")
    .Arg(
        "decay",
        "Default 0.95, the squared gradient sum is decayed by this factor.");

SHOULD_NOT_DO_GRADIENT(Adadelta);
SHOULD_NOT_DO_GRADIENT(SparseAdadelta);
} // namespace caffe2