File: gradient_clipping.py

from caffe2.python import core
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier

import logging

logger = logging.getLogger(__name__)


class GradientClipping(NetModifier):

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    BY_NORM = 'by_norm'
    BY_VALUE = 'by_value'

    GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
                clip_threshold=0.1, use_parameter_norm=False,
                compute_norm_ratio=False, clip_max=1, clip_min=-1,
                blobs_to_include=None, blobs_to_exclude=None):
        """
        Clips gradient to avoid gradient magnitude explosion or vanishing gradient.

        Args:
        grad_clip_method: ways to clip the gradients
        clip_norm_type: type of norm used in the necessary computation
        clip_threshold: threshold used to determine whether to clip
        use_parameter_norm: a boolean to indicate whether to incorporate
            the norm of the parameter
        compute_norm_ratio: a boolean to compute the ratio between gradient norm
            and parameter norm explicitly for debugging purpose
        clip_max: when clipping by_value, any value that is greater than
            clip_max will be clipped to clip_max
        clip_min: when clipping by_value, any value that is smaller than
            clip_min will be clipped to clip_min
        blobs_to_include: names of blobs whose gradient is to be clipped. If it is set
            to none, all param 's gradient in grad_map will be clipped.
        blobs_to_exclude: names of blobs whose gradient is not to be clipped.
        """

        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))
        if clip_norm_type is not None:
            assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
                "This method of clipping, {}, has not been implemented.".format(
                    clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio
        self.clip_max = float(clip_max)
        self.clip_min = float(clip_min)
        self.blobs_to_include = blobs_to_include
        self.blobs_to_exclude = blobs_to_exclude

    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
                   modify_output_record=False):

        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        final_param_map = {}
        if self.blobs_to_include is None:
            final_param_map = grad_map
        else:
            for blob in self.blobs_to_include:
                param = core.BlobReference(blob)
                if not net.BlobIsDefined(param):
                    raise Exception('param {0} is not defined in net {1}'.format(
                        param, net.Name()))
                final_param_map[param] = grad_map[param]

        if self.blobs_to_exclude is not None:
            for blob in self.blobs_to_exclude:
                final_param_map.pop(blob, None)

        for param, grad in final_param_map.items():
            # currently sparse gradients won't be clipped
            # further implementation is needed to enable it
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if self.clip_norm_type == self.L2_NORM:
                        p = 2
                    elif self.clip_norm_type == self.L1_NORM:
                        p = 1

                    grad_norm = net.LpNorm(
                        [grad],
                        net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
                        p=p,
                    )

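                    # LpNorm returns sum(|x|^p), so take the square root for
                    # p == 2 to recover the actual L2 norm of the gradient.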
                    if p == 2:
                        grad_norm = net.Pow([grad_norm], exponent=0.5)

                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = net.LpNorm(
                            [param],
                            net.NextScopedBlob(
                                prefix=str(param) + '_l{}_norm'.format(p)),
                            p=p,
                        )

                        if p == 2:
                            param_norm = net.Pow([param_norm], exponent=0.5)

                        op_inputs.append(param_norm)

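                        # Optionally materialize grad_norm / param_norm as a
                        # separate blob for debugging/monitoring.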
                        if self.compute_norm_ratio:
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + "_norm_ratio")]
                            )

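                    # ClipTensorByScaling rescales grad in place by
                    # threshold / grad_norm when grad_norm exceeds the
                    # threshold (scaled by param_norm when it is supplied);
                    # otherwise grad is left untouched.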
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
                elif self.grad_clip_method == self.BY_VALUE:
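                    # Element-wise clamp of the gradient to [clip_min, clip_max].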
                    net.Clip(
                        [grad],
                        [grad],
                        max=self.clip_max,
                        min=self.clip_min,
                    )
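

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module): applies
# GradientClipping to a hand-built net. The blob names ('w', 'w_grad'), the
# constant-filled stand-in gradient, and the threshold value are assumptions
# chosen for this example; in a real model, grad_map would come from
# model.AddGradientOperators(...).
if __name__ == '__main__':
    from caffe2.python import model_helper, workspace

    model = model_helper.ModelHelper(name="grad_clip_demo")
    # A parameter and a stand-in dense gradient blob.
    w = model.param_init_net.XavierFill([], 'w', shape=[4, 4])
    w_grad = model.net.ConstantFill([], 'w_grad', shape=[4, 4], value=5.0)

    clipper = GradientClipping(
        grad_clip_method=GradientClipping.BY_NORM,
        clip_norm_type=GradientClipping.L2_NORM,
        clip_threshold=1.0,
    )
    # Appends the norm computation and clipping ops to model.net.
    clipper.modify_net(model.net, grad_map={str(w): w_grad})

    workspace.RunNetOnce(model.param_init_net)
    workspace.RunNetOnce(model.net)
    # The gradient has been rescaled so that its L2 norm is at most ~1.0.
    print(workspace.FetchBlob('w_grad'))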