from caffe2.python import core
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier
import logging
logger = logging.getLogger(__name__)
class GradientClipping(NetModifier):
L1_NORM = 'l1_norm'
L2_NORM = 'l2_norm'
BY_NORM = 'by_norm'
BY_VALUE = 'by_value'
GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]
def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
clip_threshold=0.1, use_parameter_norm=False,
compute_norm_ratio=False, clip_max=1, clip_min=-1,
blobs_to_include=None, blobs_to_exclude=None):
"""
Clips gradients to avoid exploding or vanishing gradient magnitudes.
Args:
grad_clip_method: how the gradients are clipped; one of 'by_norm' or 'by_value'
clip_norm_type: type of norm used when clipping by norm; one of
'l2_norm' or 'l1_norm'
clip_threshold: threshold used to determine whether to clip a gradient
use_parameter_norm: a boolean indicating whether to incorporate the norm
of the parameter when clipping
compute_norm_ratio: a boolean indicating whether to explicitly compute the
ratio between gradient norm and parameter norm for debugging purposes
clip_max: when clipping by_value, any value that is greater than
clip_max will be clipped to clip_max
clip_min: when clipping by_value, any value that is smaller than
clip_min will be clipped to clip_min
blobs_to_include: names of blobs whose gradients are to be clipped. If set
to None, the gradients of all params in grad_map will be clipped.
blobs_to_exclude: names of blobs whose gradients are not to be clipped.
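
Example (a minimal usage sketch; assumes `train_net` is a core.Net whose
gradients are produced by AddGradientOperators and `loss` is its loss blob):

    grad_map = train_net.AddGradientOperators([loss])
    net_modifier = GradientClipping(
        grad_clip_method='by_norm',
        clip_norm_type='l2_norm',
        clip_threshold=1.0,
    )
    net_modifier(train_net, grad_map=grad_map)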
"""
assert grad_clip_method in self.GRAD_CLIP_METHODS, (
"This method of clipping, {}, has not been implemented.".format(
grad_clip_method))
if clip_norm_type is not None:
assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
"This method of clipping, {}, has not been implemented.".format(
clip_norm_type))
self.grad_clip_method = grad_clip_method
self.clip_norm_type = clip_norm_type
self.clip_threshold = float(clip_threshold)
self.use_parameter_norm = use_parameter_norm
self.compute_norm_ratio = compute_norm_ratio
self.clip_max = float(clip_max)
self.clip_min = float(clip_min)
self.blobs_to_include = blobs_to_include
self.blobs_to_exclude = blobs_to_exclude
def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
modify_output_record=False):
assert grad_map is not None
CPU = core.DeviceOption(caffe2_pb2.CPU)
final_param_map = {}
if self.blobs_to_include is None:
final_param_map = grad_map
else:
for blob in self.blobs_to_include:
param = core.BlobReference(blob)
if not net.BlobIsDefined(param):
raise Exception('param {0} is not defined in net {1}'.format(
param, net.Name()))
final_param_map[param] = grad_map[param]
if self.blobs_to_exclude is not None:
for blob in self.blobs_to_exclude:
final_param_map.pop(blob, None)
for param, grad in final_param_map.items():
# currently sparse gradients won't be clipped
# further implementation is needed to enable it
if isinstance(grad, core.GradientSlice):
continue
device = get_param_device(
param,
grad_map[str(param)],
param_to_device=blob_to_device,
default_device=CPU,
)
with core.DeviceScope(device):
if self.grad_clip_method == self.BY_NORM:
if self.clip_norm_type == self.L2_NORM:
p = 2
elif self.clip_norm_type == self.L1_NORM:
p = 1
grad_norm = net.LpNorm(
[grad],
net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
p=p,
)
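# LpNorm yields sum(|g_i|^p); for p == 2 a square root is taken below
# to recover the actual L2 norm of the gradient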
if p == 2:
grad_norm = net.Pow([grad_norm], exponent=0.5)
op_inputs = [grad, grad_norm]
if self.use_parameter_norm:
param_norm = net.LpNorm(
[param],
net.NextScopedBlob(
prefix=str(param) + '_l{}_norm'.format(p)),
p=p,
)
if p == 2:
param_norm = net.Pow([param_norm], exponent=0.5)
op_inputs.append(param_norm)
if self.compute_norm_ratio:
net.Div(
[grad_norm, param_norm],
[net.NextScopedBlob(
prefix=str(param) + "_norm_ratio")]
)
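# rescale the gradient in place when its norm exceeds clip_threshold
# (measured relative to the parameter norm when use_parameter_norm is set)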
net.ClipTensorByScaling(
op_inputs,
[grad],
threshold=self.clip_threshold,
)
elif self.grad_clip_method == self.BY_VALUE:
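# element-wise clipping of the gradient to the [clip_min, clip_max] range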
net.Clip(
[grad],
[grad],
max=self.clip_max,
min=self.clip_min,
)