#include "caffe2/sgd/momentum_sgd_op.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
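// Grid size for a 1-D kernel launch: ceil(N / CAFFE_CUDA_NUM_THREADS).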
inline int CaffeGetBlocksSGD(const int N) {
  return std::max(
      (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
      // Use at least 1 block, since CUDA does not allow an empty grid.
      1);
}
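// Fused momentum SGD step. Reads the gradient g and the momentum buffer m,
// writes the adjusted gradient to ng and the new momentum to nm, and, when
// param is non-null, applies the update to the parameters in place. lr points
// to a single learning-rate scalar in device memory.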
template <bool nesterov>
__global__ void MomentumSGDKernel(
    const int N,
    const float* g,
    const float* m,
    float* ng,
    float* nm,
    const float* lr,
    const float momentum,
    float* param);
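// Nesterov momentum:
//   m_new = momentum * m + lr * g
//   ng    = (1 + momentum) * m_new - momentum * m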
template <>
__global__ void MomentumSGDKernel<true>(
    const int N,
    const float* g,
    const float* m,
    float* ng,
    float* nm,
    const float* lr,
    const float momentum,
    float* param) {
  const float LR = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    const float mi = m[i];
    const float mi_new = momentum * mi + LR * g[i];
    nm[i] = mi_new;
    // Nesterov correction momentum * (mi_new - mi) + mi_new, computed as a
    // single fused multiply-add.
    ng[i] = fmaf(momentum, mi_new - mi, mi_new);
    if (param != nullptr) {
      param[i] -= ng[i];
    }
  }
}
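// Classic (heavy-ball) momentum: the adjusted gradient is simply the new
// momentum m_new = lr * g + momentum * m itself.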
template <>
__global__ void MomentumSGDKernel<false>(
    const int N,
    const float* g,
    const float* m,
    float* ng,
    float* nm,
    const float* lr,
    const float momentum,
    float* param) {
  const float LR = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    const float adjusted_gradient = LR * g[i] + momentum * m[i];
    nm[i] = adjusted_gradient;
    ng[i] = adjusted_gradient;
    if (param != nullptr) {
      param[i] -= adjusted_gradient;
    }
  }
}
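// Host-side dispatcher: selects the Nesterov or classic kernel at runtime and
// launches it on the context's CUDA stream.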
template <>
void momentum_sgd_update<CUDAContext>(
    const int N,
    const float* g,
    const float* m,
    float* ng,
    float* nm,
    const float* lr,
    const float momentum,
    const bool nesterov,
    float* param,
    CUDAContext* context) {
  if (nesterov) {
    MomentumSGDKernel<true>
        <<<CaffeGetBlocksSGD(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    MomentumSGDKernel<false>
        <<<CaffeGetBlocksSGD(N),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  }
}
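// A minimal usage sketch (hypothetical names: `ctx`, `grad`, `mom`, `param`
// are assumed to be a live CUDAContext and device buffers of length N, and
// `lr_dev` a device pointer to one float):
//
//   momentum_sgd_update<CUDAContext>(
//       N, grad, mom, grad, mom, lr_dev, 0.9f, /*nesterov=*/true, param,
//       &ctx);
//
// ng/nm may alias g/m as above, since each thread reads g[i] and m[i] before
// writing them, so in-place updates are safe.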
// Sparse variant: gradient element i belongs to slice i / sz of the gradient,
// and indices maps that slice to the row of param (and param_mom) it updates.
template <typename SIndex>
__global__ void SparseMomentumSGDKernel(
    const size_t N,
    const size_t sz,
    const float momentum,
    const bool nesterov,
    float* param,
    float* param_mom,
    const SIndex* indices,
    const float* gradIn,
    float* gradOut,
    const float* lr) {
  const float LR = lr[0];
  CUDA_1D_KERNEL_LOOP(i, N) {
    const size_t gradIdx = i;
    // Map dense position i in the gradient to its slot in the parameter:
    // row indices[i / sz], offset i % sz within the slice.
    const SIndex index = indices[i / sz];
    const size_t paramIdx = index * sz + (i % sz);
    if (!nesterov) {
      const float adjusted_gradient =
          LR * gradIn[gradIdx] + momentum * param_mom[paramIdx];
      gradOut[gradIdx] = adjusted_gradient;
      param_mom[paramIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    } else {
      const float mom_old = param_mom[paramIdx];
      const float mom_new = LR * gradIn[gradIdx] + momentum * mom_old;
      param_mom[paramIdx] = mom_new;
      // Same Nesterov correction as the dense kernel:
      // (1 + momentum) * mom_new - momentum * mom_old.
      const float adjusted_gradient =
          (1 + momentum) * mom_new - momentum * mom_old;
      gradOut[gradIdx] = adjusted_gradient;
      param[paramIdx] -= adjusted_gradient;
    }
  }
}
// Specialization of DoRunWithType for CUDA.
template <>
template <typename SIndex>
bool SparseMomentumSGDUpdateOp<float, CUDAContext>::DoRunWithType() {
  auto N = Input(GRAD).size();
  // Size of each parameter slice: the product of the gradient dimensions
  // beyond the indexed ones.
  auto grad_slice_sz = Input(GRAD).size_from_dim(Input(INDICES).ndim());
  SparseMomentumSGDKernel<SIndex>
      <<<CAFFE_GET_BLOCKS(N),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          N,
          grad_slice_sz,
          momentum_,
          nesterov_,
          Output(OUTPUT_PARAM)->template mutable_data<float>(),
          Output(OUTPUT_MOMENTUM)->template mutable_data<float>(),
          Input(INDICES).template data<SIndex>(),
          Input(GRAD).template data<float>(),
          Output(OUTPUT_GRAD)->template mutable_data<float>(),
          Input(LR).template data<float>());
  C10_CUDA_KERNEL_LAUNCH_CHECK();
  return true;
}
REGISTER_CUDA_OPERATOR(MomentumSGD, MomentumSGDOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    MomentumSGDUpdate,
    MomentumSGDUpdateOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    SparseMomentumSGDUpdate,
    SparseMomentumSGDUpdateOp<float, CUDAContext>);
} // namespace caffe2