File: smooth_l1_loss_op.cu

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
links: PTS, VCS
area: main
in suites: bookworm
size: 139,252 kB
sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (185 lines) | stat: -rw-r--r-- 6,528 bytes
/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "caffe2/core/context_gpu.h"
#include "modules/detectron/smooth_l1_loss_op.h"

namespace caffe2 {

namespace {
template <typename T>
__global__ void SmoothL1Kernel(
    const int n, const T* in, T* out, T beta) {
  // f(x) = 0.5 * x^2 / beta      if |x| < beta
  //        |x| - 0.5 * beta      otherwise
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    if (abs_val < beta) {
      out[index] = 0.5 * val * val / beta;
    } else {
      out[index] = abs_val - 0.5 * beta;
    }
  }
}

template <typename T>
__global__ void SmoothL1GradientKernel(
    const int n,
    const T* in,
    T* out,
    const T* d_loss_data,
    T norm,
    T beta) {
  // f'(x) = x / beta     if |x| < beta
  //       = sign(x)      otherwise
  // We also scale by norm * d_loss in this kernel for convenience
  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = in[index];
    T abs_val = c10::cuda::compat::abs(val);
    T d_loss = *d_loss_data;
    if (abs_val < beta) {
      out[index] = norm * d_loss * val / beta;
    } else {
      out[index] = norm * d_loss * ((T(0) < val) - (val < T(0)));
    }
  }
}
} // namespace

template<>
bool SmoothL1LossOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat     = Input(0);
  auto& Y         = Input(1);
  auto& alpha_in  = Input(2);
  auto& alpha_out = Input(3);


  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);

  // Element-wise smooth l1 loss
  // l := SmoothL1(alpha_in * (y_hat - y))
  SmoothL1Kernel<float>
  <<<CAFFE_GET_BLOCKS(buff_.size()),
     CAFFE_CUDA_NUM_THREADS,
     0,
     context_.cuda_stream()>>>(
          buff_.size(), buff_.data<float>(), buff_.mutable_data<float>(),
          beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise weighted smooth l1 loss (can be used to specify a per-element
  // loss weight)
  // l := alpha_out * SmoothL1(alpha_in * (y_hat - y))
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_out.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Sum of all losses
  // al := sum_i l_i
  float* avg_loss_data = avg_loss->mutable_data<float>();
  math::Sum<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), avg_loss_data, &context_);
  // Average of input batch size
  // al := 1/N * al
  math::Scale<float, float, CUDAContext>(
      1, scale_ / N, avg_loss_data, avg_loss_data, &context_);
  return true;
}

template<>
bool SmoothL1LossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& Y_hat      = Input(0);
  auto& Y          = Input(1);
  auto& alpha_in   = Input(2);
  auto& alpha_out  = Input(3);
  auto& d_avg_loss = Input(4);  // gradient of net w.r.t. avg_loss ("gradOuput")
  // We intentially don't compute gradients for Y, alpha_{in,out} since they
  // are not needed (can change in the future if desired)

  int N = Y.dim32(0);
  // Require the same number of elements along axis 0 (batch size), but
  // otherwise don't care about the shape (just the number of elements)
  CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0),
      "Y_hat and Y must have the same number of elements along axis 0");
  CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(),
      "Y_hat and Y must have the same number of elements");
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size());
  CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size());
  CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1);

  auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype<float>()); // gradient of net w.r.t. Y_hat ("gradInput")
  buff_.ResizeLike(Y);

  // Difference
  // d := y_hat - y
  math::Sub<float, CUDAContext>(
      Y.size(), Y_hat.data<float>(), Y.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // Element-wise weighted difference (can be used to ignore or reweight
  // specific components)
  // d := alpha_in * (y_hat - y)
  math::Mul<float, CUDAContext>(
      buff_.size(), buff_.data<float>(), alpha_in.data<float>(),
      buff_.mutable_data<float>(), &context_);
  // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y))
  SmoothL1GradientKernel<float>
  <<<CAFFE_GET_BLOCKS(buff_.size()),
     CAFFE_CUDA_NUM_THREADS,
     0,
     context_.cuda_stream()>>>(
         buff_.size(), buff_.data<float>(), d_Y_hat->mutable_data<float>(),
         d_avg_loss.data<float>(), scale_ / N, beta_);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  // Element-wise scale by alpha_in and alpha_out
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_in.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  math::Mul<float, CUDAContext>(
      d_Y_hat->size(), d_Y_hat->data<float>(), alpha_out.data<float>(),
      d_Y_hat->mutable_data<float>(), &context_);
  return true;
}


REGISTER_CUDA_OPERATOR(SmoothL1Loss,
                       SmoothL1LossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SmoothL1LossGradient,
                       SmoothL1LossGradientOp<float, CUDAContext>);
}  // namespace caffe2