1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
#include "caffe2/operators/batch_sparse_to_dense_op.h"
#include "caffe2/utils/cub_namespace.cuh"
#include <cub/device/device_scan.cuh>
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
namespace {
template <typename TLen>
void array_prefix_sum_inclusive(
const TLen* dev_array,
const int num_items,
Tensor& prefix_buffer,
Tensor& prefix_sum,
CUDAContext& context) {
// Retrieve buffer size
size_t temp_storage_bytes = 0;
prefix_sum.Resize(num_items);
cub::DeviceScan::InclusiveSum(
nullptr,
temp_storage_bytes,
dev_array,
prefix_sum.mutable_data<TLen>(),
num_items,
context.cuda_stream());
// Allocate temporary storage
auto buffer_size = (temp_storage_bytes + sizeof(TLen)) / sizeof(TLen);
prefix_buffer.Resize(buffer_size);
void* dev_temp_storage =
static_cast<void*>(prefix_buffer.mutable_data<TLen>());
// Inclusive sum
cub::DeviceScan::InclusiveSum(
dev_temp_storage,
temp_storage_bytes,
dev_array,
prefix_sum.mutable_data<TLen>(),
num_items,
context.cuda_stream());
}
template <typename TLen, typename TInd>
__global__ void FillInDenseValuesKernel(
const int64_t batch_size,
const int64_t dense_last_dim,
const TInd* indices_data,
const float* values_data,
const TLen* L_cum_sum_data,
float* output_data) {
CUDA_1D_KERNEL_LOOP(idx, batch_size) {
int offset_start = idx == 0 ? 0 : L_cum_sum_data[idx - 1];
int offset_end = L_cum_sum_data[idx];
for (int q = offset_start; q < offset_end; q++) {
int indice = indices_data[q];
float val = values_data[q];
output_data[idx * dense_last_dim + indice] = val;
}
}
}
template <typename TLen, typename TInd>
__global__ void FillInSparseValuesKernel(
const int64_t batch_size,
const int64_t dense_last_dim,
const TInd* indices_data,
const float* dense_data,
const TLen* L_cum_sum_data,
float* output_data) {
CUDA_1D_KERNEL_LOOP(idx, batch_size) {
int offset_start = idx == 0 ? 0 : L_cum_sum_data[idx - 1];
int offset_end = L_cum_sum_data[idx];
for (int q = offset_start; q < offset_end; q++) {
int indice = indices_data[q];
output_data[q] = dense_data[idx * dense_last_dim + indice];
}
}
}
} // namespace
template <>
template <typename TLen, typename TInd>
void BatchSparseToDenseOp<float, CUDAContext>::FillInDenseValues(
const int64_t batch_size,
const int64_t indice_lengths,
const TLen* lengths_data,
const TInd* indices_data,
const float* values_data,
float* output_data,
CUDAContext* context) {
// calculate the prefix sum of the length array
array_prefix_sum_inclusive<TLen>(
lengths_data, batch_size, len_prefix_tmp_, len_prefix_sum_, context_);
// launch the gpu kernel for to fill in dense values
const int64_t min_size = 1;
FillInDenseValuesKernel<TLen, TInd><<<
CAFFE_GET_BLOCKS(std::max(batch_size, min_size)),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
batch_size,
dense_last_dim_,
indices_data,
values_data,
len_prefix_sum_.data<TLen>(),
output_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
template <>
template <typename TLen, typename TInd>
void BatchDenseToSparseOp<float, CUDAContext>::FillInSparseValues(
const int64_t batch_size,
const int64_t indice_lengths,
const TLen* lengths_data,
const TInd* indices_data,
const float* dense_data,
float* output_data,
CUDAContext* context) {
// calculate the prefix sum of the length array
array_prefix_sum_inclusive<TLen>(
lengths_data, batch_size, len_prefix_tmp_, len_prefix_sum_, context_);
// launch the gpu kernel for to fill in sparse values
const int64_t min_size = 1;
FillInSparseValuesKernel<TLen, TInd><<<
CAFFE_GET_BLOCKS(std::max(batch_size, min_size)),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
batch_size,
dense_last_dim_,
indices_data,
dense_data,
len_prefix_sum_.data<TLen>(),
output_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
REGISTER_CUDA_OPERATOR(
BatchSparseToDense,
BatchSparseToDenseOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
BatchDenseToSparse,
BatchDenseToSparseOp<float, CUDAContext>);
} // namespace caffe2
|