1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
|
#include <algorithm>
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/boolean_unmask_ops.h"
namespace caffe2 {
namespace {
__global__ void ComputeIndicesKernel(
const int numMasks,
const int maskSize,
int* indices,
bool* const masks[]) {
CUDA_1D_KERNEL_LOOP(i, maskSize) {
for (int j = 0; j < numMasks; ++j) {
if (masks[j][i]) {
indices[i] = j;
return;
}
}
CUDA_KERNEL_ASSERT(false);
}
}
__global__ void FillValuesKernel(
const int numMasks,
const int maskSize,
const size_t itemSize,
const int* indices,
char* const values[],
int* valueSizes,
char* dest) {
CUDA_1D_KERNEL_LOOP(j, numMasks) {
int k = 0;
for (int i = 0; i < maskSize; ++i) {
if (indices[i] == j) {
for (int h = 0; h < itemSize; ++h) {
dest[i * itemSize + h] = values[j][k * itemSize + h];
}
++k;
}
}
CUDA_KERNEL_ASSERT(valueSizes[j] == k);
}
}
} // namespace
template <>
class BooleanUnmaskOp<CUDAContext> final : public Operator<CUDAContext> {
public:
BooleanUnmaskOp(const OperatorDef& def, Workspace* ws)
: Operator<CUDAContext>(def, ws) {}
bool RunOnDevice() override {
int maskSize = Input(0).numel();
int numMasks = InputSize() / 2;
const auto& meta = Input(1).meta();
auto* out = Output(0);
out->Resize(maskSize);
auto* dest = (char*)out->raw_mutable_data(meta);
ReinitializeTensor(&hostMasks_, {numMasks}, at::dtype<bool*>().device(CPU));
auto* hostMasksData = hostMasks_.mutable_data<bool*>();
ReinitializeTensor(
&hostValues_, {numMasks}, at::dtype<char*>().device(CPU));
auto* hostValuesData = hostValues_.mutable_data<char*>();
ReinitializeTensor(
&hostValueSizes_, {numMasks}, at::dtype<int>().device(CPU));
auto* hostValueSizesData = hostValueSizes_.mutable_data<int>();
for (int i = 0; i < numMasks; ++i) {
auto& mask = Input(i * 2);
CAFFE_ENFORCE_EQ(mask.dim(), 1);
CAFFE_ENFORCE_EQ(mask.numel(), maskSize);
hostMasksData[i] = const_cast<bool*>(mask.data<bool>());
const auto& value = Input(i * 2 + 1);
CAFFE_ENFORCE_EQ(value.dim(), 1);
hostValuesData[i] = (char*)value.raw_data();
hostValueSizesData[i] = value.numel();
}
masks_.CopyFrom(hostMasks_);
values_.CopyFrom(hostValues_);
valueSizes_.CopyFrom(hostValueSizes_);
ReinitializeTensor(&indices_, {maskSize}, at::dtype<int>().device(CUDA));
auto* indicesData = indices_.mutable_data<int>();
ComputeIndicesKernel<<<
std::min(maskSize, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
numMasks, maskSize, indicesData, masks_.data<bool*>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
auto* valueSizesData = valueSizes_.mutable_data<int>();
FillValuesKernel<<<
std::min(numMasks, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(
numMasks,
maskSize,
meta.itemsize(),
indicesData,
values_.data<char*>(),
valueSizesData,
dest);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
private:
Tensor indices_;
Tensor masks_{CUDA};
Tensor values_{CUDA};
Tensor valueSizes_{CUDA};
Tensor hostMasks_;
Tensor hostValues_;
Tensor hostValueSizes_;
};
REGISTER_CUDA_OPERATOR(BooleanUnmask, BooleanUnmaskOp<CUDAContext>);
} // caffe2
|