File: weighted_sample_op.cu

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (104 lines) | stat: -rw-r--r-- 2,878 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#include <cstdint>

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/weighted_sample_op.h"
#include "caffe2/utils/math.h"

namespace caffe2 {
namespace {

// Picks one column index per row of a row-major [batch_size, weights_dim]
// weight matrix by inverse-CDF sampling over that row's weights.
//
// Parameters:
//   batch_size      - number of rows to sample from.
//   weights_dim     - number of weights (columns) per row.
//   in_weights_data - batch_size * weights_dim weights, row-major.
//   in_val_data     - optional values with the same layout as the weights;
//                     only read when out_val_data is non-null.
//   samples         - batch_size random draws supplied by the caller (the
//                     host code in this file fills it with
//                     curandGenerateUniform). Overwritten in place with the
//                     draw scaled by the row's weight total.
//   out_idx_data    - receives the chosen column index for each row.
//   out_val_data    - optional; when non-null receives the value stored at
//                     the chosen column of each row.
//
// Each iteration of CUDA_1D_KERNEL_LOOP handles one row index i and walks
// that row sequentially, so the per-row cost is linear in weights_dim.
__global__ void WeightedSampleKernel(
    const int batch_size,
    const int weights_dim,
    const float* in_weights_data,
    const float* in_val_data,
    float* samples,
    int* out_idx_data,
    float* out_val_data) {
  CUDA_1D_KERNEL_LOOP(i, batch_size) {
    // Keep the row offset in 64 bits: i * weights_dim does not fit in a
    // 32-bit int once batch_size * weights_dim exceeds INT_MAX, which would
    // turn into an out-of-bounds access.
    const int64_t offset = static_cast<int64_t>(i) * weights_dim;
    const float* row_weights = in_weights_data + offset;

    // Total weight of the row; used to scale the caller's draw.
    float sum = 0.0f;
    for (int j = 0; j < weights_dim; j++) {
      sum += row_weights[j];
    }

    // Scale the draw once, keep it in a register for the scan below, and
    // write it back so the buffer holds the same value as before this change.
    const float threshold = samples[i] * sum;
    samples[i] = threshold;

    // Walk the running sum until it reaches the scaled draw.
    float cum_sum = 0.0f;
    int j = 0;
    for (; j < weights_dim; j++) {
      cum_sum += row_weights[j];
      if (cum_sum >= threshold) {
        break;
      }
    }

    // If the loop ran to completion j == weights_dim; clamp to the last
    // valid column so the index is always in range.
    const int picked = min(j, weights_dim - 1);
    out_idx_data[i] = picked;

    if (out_val_data) {
      out_val_data[i] = in_val_data[offset + picked];
    }
  }
}

} // namespace

// CUDA implementation of WeightedSample for float weights.
//
// Input(0) holds the sampling weights and is read as [rows, cols] via
// dim(0)/dim(1). When the operator has two outputs, Input(1) holds values
// with the same sizes as the weights. Output(0) receives one int index per
// row with shape {rows, 1}; Output(1), when present, receives the value at
// that index with shape {rows, 1}. If either dimension of the weights is not
// positive, every output is produced with shape {0} and no kernel is
// launched. Always returns true; failed checks are reported through the
// CAFFE_ENFORCE / CURAND_ENFORCE / launch-check macros.
template <>
bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
  CAFFE_ENFORCE_EQ(
      InputSize(),
      OutputSize(),
      "The number of tensors of the input and the output must be the same.");

  const auto& weights = Input(0);
  int num_rows = weights.dim(0);
  int num_cols = weights.dim(1);

  // Nothing to sample from: emit empty outputs and finish early.
  if (num_rows <= 0 || num_cols <= 0) {
    Output(0, {0}, at::dtype<int>());
    if (OutputSize() == 2) {
      Output(1, {0}, at::dtype<float>());
    }
    return true;
  }

  auto* idx_tensor = Output(0, {num_rows, 1}, at::dtype<int>());
  // Scratch buffer holding one random draw per row.
  ReinitializeTensor(
      &unif_samples_, {num_rows}, at::dtype<float>().device(CUDA));

  const float* weights_ptr = weights.data<float>();
  int* idx_ptr = idx_tensor->mutable_data<int>();

  // The value pointers stay null unless the optional second input/output
  // pair is in use; the kernel skips the value lookup for a null output.
  const float* values_ptr = nullptr;
  float* picked_values_ptr = nullptr;
  if (OutputSize() == 2) {
    const auto& values = Input(1);
    CAFFE_ENFORCE_EQ(
        weights.sizes(),
        values.sizes(),
        "The sampling weights tensor and the sampling values tensor must have the same dimensions.");
    values_ptr = values.data<float>();

    auto* values_out = Output(1, {num_rows, 1}, at::dtype<float>());
    picked_values_ptr = values_out->mutable_data<float>();
  }

  // Fill the scratch buffer with uniform random numbers on the device.
  float* draws_ptr = unif_samples_.mutable_data<float>();
  CURAND_ENFORCE(curandGenerateUniform(
      context_.curand_generator(), draws_ptr, num_rows));

  // Launch on the context's stream; the kernel handles one row per loop
  // iteration.
  WeightedSampleKernel<<<
      CAFFE_GET_BLOCKS(num_rows),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      num_rows,
      num_cols,
      weights_ptr,
      values_ptr,
      draws_ptr,
      idx_ptr,
      picked_values_ptr);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

// Associates the operator name WeightedSample with the float/CUDAContext
// specialization of WeightedSampleOp defined in this file.
REGISTER_CUDA_OPERATOR(WeightedSample, WeightedSampleOp<float, CUDAContext>);
} // namespace caffe2