File: batch_bucketize_op.cc

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (126 lines) | stat: -rw-r--r-- 4,243 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "batch_bucketize_op.h"

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

// CPU implementation of BatchBucketize.
//
// Inputs (accessed through the operator's input tags):
//   FEATURE    - 2-D float tensor of shape {batch_size, feature_dim}.
//   INDICES    - 1-D int32 tensor; INDICES[j] is the column of FEATURE that
//                feeds output column j.
//   BOUNDARIES - 1-D float tensor; the per-column boundary lists laid out
//                back to back.
//   LENGTHS    - 1-D int32 tensor with as many entries as INDICES;
//                LENGTHS[j] is the number of boundaries owned by column j.
//
// Output:
//   O - 2-D int32 tensor of shape {batch_size, INDICES.numel()}. Entry (i, j)
//       is the position of the first boundary of column j that is greater
//       than or equal to FEATURE[i][INDICES[j]], or LENGTHS[j] when no such
//       boundary exists.
//
// Fails a CAFFE_ENFORCE check when the tensor ranks are wrong, when INDICES
// and LENGTHS differ in size, when an index falls outside [0, feature_dim),
// when a length is negative, or when the lengths do not sum to the number of
// boundaries. Always returns true otherwise.
template <>
bool BatchBucketizeOp<CPUContext>::RunOnDevice() {
  const auto& feature = Input(FEATURE);
  const auto& indices = Input(INDICES);
  const auto& boundaries = Input(BOUNDARIES);
  const auto& lengths = Input(LENGTHS);

  CAFFE_ENFORCE_EQ(lengths.dim(), 1);
  CAFFE_ENFORCE_EQ(indices.dim(), 1);
  CAFFE_ENFORCE_EQ(boundaries.dim(), 1);
  CAFFE_ENFORCE_EQ(feature.dim(), 2);
  CAFFE_ENFORCE_EQ(lengths.numel(), indices.numel());

  const auto* lengths_data = lengths.template data<int32_t>();
  const auto* indices_data = indices.template data<int32_t>();
  const auto* boundaries_data = boundaries.template data<float>();
  const auto* feature_data = feature.template data<float>();
  const auto batch_size = feature.size(0);
  const auto feature_dim = feature.size(1);
  const auto output_dim = indices.numel();

  // Validate everything that is later used as an offset before touching the
  // data. An index equal to feature_dim (or a negative one) would read
  // outside the feature row, and a negative length would leave output cells
  // unwritten, so both are rejected here.
  int64_t length_sum = 0;
  for (int64_t j = 0; j < output_dim; ++j) {
    CAFFE_ENFORCE_GE(
        indices_data[j], 0, "indices[", j, "] must be non-negative");
    CAFFE_ENFORCE_LT(
        indices_data[j],
        feature_dim,
        "indices[",
        j,
        "] must be smaller than the feature dimension");
    CAFFE_ENFORCE_GE(
        lengths_data[j], 0, "lengths[", j, "] must be non-negative");
    length_sum += lengths_data[j];
  }
  CAFFE_ENFORCE_EQ(length_sum, boundaries.numel());

  auto* output = Output(O, {batch_size, output_dim}, at::dtype<int32_t>());
  auto* output_data = output->template mutable_data<int32_t>();

  for (int64_t i = 0; i < batch_size; ++i) {
    const float* feature_row = feature_data + i * feature_dim;
    int32_t* output_row = output_data + i * output_dim;
    // Offset of the current column's boundary list inside `boundaries`;
    // restarts at zero for every row because all rows share the same lists.
    int64_t boundary_offset = 0;
    for (int64_t j = 0; j < output_dim; ++j) {
      const float value = feature_row[indices_data[j]];
      const int32_t num_boundaries = lengths_data[j];
      // Linear scan for the first boundary with value <= boundary. The
      // condition is spelled as !(value <= boundary) rather than
      // value > boundary so that a NaN feature falls through to the last
      // bucket, exactly as a failed `<=` test would.
      int32_t bucket = 0;
      while (bucket < num_boundaries &&
             !(value <= boundaries_data[boundary_offset + bucket])) {
        ++bucket;
      }
      output_row[j] = bucket;
      boundary_offset += num_boundaries;
    }
  }
  return true;
}

// Registers BatchBucketizeOp<CPUContext> as the CPU implementation of the
// "BatchBucketize" operator.
REGISTER_CPU_OPERATOR(BatchBucketize, BatchBucketizeOp<CPUContext>);

// Schema for BatchBucketize: four inputs, one output, plus the user-facing
// documentation for the operator and each of its arguments.
OPERATOR_SCHEMA(BatchBucketize)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Bucketize the float_features into sparse features.
The float_features is a N * D tensor where N is the batch_size, and D is the feature_dim.
The indices is a 1D tensor containing the indices of the features that need to be bucketized.
The lengths is a 1D tensor that splits the following 'boundaries' argument.
The boundaries is a 1D tensor containing the border list for each feature.

Within each batch, `indices` should not have duplicate number,
and the number of elements in `indices` should be less than or equal to `D`.
Each element in `lengths` vector (lengths[`i`]) represents
the number of boundaries in the sub border list.
The sum of all elements in `lengths` must be equal to the size of `boundaries`.
If lengths[0] = 2, the first sub border list is [0.5, 1.0], which separate the
value to (-inf, 0.5], (0.5, 1.0], (1.0, inf). The bucketized feature will have
three possible values (i.e. 0, 1, 2).


For example, with input:

  float_features = [[1.42, 2.07, 3.19, 0.55, 4.32],
                    [4.57, 2.30, 0.84, 4.48, 3.09],
                    [0.89, 0.26, 2.41, 0.47, 1.05],
                    [0.03, 2.97, 2.43, 4.36, 3.11],
                    [2.74, 5.77, 0.90, 2.63, 0.38]]
  indices = [0, 1, 4]
  lengths = [2, 3, 1]
  boundaries =  [0.5, 1.0, 1.5, 2.5, 3.5, 2.5]

The output is:

  output =[[2, 1, 1],
           [2, 1, 1],
           [1, 0, 0],
           [0, 2, 1],
           [2, 3, 0]]

after running this operator.
)DOC")
    .Input(
        0,
        "float_features",
        "2-D dense tensor, the second dimension must be greater or equal to the indices dimension")
    .Input(
        1,
        "indices",
        "Flatten tensor, containing the indices of `float_features` to be bucketized. The datatype must be int32.")
    // NOTE(review): RunOnDevice fetches its last two inputs through tags named
    // BOUNDARIES and LENGTHS; the positions documented below list lengths
    // before boundaries. Confirm against the INPUT_TAGS declaration in
    // batch_bucketize_op.h that positions 2 and 3 are described correctly.
    .Input(
        2,
        "lengths",
        "Flatten tensor, the size must be equal to that of `indices`. The datatype must be int32.")
    .Input(
        3,
        "boundaries",
        "Flatten tensor, dimension has to match the sum of lengths")
    .Output(
        0,
        "bucktized_feat",
        "2-D dense tensor, with 1st dim = float_features.dim(0), 2nd dim = size(indices) "
        "in the arg list, the tensor is of data type int32.");

// BatchBucketize is declared as having no gradient.
NO_GRADIENT(BatchBucketize);

} // namespace caffe2