File: manual_awkward_IndexedArray_reduce_next_64.cu

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (108 lines) | stat: -rw-r--r-- 4,240 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE

#define FILENAME(line)          \
  FILENAME_FOR_EXCEPTIONS_CUDA( \
      "src/cuda-kernels/awkward_IndexedArray_reduce_next_64.cu", line)

#include "standard_parallel_algorithms.h"
#include "awkward/kernels.h"

template <typename T>
__global__ void
awkward_IndexedArray_reduce_next_64_filter_mask(int8_t* filtered_mask,
                                                const T* index,
                                                int64_t length) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (thread_id < length) {
    if (index[thread_id] >= 0) {
      filtered_mask[thread_id] = 1;
    }
  }
}

template <typename T>
__global__ void
awkward_IndexedArray_reduce_next_64_kernel(int64_t* nextcarry,
                                           int64_t* nextparents,
                                           int64_t* outindex,
                                           const T* index,
                                           const int64_t* parents,
                                           int64_t length,
                                           int64_t* prefixed_mask) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (thread_id < length) {
    if (index[thread_id] >= 0) {
      nextcarry[prefixed_mask[thread_id] - 1] = index[thread_id];
      nextparents[prefixed_mask[thread_id] - 1] = parents[thread_id];
      outindex[thread_id] = prefixed_mask[thread_id] - 1;
    } else {
      outindex[thread_id] = -1;
    }
  }
}

template <typename T>
ERROR
awkward_IndexedArray_reduce_next_64(int64_t* nextcarry,
                                    int64_t* nextparents,
                                    int64_t* outindex,
                                    const T* index,
                                    const int64_t* parents,
                                    int64_t length) {
  dim3 blocks_per_grid = blocks(length);
  dim3 threads_per_block = threads(length);

  int8_t* filtered_mask;
  HANDLE_ERROR(cudaMalloc((void**)&filtered_mask, sizeof(int8_t) * length));
  HANDLE_ERROR(cudaMemset(
      filtered_mask, 0, sizeof(int8_t) * length));

  awkward_IndexedArray_reduce_next_64_filter_mask<<<blocks_per_grid,
                                                    threads_per_block>>>(
      filtered_mask, index, length);

  int64_t* prefixed_mask;
  HANDLE_ERROR(cudaMalloc((void**)&prefixed_mask, sizeof(int64_t) * length));

  exclusive_scan(prefixed_mask, filtered_mask, length);

  awkward_IndexedArray_reduce_next_64_kernel<<<blocks_per_grid,
                                               threads_per_block>>>(
      nextcarry, nextparents, outindex, index, parents, length, prefixed_mask);

  cudaDeviceSynchronize();

  return success();
}
ERROR
awkward_IndexedArray32_reduce_next_64(int64_t* nextcarry,
                                      int64_t* nextparents,
                                      int64_t* outindex,
                                      const int32_t* index,
                                      int64_t* parents,
                                      int64_t length) {
  return awkward_IndexedArray_reduce_next_64<int32_t>(
      nextcarry, nextparents, outindex, index, parents, length);
}
ERROR
awkward_IndexedArrayU32_reduce_next_64(int64_t* nextcarry,
                                       int64_t* nextparents,
                                       int64_t* outindex,
                                       const uint32_t* index,
                                       int64_t* parents,
                                       int64_t length) {
  return awkward_IndexedArray_reduce_next_64<uint32_t>(
      nextcarry, nextparents, outindex, index, parents, length);
}
ERROR
awkward_IndexedArray64_reduce_next_64(int64_t* nextcarry,
                                      int64_t* nextparents,
                                      int64_t* outindex,
                                      const int64_t* index,
                                      int64_t* parents,
                                      int64_t length) {
  return awkward_IndexedArray_reduce_next_64<int64_t>(
      nextcarry, nextparents, outindex, index, parents, length);
}