File: manual_awkward_NumpyArray_getitem_boolean_nonzero.cu

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (90 lines) | stat: -rw-r--r-- 2,222 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#define FILENAME(line) FILENAME_FOR_EXCEPTIONS_CUDA("src/cuda-kernels/manual_awkward_NumpyArray_getitem_boolean_nonzero.cu", line)

#include "awkward/kernels.h"
#include "standard_parallel_algorithms.h"

__global__ void
awkward_NumpyArray_getitem_boolean_nonzero_filter_mask(
    const int8_t* fromptr,
    int64_t* filtered_index,
    int64_t stride,
    int64_t length) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if(thread_id < length  && thread_id % stride == 0) {
    if (fromptr[thread_id] != 0) {
      filtered_index[thread_id] = 1;
    }
  }
}

template <typename T>
__global__ void
awkward_NumpyArray_getitem_boolean_nonzero_kernel(
    T* toptr,
    int64_t* prefixedsum_mask,
    const int8_t* fromptr,
    int64_t length,
    int64_t stride) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if(thread_id < length && thread_id % stride == 0) {
    if(fromptr[thread_id] != 0) {
      toptr[prefixedsum_mask[thread_id] - 1] = thread_id;
    }
  }
}


template <typename T>
ERROR awkward_NumpyArray_getitem_boolean_nonzero(
    T* toptr,
    const int8_t* fromptr,
    int64_t length,
    int64_t stride) {

  int64_t* res_temp;
  int64_t* filtered_index;

  dim3 blocks_per_grid = blocks(length);
  dim3 threads_per_block = threads(length);

  HANDLE_ERROR(cudaMalloc((void**)&res_temp, sizeof(int64_t) * length));
  HANDLE_ERROR(cudaMalloc((void**)&filtered_index, sizeof(int64_t) * length));
  HANDLE_ERROR(cudaMemset(filtered_index, 0, sizeof(int64_t) * length));


  awkward_NumpyArray_getitem_boolean_nonzero_filter_mask<<<
  blocks_per_grid,
  threads_per_block>>>(fromptr, filtered_index, stride, length);


  exclusive_scan<int64_t, int64_t>(res_temp, filtered_index, length);


  awkward_NumpyArray_getitem_boolean_nonzero_kernel<<<blocks_per_grid,
  threads_per_block>>>(
      toptr,
      res_temp,
      fromptr,
      length,
      stride);

  cudaDeviceSynchronize();

  return success();


}

ERROR awkward_NumpyArray_getitem_boolean_nonzero_64(
    int64_t* toptr,
    const int8_t* fromptr,
    int64_t length,
    int64_t stride) {
  return awkward_NumpyArray_getitem_boolean_nonzero<int64_t>(
      toptr,
      fromptr,
      length,
      stride);
}