File: notcomplete_manual_awkward_IndexedArray_getitem_adjust_outindex.cu

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (102 lines) | stat: -rw-r--r-- 3,968 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// FILENAME(line) forwards this file's repository-relative path together with
// the given line number to FILENAME_FOR_EXCEPTIONS_CUDA. The two adjacent
// string literals are concatenated by the compiler into a single path.
// FILENAME_FOR_EXCEPTIONS_CUDA is not defined in this file; presumably it
// comes from one of the project headers included below -- confirm.
#define FILENAME(line)                                          \
  FILENAME_FOR_EXCEPTIONS_CUDA(                                 \
      "src/cuda-kernels/"                                       \
      "manual_awkward_IndexedArray_getitem_adjust_outindex.cu", \
      line)

#include "awkward/kernels.h"
#include "standard_parallel_algorithms.h"

// Flags the masked (negative) entries of `fromindex`.
//
// For every position i in [0, length) this kernel writes
//   tomask[i]     = 1 if fromindex[i] < 0, otherwise 0
//   filtered_k[i] = 1 if fromindex[i] < 0, otherwise 0
// so both output buffers are fully defined after the launch.
//
// Parameters (all pointers are dereferenced on the device):
//   fromindex   input index values; only read, hence const
//   filtered_k  output flags, at least `length` elements
//   tomask      output mask, at least `length` elements
//   length      number of elements to process
//
// Launch layout: 1-D grid of 1-D blocks with at least `length` threads in
// total; surplus threads exit through the bounds check.
//
// NOTE(review): the previous body ended in an unfinished
// `else if (thread_id < nonzerolength)` branch with no statement, naming a
// value that is not a parameter of this kernel. That branch has been dropped.
// If non-masked entries are also supposed to contribute to the k flags, the
// kernel needs additional inputs -- confirm the intended criterion.
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_filter_k_and_mask(
    const int64_t* fromindex, int8_t* filtered_k, int8_t* tomask, int64_t length) {
  // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
  // evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
  int64_t thread_id =
      (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;

  if (thread_id < length) {
    // Read the input once and derive both outputs from the same value.
    int8_t is_masked = (fromindex[thread_id] < 0) ? 1 : 0;
    tomask[thread_id] = is_masked;
    filtered_k[thread_id] = is_masked;
  }
}

// Writes per-element flags into `filtered_j` and the mask into `tomask`.
//
// For every position i in [0, length) this kernel writes
//   tomask[i]     = 1 if fromindex[i] < 0, otherwise 0
//   filtered_j[i] = 1 if fromindex[i] < 0, otherwise 0
//
// Parameters (all pointers are dereferenced on the device):
//   fromindex   input index values; only read, hence const
//   filtered_j  output flags, at least `length` elements
//   tomask      output mask, at least `length` elements
//   length      number of elements to process
//
// Launch layout: 1-D grid of 1-D blocks with at least `length` threads in
// total; surplus threads exit through the bounds check.
//
// NOTE(review): the previous body wrote to `filtered_k`, which is not a
// parameter of this kernel, and ended in an unfinished
// `else if (thread_id < nonzerolength)` branch with no statement. The write
// now targets `filtered_j` and the unfinished branch has been dropped. As
// written, the flag criterion is the same "entry is negative" test used for
// the mask; whether the j flags should instead mark a different set of
// entries cannot be determined from this file -- confirm before relying on it.
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_filter_j(const int64_t* fromindex,
                                                      int8_t* filtered_j,
                                                      int8_t* tomask,
                                                      int64_t length) {
  // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
  // evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
  int64_t thread_id =
      (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;

  if (thread_id < length) {
    // Read the input once and derive both outputs from the same value.
    int8_t is_negative = (fromindex[thread_id] < 0) ? 1 : 0;
    tomask[thread_id] = is_negative;
    filtered_j[thread_id] = is_negative;
  }
}

// Fills `mask_out`, `starts_out` and `stops_out` for each of `length` entries.
//
// For every position i in [0, length), with p = prefixed_index[i] - 1:
//   starts_out[i] = offsets_in[p]
//   if index_in[i] < 0:  mask_out[i] = -1,  stops_out[i] = offsets_in[p]
//   otherwise:           mask_out[i] = i,   stops_out[i] = offsets_in[p + 1]
//
// Parameters (all pointers are dereferenced on the device):
//   prefixed_index  per-element position into `offsets_in`, offset by one
//   index_in        per-element index; a negative value marks the entry
//   offsets_in      values copied into starts_out/stops_out
//   mask_out, starts_out, stops_out  outputs, at least `length` elements each
//   length          number of elements to process
//
// Launch layout: 1-D grid of 1-D blocks with at least `length` threads in
// total; surplus threads exit through the bounds check.
//
// NOTE(review): offsets_in[p] is read unconditionally, so the caller must
// guarantee prefixed_index[i] >= 1 (p >= 0) and that p + 1 is a valid
// position for every non-negative index_in[i]. Nothing here checks that.
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_kernel(int64_t* prefixed_index,
                                                    int64_t* index_in,
                                                    int64_t* offsets_in,
                                                    int64_t* mask_out,
                                                    int64_t* starts_out,
                                                    int64_t* stops_out,
                                                    int64_t length) {
  // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be
  // evaluated in 32-bit unsigned arithmetic and wrap for very large grids.
  int64_t thread_id =
      (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;

  if (thread_id < length) {
    int64_t pre_in = prefixed_index[thread_id] - 1;
    starts_out[thread_id] = offsets_in[pre_in];

    if (index_in[thread_id] < 0) {
      // Marked entry: empty range (stop == start) and a -1 mask value.
      mask_out[thread_id] = -1;
      stops_out[thread_id] = offsets_in[pre_in];
    } else {
      // Regular entry: range ends at the next offset; mask holds its position.
      mask_out[thread_id] = thread_id;
      stops_out[thread_id] = offsets_in[pre_in + 1];
    }
  }
}

// Single-thread device loop that produces tomask, toindex and tononzero.
//
// Walks `fromindex` in order while carrying two counters:
//   k  next free slot in `toindex`
//   j  next unmatched position in `nonzero`
// For each i in [0, fromindexlength):
//   tomask[i] = (fromindex[i] < 0)
//   negative value          -> toindex[k] = -1;                      k++
//   value == nonzero[j]     -> tononzero[j] = value + (k - j);
//                              toindex[k] = j;                  j++, k++
//   anything else           -> nothing is written besides tomask[i]
//
// Both counters depend on every earlier element, so the loop is executed by
// exactly one thread; any other thread in the launch returns immediately.
// Slots of `toindex`/`tononzero` that the loop never reaches are left
// untouched.
//
// NOTE(review): this is intended to reproduce the sequential (CPU) kernel of
// the same name -- confirm against the project's kernel specification.
// TODO: a scan-based parallel version is only equivalent if ordering
// guarantees on `fromindex`/`nonzero` can be established; none are visible
// in this file.
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_64_serial_kernel(
    int8_t* tomask,
    int64_t* toindex,
    int64_t* tononzero,
    const int64_t* fromindex,
    int64_t fromindexlength,
    const int64_t* nonzero,
    int64_t nonzerolength) {
  if (blockIdx.x != 0 || threadIdx.x != 0) {
    return;
  }

  int64_t j = 0;
  int64_t k = 0;
  for (int64_t i = 0; i < fromindexlength; i++) {
    int64_t fromval = fromindex[i];
    tomask[i] = (fromval < 0);
    if (fromval < 0) {
      toindex[k] = -1;
      k++;
    } else if (j < nonzerolength && fromval == nonzero[j]) {
      // (k - j) is the number of negative entries emitted so far, i.e. the
      // shift applied to this matched value.
      tononzero[j] = fromval + (k - j);
      toindex[k] = j;
      j++;
      k++;
    }
  }
}

// Host entry point: computes tomask, toindex and tononzero on the device.
//
// Parameters (all pointers are passed straight to a kernel, so they must be
// device-accessible):
//   tomask           output, written for every i in [0, fromindexlength)
//   toindex          output, one slot per negative or matched input entry
//   tononzero        output, one slot per matched `nonzero` entry
//   fromindex        input index values
//   fromindexlength  number of elements in `fromindex`
//   nonzero          input values matched, in order, against `fromindex`
//   nonzerolength    number of elements in `nonzero`
//
// Returns success(). CUDA launch and execution errors are routed through
// HANDLE_ERROR, the status-check macro this function already relied on; it
// is defined outside this file, so its exact failure behaviour should be
// confirmed there.
//
// No scratch buffers are allocated, so there is nothing to free.
ERROR
awkward_IndexedArray_getitem_adjust_outindex_64(int8_t* tomask,
                                                int64_t* toindex,
                                                int64_t* tononzero,
                                                const int64_t* fromindex,
                                                int64_t fromindexlength,
                                                const int64_t* nonzero,
                                                int64_t nonzerolength) {
  // Nothing to do for empty input; skip the launch entirely.
  if (fromindexlength <= 0) {
    return success();
  }

  awkward_IndexedArray_getitem_adjust_outindex_64_serial_kernel<<<1, 1>>>(
      tomask,
      toindex,
      tononzero,
      fromindex,
      fromindexlength,
      nonzero,
      nonzerolength);

  // Launch-configuration errors are only visible through cudaGetLastError;
  // faults raised while the kernel runs surface at the synchronization.
  HANDLE_ERROR(cudaGetLastError());
  HANDLE_ERROR(cudaDeviceSynchronize());

  return success();
}