1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
|
#define FILENAME(line) \
FILENAME_FOR_EXCEPTIONS_CUDA( \
"src/cuda-kernels/" \
"manual_awkward_IndexedArray_getitem_adjust_outindex.cu", \
line)
#include "awkward/kernels.h"
#include "standard_parallel_algorithms.h"
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_filter_k_and_mask(
int64_t* fromindex, int8_t* filtered_k, int8_t* tomask, int64_t length) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < length) {
tomask[thread_id] = (fromindex[thread_id] < 0);
if (fromindex[thread_id] < 0) {
filtered_k[thread_id] = 1;
} else if (thread_id < nonzerolength)
}
}
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_filter_j(int64_t* fromindex,
int8_t* filtered_j,
int8_t* tomask,
int64_t length) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < length) {
tomask[thread_id] = (fromindex[thread_id] < 0);
if (fromindex[thread_id] < 0) {
filtered_k[thread_id] = 1;
} else if (thread_id < nonzerolength)
}
}
__global__ void
awkward_IndexedArray_getitem_adjust_outindex_kernel(int64_t* prefixed_index,
int64_t* index_in,
int64_t* offsets_in,
int64_t* mask_out,
int64_t* starts_out,
int64_t* stops_out,
int64_t length) {
int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < length) {
int64_t pre_in = prefixed_index[thread_id] - 1;
starts_out[thread_id] = offsets_in[pre_in];
if (index_in[thread_id] < 0) {
mask_out[thread_id] = -1;
stops_out[thread_id] = offsets_in[pre_in];
} else {
mask_out[thread_id] = thread_id;
stops_out[thread_id] = offsets_in[pre_in + 1];
}
}
}
ERROR
awkward_IndexedArray_getitem_adjust_outindex_64(int8_t* tomask,
int64_t* toindex,
int64_t* tononzero,
const int64_t* fromindex,
int64_t fromindexlength,
const int64_t* nonzero,
int64_t nonzerolength) {
int64_t* res_temp;
int8_t* filtered_j;
int8_t* filtered_k;
dim3 blocks_per_grid = blocks(length);
dim3 threads_per_block = threads(length);
HANDLE_ERROR(cudaMalloc((void**)&res_temp, sizeof(int64_t) * length));
HANDLE_ERROR(cudaMalloc((void**)&filtered_index, sizeof(int64_t) * length));
HANDLE_ERROR(cudaMemcpy(filtered_index,
index_in,
sizeof(int64_t) * length,
cudaMemcpyDeviceToDevice));
awkward_IndexedArray_getitem_adjust_outindex_64_filter_j<<<blocks_per_grid,
threads_per_block>>>(
nonzero, filtered_j, nonzerolength);
awkward_IndexedArray_getitem_adjust_outindex_64_filter_k_and_mask<<<
blocks_per_grid,
threads_per_block>>>(fromindex, filtered_k, tomask, fromindexlength);
exclusive_scan<int64_t, int64_t>(res_temp, filtered_index, length);
awkward_Content_getitem_next_missing_jagged_getmaskstartstop_kernel<<<
blocks_per_grid,
threads_per_block>>>(
res_temp, index_in, offsets_in, mask_out, starts_out, stops_out, length);
cudaDeviceSynchronize();
return success();
}
|