#define FILENAME(line)          \
  FILENAME_FOR_EXCEPTIONS_CUDA( \
    "src/cuda-kernels/manual_awkward_ByteMaskedArray_reduce_next.cu", line)

#include "standard_parallel_algorithms.h"
#include "awkward/kernels.h"
__global__ void
awkward_ByteMaskedArray_reduce_next_64_filter_mask(int8_t* filtered_mask,
                                                   const int8_t* mask,
                                                   bool validwhen,
                                                   int64_t length) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (thread_id < length) {
    if ((mask[thread_id] != 0) == validwhen) {
      filtered_mask[thread_id] = 1;
    }
  }
}
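
// Step 2: compact the surviving entries. prefixed_mask holds the prefix scan
// of filtered_mask (computed in the host function below), so
// prefixed_mask[i] - 1 is used as the packed output slot for a surviving
// element i; elements that were masked out get outindex = -1.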
__global__ void
awkward_ByteMaskedArray_reduce_next_64_kernel(int64_t* nextcarry,
                                              int64_t* nextparents,
                                              int64_t* outindex,
                                              const int8_t* mask,
                                              const int64_t* parents,
                                              int64_t length,
                                              int8_t* filtered_mask,
                                              int64_t* prefixed_mask) {
  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;

  if (thread_id < length) {
    if (filtered_mask[thread_id] == 1) {
      nextcarry[prefixed_mask[thread_id] - 1] = thread_id;
      nextparents[prefixed_mask[thread_id] - 1] = parents[thread_id];
      outindex[thread_id] = prefixed_mask[thread_id] - 1;
    } else {
      outindex[thread_id] = -1;
    }
  }
}
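
// Host entry point: allocates the scratch buffers, filters the mask, scans it,
// and launches the compaction kernel.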
ERROR
awkward_ByteMaskedArray_reduce_next_64(int64_t* nextcarry,
                                       int64_t* nextparents,
                                       int64_t* outindex,
                                       const int8_t* mask,
                                       const int64_t* parents,
                                       int64_t length,
                                       bool validwhen) {
  dim3 blocks_per_grid = blocks(length);
  dim3 threads_per_block = threads(length);
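
  // Scratch copy of the mask; the filter kernel overwrites it with 1 wherever
  // the entry passes validwhen.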
  int8_t* filtered_mask;
  HANDLE_ERROR(cudaMalloc((void**)&filtered_mask, sizeof(int8_t) * length));
  HANDLE_ERROR(cudaMemcpy(
      filtered_mask, mask, sizeof(int8_t) * length, cudaMemcpyDeviceToDevice));

  awkward_ByteMaskedArray_reduce_next_64_filter_mask<<<blocks_per_grid,
                                                       threads_per_block>>>(
      filtered_mask, mask, validwhen, length);
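
  // Prefix-scan the filtered mask so each surviving entry knows its packed
  // output position.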
  int64_t* prefixed_mask;
  HANDLE_ERROR(cudaMalloc((void**)&prefixed_mask, sizeof(int64_t) * length));
  exclusive_scan(prefixed_mask, filtered_mask, length);
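
  // Scatter nextcarry/nextparents for the surviving entries and fill outindex.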
  awkward_ByteMaskedArray_reduce_next_64_kernel<<<blocks_per_grid,
                                                  threads_per_block>>>(
      nextcarry,
      nextparents,
      outindex,
      mask,
      parents,
      length,
      filtered_mask,
      prefixed_mask);

  cudaDeviceSynchronize();

  // Release the scratch buffers now that the kernels have completed.
  HANDLE_ERROR(cudaFree(filtered_mask));
  HANDLE_ERROR(cudaFree(prefixed_mask));

  return success();
}