1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
|
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename T>
__device__ void blockBroadcast(
T& out,
const T& inp_val,
T* shared_mem,
bool read_write_pred) {
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
const auto shared_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, blockDim);
if (has_valid_data && read_write_pred) {
shared_mem[shared_offset] = inp_val;
}
block_sync::sync();
if (read_write_pred) {
out = shared_mem[shared_offset];
}
block_sync::sync();
}
} // namespace broadcast
|