1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
//------------------------------------------------------------------------------
// GraphBLAS/CUDA/template/GB_cuda_tile_sum_uint64.cuh: warp-level reductions
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// This file: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
//------------------------------------------------------------------------------
// See template/GB_cuda_tile_reduce_ztype.cuh for a description of
// tile.shfl_down.
//------------------------------------------------------------------------------
// GB_cuda_tile_sum_uint64: reduce a uint64_t value across a single warp
//------------------------------------------------------------------------------
// On input, each thread in the tile holds a single uint64_t value. On output,
// thread zero holds the sum of values from all the warps.
__device__ __inline__ uint64_t GB_cuda_tile_sum_uint64
(
thread_block_tile<tile_sz> tile,
uint64_t value
)
{
//--------------------------------------------------------------------------
// sum value on all threads to a single value
//--------------------------------------------------------------------------
#if (tile_sz == 32)
{
// this is the typical case
value += tile.shfl_down (value, 16) ;
value += tile.shfl_down (value, 8) ;
value += tile.shfl_down (value, 4) ;
value += tile.shfl_down (value, 2) ;
value += tile.shfl_down (value, 1) ;
}
#else
{
// tile_sz is less than 32 (either 1, 2, 4, 8, or 16)
#pragma unroll
for (int offset = tile_sz >> 1 ; offset > 0 ; offset >>= 1)
{
value += tile.shfl_down (value, offset) ;
}
}
#endif
//--------------------------------------------------------------------------
// return result
//--------------------------------------------------------------------------
// Note that only thread 0 will have the full summation of all values in
// the tile. To broadcast it to all threads, use the following:
// value = tile.shfl (value, 0) ;
return (value) ;
}
|