//------------------------------------------------------------------------------
// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar: reduce on the GPU with semiring
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// This file: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
// only able to do a partial reduction. This case occurs if the GPU cannot
// do an atomic update for the monoid. To handle this case, the GPU
// returns a full GrB_Matrix V, of size gridsize-by-1, with one entry per
// threadblock. Then GB_reduce_to_scalar on the CPU sees this V as the result,
// and calls itself recursively to continue the reduction.
#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE \
{ \
GB_FREE_MEMORY (&zscalar, zscalar_size) ; \
if (stream != nullptr) \
{ \
cudaStreamSynchronize (stream) ; \
cudaStreamDestroy (stream) ; \
} \
stream = nullptr ; \
}
#define GB_FREE_ALL \
{ \
GB_FREE_WORKSPACE ; \
GB_Matrix_free (&V) ; \
}
#include "GB_cuda_reduce.hpp"
GrB_Info GB_cuda_reduce_to_scalar
(
// output:
GB_void *s, // note: statically allocated on CPU stack; if
// the result is in s then V is NULL.
GrB_Matrix *V_handle, // partial result if unable to reduce to scalar;
// NULL if result is in s.
// input:
const GrB_Monoid monoid,
const GrB_Matrix A
)
{
//--------------------------------------------------------------------------
// check inputs
//--------------------------------------------------------------------------
GB_void *zscalar = NULL ;
size_t zscalar_size = 0 ;
GrB_Matrix V = NULL ;
(*V_handle) = NULL ;
GrB_Info info = GrB_SUCCESS ;
//--------------------------------------------------------------------------
// create the stream
//--------------------------------------------------------------------------
// FIXME: use the stream pool
cudaStream_t stream = nullptr ;
CUDA_OK (cudaStreamCreate (&stream)) ;
//--------------------------------------------------------------------------
// determine problem characteristics and allocate worksbace
//--------------------------------------------------------------------------
int blocksz = 320 ; // # threads in each block
int work_per_thread = 256 ; // work each thread does in a single block
int number_of_sms = GB_Global_gpu_sm_get (0) ;
GrB_Type ztype = monoid->op->ztype ;
size_t zsize = ztype->size ;
// determine kernel launch geometry
int64_t anvals = GB_nnz_held (A) ;
int64_t work_per_block = work_per_thread*blocksz ;
// gridsz = ceil (anvals / work_per_block)
int64_t raw_gridsz = GB_ICEIL (anvals, work_per_block) ;
raw_gridsz = std::min (raw_gridsz, (int64_t) (number_of_sms * 256)) ;
int gridsz = (int) raw_gridsz ;
// FIXME: GB_enumify_reduce is called twice: here (to get has_cheeseburger)
// and in GB_cuda_reduce_to_scalar_jit. Can we just call it once?
uint64_t rcode ;
GB_enumify_reduce (&rcode, monoid, A) ;
bool has_cheeseburger = GB_RSHIFT (rcode, 16, 1) ;
GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ;
// determine the kind of reduction: partial (to &V), or complete
// (to the scalar output)
if (has_cheeseburger)
{
// the kernel launch can reduce A to zscalar all by itself
// allocate and initialize zscalar (upscaling it to at least 32 bits)
size_t zscalar_space = GB_IMAX (zsize, sizeof (uint32_t)) ;
zscalar = (GB_void *) GB_MALLOC_MEMORY (1, zscalar_space,
&zscalar_size) ;
if (zscalar == NULL)
{
// out of memory
GB_FREE_ALL ;
return (GrB_OUT_OF_MEMORY) ;
}
GB_cuda_upscale_identity (zscalar, monoid) ;
}
else
{
// allocate a full GrB_Matrix V for the partial result, of size
// gridsz-by-1, and of type ztype. V is allocated but not
// initialized.
GB_OK (GB_new_bix (&V, ztype, gridsz, 1, GB_ph_null,
/* is_csc: */ true, /* sparsity: */ GxB_FULL,
/* bitmap_calloc: */ false, /* hyper_switch: */ 0,
/* plen: */ -1, /* nzmax: */ gridsz, /* numeric: */ true,
/* iso: */ false, /* pji_is_32: */ false, false, false)) ;
}
GBURBLE ("(cuda reduce launch %d threads in %d blocks)",
blocksz, gridsz ) ;
//--------------------------------------------------------------------------
// reduce C to a scalar via the CUDA JIT
//--------------------------------------------------------------------------
GB_OK (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A,
stream, gridsz, blocksz)) ;
//--------------------------------------------------------------------------
// return result and destroy the stream
//--------------------------------------------------------------------------
CUDA_OK (cudaStreamSynchronize (stream)) ;
if (has_cheeseburger)
{
// return the scalar result
// s = zscalar (but only the first zsize bytes of it)
memcpy (s, zscalar, zsize) ;
}
else
{
// return the partial reduction
(*V_handle) = V ;
}
GB_FREE_WORKSPACE ;
return (GrB_SUCCESS) ;
}