//------------------------------------------------------------------------------
// GB_cuda_gateway.h: definitions for interface to GB_cuda_* functions
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// CUDA gateway functions (DRAFT: in progress)
// This file can be #include'd into any GraphBLAS/Source file that needs to
// call a CUDA gateway function, or use the typedef defined below. It is also
// #include'd in GraphBLAS/CUDA/GB_cuda.h, for use by the CUDA/GB_cuda_*.cu
// gateway functions.
// If GRAPHBLAS_HAS_CUDA is defined in GraphBLAS/CMakeLists.txt, then GraphBLAS
// can call the C-callable gateway functions defined in GraphBLAS/CUDA/*.cu
// source files. If GRAPHBLAS_HAS_CUDA is not defined, then these functions
// are not called. The typedef always appears, since it is part of the
// GB_Global struct, whether or not CUDA is used.
#ifndef GB_CUDA_GATEWAY_H
#define GB_CUDA_GATEWAY_H
#define GB_CUDA_MAX_GPUS 32
// The GPU is only used if the work is larger than the GxB_GPU_CHUNK.
// The default value of this parameter is GB_GPU_CHUNK_DEFAULT:
#define GB_GPU_CHUNK_DEFAULT (1024*1024)
//------------------------------------------------------------------------------
// GB_cuda_device: properties of each GPU in the system
//------------------------------------------------------------------------------
// Properties of a single GPU in the system, one entry per GPU (up to
// GB_CUDA_MAX_GPUS).  Filled in by GB_cuda_get_device_properties, declared
// below.  This typedef always exists, even when CUDA is disabled, since it
// is part of the GB_Global struct.
typedef struct
{
char name [256] ;               // device name, as reported by the CUDA runtime
size_t total_global_memory ;    // total global memory on the device, in bytes
int number_of_sms ;             // # of streaming multiprocessors
int compute_capability_major ;  // CUDA compute capability, major version
int compute_capability_minor ;  // CUDA compute capability, minor version
bool use_memory_pool ;          // if true, a memory pool is used for this GPU
size_t pool_size ;              // current memory pool size, in bytes
size_t max_pool_size ;          // maximum memory pool size, in bytes
void *memory_resource ;         // opaque handle to the device memory resource
                                // (presumably an RMM resource; confirm in
                                // GraphBLAS/CUDA/*.cu)
}
GB_cuda_device ;
//------------------------------------------------------------------------------
// GB_ngpus_to_use: determine # of GPUs to use for the next computation
//------------------------------------------------------------------------------
static inline int GB_ngpus_to_use
(
    double work                 // total work to do
)
{
    // Testing hack, from GB_Global_hack_get (2):
    //      2: never use any GPU
    //      1: always use all GPUs
    //      0: default heuristic, based on the amount of work
    int hack = (int) GB_Global_hack_get (2) ;

    // # of GPUs available in the system
    int ngpus = GB_Global_gpu_count_get ( ) ;

    if (hack == 2 || ngpus == 0 || work == 0)
    {
        // GPUs disabled, none present, or nothing to do: use the CPU only
        return (0) ;
    }

    if (hack == 1)
    {
        // always use all available GPU(s)
        // Fixme for CUDA: allow 1 to gpu_count to be requested
        return (ngpus) ;
    }

    // Default heuristic: request one GPU per gpu_chunk of work, but never
    // more than the # of GPUs available.  Work below one chunk returns 0,
    // so small problems stay on the CPU.
    double gpu_chunk = 2e6 ;
    double requested = floor (work / gpu_chunk) ;
    return ((requested > ngpus) ? ngpus : (int) requested) ;
}
//------------------------------------------------------------------------------
// GB_cuda_* gateway functions
//------------------------------------------------------------------------------
// GB_cuda_init: initialize the CUDA subsystem; returns GrB_SUCCESS or an
// error code
GrB_Info GB_cuda_init (void) ;

// GB_cuda_get_device_count: query how many GPUs are present in the system
bool GB_cuda_get_device_count // true if OK, false if failure
(
int *gpu_count // return # of GPUs in the system
) ;

// GB_cuda_warmup: warm up the given GPU before use; true if OK, false on
// failure (see the implementation in GraphBLAS/CUDA)
bool GB_cuda_warmup (int device) ;

// GB_cuda_get_device / GB_cuda_set_device: get or set the currently active
// CUDA device; true if OK, false on failure
bool GB_cuda_get_device( int *device) ;
bool GB_cuda_set_device( int device) ;

// GB_cuda_get_device_properties: fill (*prop) with the properties of the
// given GPU; true if OK, false on failure
bool GB_cuda_get_device_properties
(
int device, // GPU to query
GB_cuda_device *prop // output: properties of that GPU
) ;
// GB_cuda_type_branch: decide whether a given type can be handled by the
// GPU kernels
bool GB_cuda_type_branch // return true if the type is OK on GPU
(
const GrB_Type type // type to query
) ;
// GB_cuda_reduce_to_scalar_branch: decide whether the reduction of A to a
// scalar via the monoid should be done on the GPU
bool GB_cuda_reduce_to_scalar_branch // return true to use the GPU
(
const GrB_Monoid monoid, // monoid to do the reduction
const GrB_Matrix A // input matrix
) ;

// GB_cuda_reduce_to_scalar: reduce the matrix A to a scalar via the monoid.
// On output, exactly one of s and (*V_handle) holds the result: if the GPU
// reduced all the way to a scalar it is in s and (*V_handle) is NULL;
// otherwise (*V_handle) holds a partial result to be finished on the CPU.
GrB_Info GB_cuda_reduce_to_scalar
(
// output:
GB_void *s, // note: statically allocated on CPU stack; if
// the result is in s then V is NULL.
GrB_Matrix *V_handle, // partial result if unable to reduce to scalar;
// NULL if result is in s.
// input:
const GrB_Monoid monoid, // monoid for the reduction
const GrB_Matrix A // input matrix to reduce
) ;
// GB_cuda_rowscale_branch: decide whether the row-scale operation (C=D*B,
// where D is presumably diagonal — confirm against the CPU GB_rowscale)
// should be done on the GPU
bool GB_cuda_rowscale_branch
(
const GrB_Matrix D, // scaling matrix
const GrB_Matrix B, // input matrix
const GrB_Semiring semiring, // semiring defining the multiply
const bool flipxy // if true, the multiply operands are flipped
) ;

// GB_cuda_rowscale: compute C=D*B on the GPU
GrB_Info GB_cuda_rowscale
(
GrB_Matrix C, // output matrix
const GrB_Matrix D, // scaling matrix
const GrB_Matrix B, // input matrix
const GrB_Semiring semiring, // semiring defining the multiply
const bool flipxy // if true, the multiply operands are flipped
) ;

// GB_cuda_colscale_branch: decide whether the column-scale operation (C=A*D)
// should be done on the GPU
bool GB_cuda_colscale_branch
(
const GrB_Matrix A, // input matrix
const GrB_Matrix D, // scaling matrix
const GrB_Semiring semiring, // semiring defining the multiply
const bool flipxy // if true, the multiply operands are flipped
) ;

// GB_cuda_colscale: compute C=A*D on the GPU
GrB_Info GB_cuda_colscale
(
GrB_Matrix C, // output matrix
const GrB_Matrix A, // input matrix
const GrB_Matrix D, // scaling matrix
const GrB_Semiring semiring, // semiring defining the multiply
const bool flipxy // if true, the multiply operands are flipped
) ;
// GB_cuda_apply_binop_branch: decide whether applying the binary op (with
// one operand bound to a scalar) should be done on the GPU
bool GB_cuda_apply_binop_branch
(
const GrB_Type ctype, // type of the output Cx
const GrB_BinaryOp op, // binary op to apply
const GrB_Matrix A // input matrix
) ;

// GB_cuda_apply_unop_branch: decide whether applying the unary (or
// index-unary) op should be done on the GPU
bool GB_cuda_apply_unop_branch
(
const GrB_Type ctype, // type of the output Cx
const GrB_Matrix A, // input matrix
const GB_Operator op // operator to apply
) ;

// GB_cuda_apply_unop: Cx = op (A) on the GPU; ythunk is the scalar for
// index-unary ops and flipij swaps the (i,j) arguments (presumably; confirm
// against the CPU GB_apply)
GrB_Info GB_cuda_apply_unop
(
GB_void *Cx, // output array
const GrB_Type ctype, // type of Cx
const GB_Operator op, // operator to apply
const bool flipij, // if true, flip the i,j arguments of the op
const GrB_Matrix A, // input matrix
const GB_void *ythunk // scalar operand for index-unary ops
) ;

// GB_cuda_apply_binop: Cx = op (x,A) or op (A,y) on the GPU, with scalarx
// bound as the first operand if bind1st is true, else as the second
GrB_Info GB_cuda_apply_binop
(
GB_void *Cx, // output array
const GrB_Type ctype, // type of Cx
const GrB_BinaryOp op, // binary op to apply
const GrB_Matrix A, // input matrix
const GB_void *scalarx, // bound scalar operand
const bool bind1st // if true, Cx = op (scalarx, A)
) ;
// GB_cuda_select_branch: decide whether the select operation (GrB_select
// with an index-unary op) should be done on the GPU
bool GB_cuda_select_branch
(
const GrB_Matrix A, // input matrix
const GrB_IndexUnaryOp op // index-unary op for the select
) ;

// GB_cuda_select_bitmap: C = select (A, op) on the GPU, for bitmap/full C
GrB_Info GB_cuda_select_bitmap
(
GrB_Matrix C, // output matrix
const GrB_Matrix A, // input matrix
const bool flipij, // if true, flip the i,j arguments of the op
const GB_void *ythunk, // scalar operand of the index-unary op
const GrB_IndexUnaryOp op // index-unary op for the select
) ;

// GB_cuda_select_sparse: C = select (A, op) on the GPU, for sparse/hyper C
GrB_Info GB_cuda_select_sparse
(
GrB_Matrix C, // output matrix
const bool C_iso, // if true, C is iso-valued
const GrB_IndexUnaryOp op, // index-unary op for the select
const bool flipij, // if true, flip the i,j arguments of the op
const GrB_Matrix A, // input matrix
const GB_void *athunk, // iso value of C (presumably; confirm in CUDA/*.cu)
const GB_void *ythunk, // scalar operand of the index-unary op
GB_Werk Werk // workspace for the caller
) ;
// (a redundant duplicate declaration of GB_cuda_type_branch appeared here;
// the function is already declared earlier in this file, so the duplicate
// has been removed)
// GB_cuda_AxB_dot3: masked matrix multiply on the GPU
GrB_Info GB_cuda_AxB_dot3 // C<M> = A'*B using dot product method
(
GrB_Matrix C, // output matrix, static header
const GrB_Matrix M, // mask matrix
const bool Mask_struct, // if true, use only the structure of M
const GrB_Matrix A, // input matrix
const GrB_Matrix B, // input matrix
const GrB_Semiring semiring, // semiring that defines C=A*B
const bool flipxy // if true, do z=fmult(b,a) vs fmult(a,b)
) ;

// GB_cuda_AxB_dot3_branch: decide whether C<M>=A'*B (dot product method)
// should be done on the GPU
bool GB_cuda_AxB_dot3_branch
(
const GrB_Matrix M, // mask matrix
const bool Mask_struct, // if true, use only the structure of M
const GrB_Matrix A, // input matrix
const GrB_Matrix B, // input matrix
const GrB_Semiring semiring, // semiring that defines C=A*B
const bool flipxy // if true, do z=fmult(b,a) vs fmult(a,b)
);
#endif