1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
|
#include "GB_cuda_apply.hpp"
#undef GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE \
{ \
GB_FREE_MEMORY (&ythunk_cuda, ythunk_cuda_size) ; \
if (stream != nullptr) \
{ \
cudaStreamSynchronize (stream) ; \
cudaStreamDestroy (stream) ; \
} \
stream = nullptr ; \
}
#undef GB_FREE_ALL
#define GB_FREE_ALL GB_FREE_WORKSPACE
#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9
GrB_Info GB_cuda_apply_unop
(
GB_void *Cx,
const GrB_Type ctype,
const GB_Operator op,
const bool flipij,
const GrB_Matrix A,
const GB_void *ythunk
)
{
GrB_Info info ;
GB_void *ythunk_cuda = NULL ;
size_t ythunk_cuda_size = 0 ;
// FIXME: use the stream pool
cudaStream_t stream = nullptr ;
CUDA_OK (cudaStreamCreate (&stream)) ;
// FIXME: make this a CUDA helper function
if (ythunk != NULL && op != NULL && op->ytype != NULL)
{
// make a copy of ythunk, since ythunk might be allocated on
// the CPU stack and thus not accessible to the CUDA kernel.
ythunk_cuda = (GB_void *) GB_MALLOC_MEMORY (1, op->ytype->size,
&ythunk_cuda_size) ;
if (ythunk_cuda == NULL)
{
GB_FREE_WORKSPACE ;
return (GrB_OUT_OF_MEMORY) ;
}
memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
}
GrB_Index anz = GB_nnz_held (A) ;
int32_t number_of_sms = GB_Global_gpu_sm_get (0) ;
int64_t raw_gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;
// cap #of blocks to 256 * #of sms
int32_t gridsz = std::min (raw_gridsz, (int64_t) (number_of_sms * 256)) ;
GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;
GB_FREE_WORKSPACE ;
return GrB_SUCCESS ;
}
|