File: GB_cuda_apply_unop.cpp

package info (click to toggle)

suitesparse 1%3A7.11.0%2Bdfsg-2

links: PTS, VCS
area: main
in suites: forky, sid
size: 258,172 kB
sloc: ansic: 1,153,566; cpp: 48,145; makefile: 4,997; fortran: 2,087; java: 1,826; sh: 1,113; ruby: 725; python: 676; asm: 371; sed: 166; awk: 44

file content (69 lines) | stat: -rw-r--r-- 2,017 bytes

#include "GB_cuda_apply.hpp"

// GB_FREE_WORKSPACE: free the copy of ythunk made by GB_cuda_apply_unop.
// The macro expands in place, so ythunk_cuda and ythunk_cuda_size must be
// declared in the enclosing scope.  Any earlier definition of the macro is
// discarded first by the #undef.
#undef  GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                                   \
{                                                           \
    GB_FREE_MEMORY (&ythunk_cuda, ythunk_cuda_size) ;       \
}

// GB_FREE_ALL: free the workspace (GB_FREE_WORKSPACE) and then hand the
// stream back with GB_cuda_release_stream.  The macro expands in place, so
// `stream`, ythunk_cuda and ythunk_cuda_size must be declared in the
// enclosing scope.  The value returned by GB_cuda_release_stream is
// discarded here.
// NOTE(review): presumably this is what GB_OK invokes when a call fails;
// GB_OK is defined elsewhere -- confirm.
// NOTE(review): the workspace is freed before the stream is released; if a
// kernel using ythunk_cuda could still be in flight on `stream` at that
// point, the order should be reversed -- confirm.
#undef  GB_FREE_ALL
#define GB_FREE_ALL                                         \
{                                                           \
    GB_FREE_WORKSPACE                                       \
    GB_cuda_release_stream (&stream) ;                      \
}

// BLOCK_SIZE is passed to GB_cuda_apply_unop_jit as the block size and is
// also the divisor used to derive the number of blocks from the entry count.
#define BLOCK_SIZE 512
// LOG2_BLOCK_SIZE is log2 (BLOCK_SIZE); the two must be changed together.
#define LOG2_BLOCK_SIZE 9
static_assert ((1 << LOG2_BLOCK_SIZE) == BLOCK_SIZE,
    "LOG2_BLOCK_SIZE must equal log2 (BLOCK_SIZE)") ;

//------------------------------------------------------------------------------
// GB_cuda_apply_unop: host-side driver for the JIT apply kernel
//------------------------------------------------------------------------------

// Acquires a CUDA stream, makes a copy of ythunk (when one is given) with
// GB_MALLOC_MEMORY, chooses a grid size from the number of entries held in A,
// and passes everything to GB_cuda_apply_unop_jit.
//
// Parameters:
//  Cx:      forwarded unchanged to GB_cuda_apply_unop_jit
//           (presumably the output values -- confirm against the JIT kernel)
//  ctype:   forwarded unchanged to GB_cuda_apply_unop_jit
//  op:      the operator; op->ytype->size gives the size of the ythunk copy
//  flipij:  forwarded unchanged to GB_cuda_apply_unop_jit
//  A:       the input matrix; GB_nnz_held (A) determines the grid size
//  ythunk:  optional; copied only if ythunk, op, and op->ytype are all
//           non-NULL, otherwise NULL is passed to the JIT call
//
// Returns:
//  GrB_SUCCESS if A holds no entries (no stream is acquired and nothing is
//  launched) or if every step succeeds; GrB_OUT_OF_MEMORY if the ythunk copy
//  cannot be allocated.  Failures of the stream helpers or of the JIT call are
//  handled by GB_OK (defined elsewhere; presumably it runs GB_FREE_ALL and
//  returns the error code -- confirm).

GrB_Info GB_cuda_apply_unop
(
    GB_void *Cx,
    const GrB_Type ctype,
    const GB_Operator op,
    const bool flipij,
    const GrB_Matrix A,
    const GB_void *ythunk
)
{

    GrB_Info info ;
    GB_void *ythunk_cuda = NULL ;
    size_t ythunk_cuda_size = 0 ;

    cudaStream_t stream = nullptr ;

    // quick return, taken before any resource has been acquired
    GrB_Index anz = GB_nnz_held (A) ;
    if (anz == 0) return (GrB_SUCCESS) ;

    // get a stream on the current device
    GB_OK (GB_cuda_acquire_stream (&stream)) ;

    // FIXME: make this a CUDA helper function
    if (ythunk != NULL && op != NULL && op->ytype != NULL)
    {
        // make a copy of ythunk, since ythunk might be allocated on
        // the CPU stack and thus not accessible to the CUDA kernel.
        ythunk_cuda = (GB_void *) GB_MALLOC_MEMORY (1, op->ytype->size,
            &ythunk_cuda_size) ;
        if (ythunk_cuda == NULL)
        {
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }
        memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
    }

    // NOTE(review): the argument 0 looks like a device id, while the stream
    // above is acquired on the current device -- confirm that these agree.
    int32_t number_of_sms = GB_Global_gpu_sm_get (0) ;
    int64_t raw_gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    // cap #of blocks to 256 * #of sms.  The product is formed in 64-bit
    // arithmetic, and the cap is kept at 1 or more so that a reported SM count
    // of zero cannot produce an empty (invalid) grid.
    const int64_t max_gridsz =
        std::max ((int64_t) 1, ((int64_t) number_of_sms) * 256) ;
    int32_t gridsz = (int32_t) std::min (raw_gridsz, max_gridsz) ;

    GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
        ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;

    // Release the stream before freeing ythunk_cuda: the kernel is given
    // ythunk_cuda, so the copy must outlive any work still queued on the
    // stream.  NOTE(review): this relies on GB_cuda_release_stream (or the
    // JIT call itself) waiting for that work to finish -- confirm.
    GB_OK (GB_cuda_release_stream (&stream)) ;
    GB_FREE_WORKSPACE ;
    return (GrB_SUCCESS) ;
}