File: GB_cuda_apply_unop.cpp

package info (click to toggle)
suitesparse 1%3A7.11.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 258,172 kB
  • sloc: ansic: 1,153,566; cpp: 48,145; makefile: 4,997; fortran: 2,087; java: 1,826; sh: 1,113; ruby: 725; python: 676; asm: 371; sed: 166; awk: 44
file content (69 lines) | stat: -rw-r--r-- 2,017 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#include "GB_cuda_apply.hpp"

// GB_FREE_WORKSPACE: free the temporary copy of ythunk (ythunk_cuda) made by
// GB_cuda_apply_unop.  The macro expands in the caller's scope and therefore
// requires the locals ythunk_cuda and ythunk_cuda_size to be declared there.
// Any previous definition is discarded first so this file gets its own
// version.
#undef  GB_FREE_WORKSPACE
#define GB_FREE_WORKSPACE                                   \
{                                                           \
    GB_FREE_MEMORY (&ythunk_cuda, ythunk_cuda_size) ;       \
}

// GB_FREE_ALL: full cleanup for GB_cuda_apply_unop: free the workspace and
// then release the CUDA stream.  It is used explicitly on the out-of-memory
// path below; presumably GB_OK (defined elsewhere) also expands it when a
// called method fails -- confirm against the GB_OK definition.  Requires the
// local variable stream, plus whatever GB_FREE_WORKSPACE requires.
#undef  GB_FREE_ALL
#define GB_FREE_ALL                                         \
{                                                           \
    GB_FREE_WORKSPACE                                       \
    GB_cuda_release_stream (&stream) ;                      \
}

// BLOCK_SIZE is the divisor used to compute the number of blocks in
// GB_cuda_apply_unop, and is passed as the last argument to
// GB_cuda_apply_unop_jit.  LOG2_BLOCK_SIZE is its base-2 logarithm
// (2^9 == 512); it is not referenced by the code visible in this file, but
// the two values must be kept consistent if either is changed.
#define BLOCK_SIZE 512
#define LOG2_BLOCK_SIZE 9

//------------------------------------------------------------------------------
// GB_cuda_apply_unop: host-side driver for the CUDA JIT apply-unop kernel
//------------------------------------------------------------------------------

// Acquires a CUDA stream, makes a heap copy of the optional ythunk scalar,
// picks the launch geometry, and hands everything to GB_cuda_apply_unop_jit.
//
// Parameters:
//      Cx      forwarded unchanged to the JIT kernel (presumably the output
//              value array of the result; confirm against the kernel)
//      ctype   forwarded unchanged to the JIT kernel
//      op      operator to apply; op->ytype (if present) gives the size of
//              the ythunk scalar
//      flipij  forwarded unchanged to the JIT kernel
//      A       input matrix; GB_nnz_held (A) determines the amount of work
//      ythunk  optional scalar; may be NULL
//
// Returns:
//      GrB_SUCCESS if A holds no entries (nothing is launched), or if every
//      step succeeds.  GrB_OUT_OF_MEMORY if the ythunk copy cannot be
//      allocated.  Failures of the stream helpers or the JIT call are handled
//      by GB_OK (defined elsewhere; presumably it runs GB_FREE_ALL and
//      returns the failing GrB_Info).
//
// The locals info, ythunk_cuda, ythunk_cuda_size, and stream are referenced
// by name from the GB_FREE_WORKSPACE / GB_FREE_ALL macros (and presumably
// GB_OK), so they must not be renamed.

GrB_Info GB_cuda_apply_unop
(
    GB_void *Cx,
    const GrB_Type ctype,
    const GB_Operator op,
    const bool flipij,
    const GrB_Matrix A,
    const GB_void *ythunk
)
{

    //--------------------------------------------------------------------------
    // declare everything the cleanup macros touch before any early exit
    //--------------------------------------------------------------------------

    GrB_Info info ;
    GB_void *ythunk_cuda = NULL ;
    size_t ythunk_cuda_size = 0 ;
    cudaStream_t stream = nullptr ;

    //--------------------------------------------------------------------------
    // quick return if there is no work to do
    //--------------------------------------------------------------------------

    const GrB_Index anz = GB_nnz_held (A) ;
    if (anz == 0) return (GrB_SUCCESS) ;

    //--------------------------------------------------------------------------
    // get a stream on the current device
    //--------------------------------------------------------------------------

    GB_OK (GB_cuda_acquire_stream (&stream)) ;

    //--------------------------------------------------------------------------
    // copy ythunk so the kernel can read it
    //--------------------------------------------------------------------------

    // FIXME: make this a CUDA helper function
    if (ythunk != NULL && op != NULL && op->ytype != NULL)
    {
        // make a copy of ythunk, since ythunk might be allocated on
        // the CPU stack and thus not accessible to the CUDA kernel.
        const size_t ysize = op->ytype->size ;
        ythunk_cuda = (GB_void *) GB_MALLOC_MEMORY (1, ysize,
            &ythunk_cuda_size) ;
        if (ythunk_cuda == NULL)
        {
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }
        memcpy (ythunk_cuda, ythunk, ysize) ;
    }

    //--------------------------------------------------------------------------
    // determine the launch geometry
    //--------------------------------------------------------------------------

    // NOTE(review): the SM count is queried for device 0, while the stream
    // above is acquired "on the current device"; confirm these always refer
    // to the same GPU.
    const int64_t number_of_sms = (int64_t) GB_Global_gpu_sm_get (0) ;

    // one block per BLOCK_SIZE entries, rounded up
    const int64_t raw_gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;

    // cap #of blocks to 256 * #of sms.  The product is formed in 64-bit
    // arithmetic so that it cannot wrap before it is compared.  Because of
    // the cap, the kernel can be given fewer threads than entries, so it is
    // assumed to iterate over the entries -- confirm against the kernel.
    const int64_t max_gridsz = number_of_sms * 256 ;

    // never request an empty grid: if the SM count is reported as zero (or
    // negative), fall back to a single block rather than asking for a launch
    // with 0 blocks, which is not a valid configuration.
    const int32_t gridsz = (int32_t)
        std::max ((int64_t) 1, std::min (raw_gridsz, max_gridsz)) ;

    //--------------------------------------------------------------------------
    // launch the kernel via the JIT
    //--------------------------------------------------------------------------

    GB_OK (GB_cuda_apply_unop_jit (Cx, ctype, op, flipij, A,
        ythunk_cuda, stream, gridsz, BLOCK_SIZE)) ;

    //--------------------------------------------------------------------------
    // free workspace, release the stream, and return result
    //--------------------------------------------------------------------------

    // NOTE(review): the workspace is freed as soon as the JIT call returns,
    // before the stream is released.  This assumes the kernel has finished
    // reading ythunk_cuda by then -- confirm that GB_cuda_apply_unop_jit
    // synchronizes the stream before returning.
    GB_FREE_WORKSPACE ;
    GB_OK (GB_cuda_release_stream (&stream)) ;
    return GrB_SUCCESS ;
}