File: GB_cuda_jit_AxB_dot3_phase2.cuh

//------------------------------------------------------------------------------
// GraphBLAS/CUDA/template/GB_cuda_jit_AxB_dot3_phase2.cuh
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// This file: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// AxB_dot3_phase2: fill the global buckets

//------------------------------------------------------------------------------
// BlockPrefixCallbackOp
//------------------------------------------------------------------------------

// A stateful callback functor that maintains a running prefix to be applied
// during consecutive scan operations.
struct BlockPrefixCallbackOp
{
    // Running prefix
    int64_t running_total ;

    // Constructor
    __device__ BlockPrefixCallbackOp (int64_t running_total) :
        running_total(running_total) {}

    // Callback operator to be entered by the first warp of threads in the
    // block.  Thread-0 is responsible for returning a value for seeding the
    // block-wide scan.
    __device__ int64_t operator()(int64_t block_aggregate)
    {
        int64_t old_prefix = running_total ;
        running_total += block_aggregate ;
        return old_prefix ;
    }
} ;
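
// Illustration (not part of the code above): how cub::BlockScan uses this
// callback when scanning consecutive tiles.  The aggregates below are made-up
// values.  Suppose the per-tile block aggregates are 7, 5, and 9 across three
// consecutive ExclusiveSum calls that share one BlockPrefixCallbackOp
// initialized with 0:
//
//      call 1: operator()(7) returns 0  ; running_total becomes 7
//      call 2: operator()(5) returns 7  ; running_total becomes 12
//      call 3: operator()(9) returns 12 ; running_total becomes 21
//
// so each tile's exclusive scan is seeded with the sum of all prior tiles.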

//------------------------------------------------------------------------------
// blockBucketExclusiveSum
//------------------------------------------------------------------------------

__inline__ __device__ void blockBucketExclusiveSum
(
    int bucketId,
    int64_t *d_data,
    int nblocks
)
{

    // Specialize BlockScan for a 1D block of 32 threads; blocksize, used as
    // the loop stride below, must match this thread count.
    #define blocksize 32
    typedef cub::BlockScan<int64_t, 32, cub::BLOCK_SCAN_WARP_SCANS> BlockScan ;

    // Allocate shared memory for BlockScan
    __shared__ typename BlockScan::TempStorage temp_storage ;

    // Initialize running total
    BlockPrefixCallbackOp prefix_op (0) ;

    // Have the block iterate over segments of items
    int64_t data = 0 ;

    int64_t *blockbucket = d_data ;

    for (int block_id = 0 ; block_id < nblocks ; block_id += blocksize)
    {
        // Load a segment of consecutive items that are blocked across threads

        int loc = block_id + threadIdx.x ;
        if (loc < nblocks)
        {
            data = blockbucket [bucketId*nblocks + loc] ;
        }
        this_thread_block().sync() ;

        // Collectively compute the block-wide exclusive prefix sum
        BlockScan(temp_storage).ExclusiveSum (data, data, prefix_op) ;
        this_thread_block().sync() ;

        if (loc < nblocks)
        {
            blockbucket [bucketId*nblocks + loc] = data ;
        }

        // this_thread_block().sync();

        data = 0 ;
    }
}
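
// Example of the effect of blockBucketExclusiveSum (values illustrative
// only): if the phase1 counts for bucket b across nblocks = 4 threadblocks
// are
//
//      blockbucket [b*nblocks ...] = { 3, 1, 4, 2 }
//
// then after the call they are replaced by their exclusive prefix sums
//
//      blockbucket [b*nblocks ...] = { 0, 3, 4, 8 }
//
// giving each threadblock its starting offset within bucket b.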

//------------------------------------------------------------------------------
// GB_cuda_AxB_dot3_phase2_kernel
//------------------------------------------------------------------------------

// GB_cuda_AxB_dot3_phase2_kernel is a CUDA kernel that takes as input the
// blockbucket array computed by the first phase kernel,
// GB_cuda_AxB_dot3_phase1.  The launch geometry of this kernel must match
// that of the GB_cuda_AxB_dot3_phase1 kernel, with the same # of threads and
// threadblocks.
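//
// Purely illustrative launch sketch (the real launch is generated by the
// GraphBLAS JIT host code; gridsz, blocksz, and stream are hypothetical names
// standing for the same grid and block sizes used for phase1):
//
//      GB_cuda_AxB_dot3_phase2_kernel <<<gridsz, blocksz, 0, stream>>>
//          (blockbucket, offset, gridsz) ;
//
// where nblocks == gridsz, the # of phase1 threadblocks.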

__global__ void GB_cuda_AxB_dot3_phase2_kernel
(
    // input, not modified:
    int64_t *__restrict__ blockbucket,  // global bucket count,
                                        // of size NBUCKETS*nblocks
    // output:
    int64_t *__restrict__ offset,       // global offsets, for each bucket
    // inputs, not modified:
    const int nblocks               // input: # of phase1 threadblocks to
                                    // reduce across, i.e., the length of the
                                    // vector for a single bucket
)
{

    //--------------------------------------------------------------------------
    // sum up the bucket counts of prior threadblocks
    //--------------------------------------------------------------------------

    // blockbucket is an array of size NBUCKETS-by-nblocks, stored by row.
    // The entry blockbucket [bucket * nblocks + t] holds the # of entries
    // that threadblock t of phase1 assigned to the given bucket, for each
    // bucket in the range 0 to NBUCKETS-1.
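    //
    // For example (values illustrative only): with nblocks = 4 phase1
    // threadblocks, blockbucket [2*4 + 3] is the # of entries that
    // threadblock 3 assigned to bucket 2.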

    uint64_t s [NBUCKETS] ;

    #pragma unroll
    for (int b = 0 ; b < NBUCKETS ; b++)
    {
        s [b] = 0 ;
    }

    thread_block_tile<32> tile = tiled_partition<32>(this_thread_block()) ;

    #pragma unroll
    for (int b = 0 ; b < NBUCKETS ; b++)
    {
        // grid-stride loop: each thread accumulates its share of the phase1
        // counts for bucket b
        for (int64_t tid = threadIdx.x + blockIdx.x * blockDim.x ;
             tid < nblocks ;
             tid += blockDim.x * gridDim.x)
        {
            s [b] += blockbucket [b * nblocks + tid] ;
        }
        this_thread_block().sync() ;

        // reduce the per-thread partial sums across the 32-thread tile
        s [b] = GB_cuda_tile_sum_uint64 (tile, s [b]) ;
    }

    if (threadIdx.x == 0)
    {
        // thread 0 adds this threadblock's per-bucket sums to the global
        // offset array
        #pragma unroll
        for (int b = 0 ; b < NBUCKETS ; b++)
        {
            atomicAdd ((unsigned long long int *) &(offset [b]), s [b]) ;
        }
    }
    this_thread_block().sync() ;
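
    // Once every threadblock has executed the atomicAdd's above (and assuming
    // the host zero-initialized the offset array), offset [b] holds the total
    // # of entries in bucket b for the whole problem.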

    if (gridDim.x >= NBUCKETS)
    {
        // one threadblock per bucket: compute the cumulative sum across the
        // phase1 threadblocks for that bucket
        if (blockIdx.x < NBUCKETS)
        {
            blockBucketExclusiveSum (blockIdx.x, blockbucket, nblocks) ;
        }
    }
    else
    {
        // fewer threadblocks than buckets: threadblock 0 handles all buckets
        if (blockIdx.x == 0)
        {
            #pragma unroll
            for (int b = 0 ; b < NBUCKETS ; b++)
            {
                blockBucketExclusiveSum (b, blockbucket, nblocks) ;
            }
        }
    }
}