File: GB_cuda_ek_slice.cuh

//------------------------------------------------------------------------------
// GraphBLAS/CUDA/template/GB_cuda_ek_slice.cuh
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// This file: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// The GB_cuda_ek_slice* methods provide an efficient method for the
// threadblocks to work in parallel on a single sparse or hypersparse matrix A.
// Let Ap = A->p be an array of "pointers" of size anvec+1.
// Let Ai = A->i be the array of row indices of the sparse/hypersparse matrix.
// Let Ax = A->x be the array of values of the sparse/hypersparse matrix.
// Let anz be the # of entries in the Ai and Ax arrays.
//
// Then the matrix A can be traversed as follows (suppose it is stored by col):
//
//  for (k = 0 ; k < anvec ; k++)
//      j = k if A is sparse, or j = Ah [k] if A is hypersparse
//      for (p = Ap [k] ; p < Ap [k+1] ; p++)
//          i = Ai [p] ;
//          atype aij = Ax [p] ;
//          the entry A(i,j) has value aij, row index i, column index j, and
//              column j is the kth column in the data structure of A,
//          ...
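//
// For example (an editor's illustration, not taken from the library), a small
// sparse matrix held by column with anvec = 3 and anz = 5 might have
//
//      Ap = [ 0, 2, 2, 5 ]
//      Ai = [ 0, 3,    1, 2, 4 ]
//
// so vector k = 0 owns entries p = 0:1, vector k = 1 is empty, and vector
// k = 2 owns entries p = 2:4, since Ap [k] <= p < Ap [k+1] for each entry.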
//
// However, parallelizing that loop across the GPU threadblocks is difficult.
// Instead, consider a single loop:
//
//  for (p = 0 ; p < anz ; p++)
//      find k so that Ap [k] <= p < Ap [k+1]
//      j = k if A is sparse, or j = Ah [k] if A is hypersparse
//      i = Ai [p]
//      aij = Ax [p]
//      ...
//
// The above loop can be easily parallelized on the GPU, and the
// GB_cuda_ek_slice* methods provide a way for each thread to find k for each
// entry at position p.  The methods assume a single threadblock is given the
// task to compute iterations p = pfirst : plast-1, where plast = min (anz,
// pfirst + max_pchunk) for the loop above.
//
// First, all threads call GB_cuda_ek_slice_setup, and do two binary searches.
// This determines the slope, and gives a way to estimate the vector k that
// contains any entry p in the given range pfirst:plast-1.  This estimate is
// then used in GB_cuda_ek_slice_entry to determine the k for any given p in
// that range.
//
// If the thread that computes k for a given p is also the thread that uses k,
// then there is no need for the threadblock to share that computation.
// Otherwise, the set of k values can be computed in a shared array ks, using
// the single method GB_cuda_ek_slice.

// FIXME: discuss grid-stride loops, the pdelta loop, and the max chunk size
// here

// question: why are chunks necessary?  why not just do ek_slice_setup across
// all entries in one go?  answer: the slope estimate is only useful for a
// small range of entries; for a large range of entries, a non-uniform entry
// distribution can make the estimate so poor that finding k degenerates into
// an exhaustive linear search.
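
// The following is an editor's sketch, not library code: one plausible way a
// kernel could use these methods is for each threadblock to claim chunks of
// at most max_pchunk entries in a grid-stride fashion, fill a shared array ks
// for its chunk with GB_cuda_ek_slice, and then let its threads sweep the
// pdelta loop.  The names example_kernel and EK_MAX_PCHUNK are hypothetical;
// EK_MAX_PCHUNK is assumed to be a compile-time bound >= max_pchunk.
//
//  __global__ void example_kernel
//  (
//      const GB_Ap_TYPE *Ap,       // array of size anvec+1
//      const int64_t anvec,        // # of vectors in the matrix A
//      const int64_t anz,          // # of entries in the matrix A
//      const int64_t max_pchunk    // max # of entries per chunk
//  )
//  {
//      __shared__ int64_t ks [EK_MAX_PCHUNK] ;
//      for (int64_t pfirst = blockIdx.x * max_pchunk ; pfirst < anz ;
//                   pfirst += gridDim.x * max_pchunk)
//      {
//          // all threads of this block find ks [0:my_chunk_size-1]
//          int64_t my_chunk_size =
//              GB_cuda_ek_slice (Ap, anvec, anz, pfirst, max_pchunk, ks) ;
//          // each thread handles a subset of the entries in this chunk
//          for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ;
//                       pdelta += blockDim.x)
//          {
//              int64_t p = pfirst + pdelta ;   // entry at position p ...
//              int64_t k = ks [pdelta] ;       // ... lies in vector k
//              // ... work on the entry with row index Ai [p], value Ax [p],
//              // in vector k of A ...
//          }
//          // ensure all threads are done with ks before the next chunk
//          // overwrites it
//          this_thread_block().sync() ;
//      }
//  }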

//------------------------------------------------------------------------------
// GB_cuda_ek_slice_setup
//------------------------------------------------------------------------------

static __device__ __inline__ void GB_cuda_ek_slice_setup
(
    // inputs, not modified:
    const GB_Ap_TYPE *Ap,       // array of size anvec+1
    const int64_t anvec,        // # of vectors in the matrix A
    const int64_t anz,          // # of entries in the sparse/hyper matrix A
    const int64_t pfirst,       // first entry in A to find k
    const int64_t max_pchunk,   // max # of entries in A to find k
    // output:
    int64_t *kfirst,            // first vector of the slice for this chunk
    int64_t *klast,             // last vector of the slice for this chunk
    int64_t *my_chunk_size,     // size of the chunk for this threadblock
    int64_t *anvec1,            // anvec-1
    float *slope                // slope of vectors from kfirst to klast
)
{

    //--------------------------------------------------------------------------
    // determine the range of entries pfirst:plast-1 for this chunk
    //--------------------------------------------------------------------------

    // The slice for each threadblock contains entries pfirst:plast-1 of A.
    // The threadblock works on a chunk of entries in Ai/Ax [pfirst...plast-1].

    ASSERT (pfirst < anz) ;
    ASSERT (max_pchunk > 0) ;
    int64_t plast = pfirst + max_pchunk ;
    plast = GB_IMIN (plast, anz) ;
    (*my_chunk_size) = plast - pfirst ;
    ASSERT ((*my_chunk_size) > 0) ;

    //--------------------------------------------------------------------------
    // estimate the first and last vectors for this chunk
    //--------------------------------------------------------------------------

    // find kfirst, the first vector of the slice for this chunk.  kfirst is
    // the vector that owns the entry Ai [pfirst] and Ax [pfirst].  The search
    // does not need to be exact, so kfirst is an estimate.

    (*kfirst) = 0 ;
    int64_t kright = anvec ;
    GB_trim_binary_search (pfirst, Ap, GB_Ap_IS_32, kfirst, &kright) ;

    // find klast, the last vector of the slice for this chunk.  klast is the
    // vector that owns the entry Ai [plast-1] and Ax [plast-1].  The search
    // does not have to be exact, so klast is an estimate.

    (*klast) = (*kfirst) ;
    kright = anvec ;
    GB_trim_binary_search (plast, Ap, GB_Ap_IS_32, klast, &kright) ;

    //--------------------------------------------------------------------------
    // find slope of vectors in this chunk, and return result
    //--------------------------------------------------------------------------

    // number of vectors in A for this chunk, where
    // Ap [kfirst:klast-1] will be searched.
    int64_t nk = (*klast) - (*kfirst) + 1 ;

    // slope is the estimated # of vectors in this chunk, divided by the
    // chunk size.
    (*slope) = ((float) nk) / ((float) (*my_chunk_size)) ;
    (*anvec1) = anvec - 1 ;
}
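
// For example (an editor's illustration with made-up numbers): if pfirst =
// 4096, max_pchunk = 1024, and anz = 40000, then my_chunk_size = 1024.  If
// the two binary searches give the estimates kfirst = 10 and klast = 25,
// then nk = 16 and slope = 16/1024 = 0.015625, so the entry at offset pdelta
// in the chunk is estimated to lie in vector kfirst + slope*pdelta; this
// estimate is then refined by GB_cuda_ek_slice_entry below.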

//------------------------------------------------------------------------------
// GB_cuda_ek_slice_entry
//------------------------------------------------------------------------------

// Let p = pfirst + pdelta, where pdelta ranges from 0:my_chunk_size-1, and so
// p ranges from pfirst:(pfirst+my_chunk_size-1), and where my_chunk_size is
// normally of size max_pchunk, unless this is the last chunk in the entire
// matrix.  GB_cuda_ek_slice_entry computes k for this entry, so that the kth
// vector contains the entry aij with row index i = Ai [p] and value aij = Ax
// [p] (assuming that A is a sparse or hypersparse matrix held by column).
// That is, Ap [k] <= p < Ap [k+1] will hold.  If A is sparse and held by
// column, then aij is in column j = k.  If A is hypersparse, then aij is in
// column j = Ah [k].

// The method returns the index k of the vector in A that contains the pth
// entry in A, at position p = pfirst + pdelta.

static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
(
    // output:
    int64_t *p_handle,          // p = pfirst + pdelta
    // inputs, not modified:
    const int64_t pdelta,       // find the k value of the pfirst+pdelta entry
    const int64_t pfirst,       // first entry in A to find k (for which
                                // pdelta=0)
    const GB_Ap_TYPE *Ap,       // array of size anvec+1
    const int64_t anvec1,       // anvec-1
    const int64_t kfirst,       // estimate of first vector in the chunk
    const float slope           // estimate # vectors in chunk / my_chunk_size
)
{

    // get a rough estimate of k for the pfirst + pdelta entry
    int64_t k = kfirst + (int64_t) (slope * ((float) pdelta)) ;

    // The estimate of k cannot be smaller than kfirst, but it might be bigger
    // than anvec-1, so ensure it is in the valid range, kfirst to anvec-1.
    k = GB_IMIN (k, anvec1) ;

    // look for p in Ap, where p is in range pfirst:plast-1
    // where pfirst >= 0 and plast <= anz
    int64_t p = pfirst + pdelta ;
    (*p_handle) = p ;

    // linear-time search for the k value of the pth entry
    while (Ap [k+1] <= p) k++ ;
    while (Ap [k  ] >  p) k-- ;

    // the pth entry of A is contained in the kth vector of A
    ASSERT (Ap [k] <= p && p < Ap [k+1]) ;

    // return the result k
    return (k) ;
}
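
// An editor's sketch, not library code: when each thread both computes and
// consumes k (so no shared ks array is needed), the two methods above can be
// used directly inside a threadblock, along the following lines:
//
//      int64_t kfirst, klast, my_chunk_size, anvec1 ;
//      float slope ;
//      GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, max_pchunk,
//          &kfirst, &klast, &my_chunk_size, &anvec1, &slope) ;
//      for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ;
//                   pdelta += blockDim.x)
//      {
//          int64_t p ;
//          int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap,
//              anvec1, kfirst, slope) ;
//          // this thread uses k itself, so nothing needs to be shared
//          // ... work on the entry with row index Ai [p], value Ax [p],
//          // in vector k of A ...
//      }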

//------------------------------------------------------------------------------
// GB_cuda_ek_slice
//------------------------------------------------------------------------------

// GB_cuda_ek_slice finds the vector k that owns each entry in the sparse or
// hypersparse matrix A, in Ai/Ax [pfirst:plast-1], where plast = min (anz,
// pfirst+max_pchunk).  Returns my_chunk_size = plast - pfirst, which is the
// size of the chunk operated on by this threadblock.

// The function GB_cuda_ek_slice behaves somewhat like GB_ek_slice used on the
// CPU.  The latter is for OpenMP parallelism on the CPU only; it does not
// need to compute ks.

static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
(
    // inputs, not modified:
    const GB_Ap_TYPE *Ap,       // array of size anvec+1
    const int64_t anvec,        // # of vectors in the matrix A
    const int64_t anz,          // # of entries in the sparse/hyper matrix A
    const int64_t pfirst,       // first entry in A to find k
    const int64_t max_pchunk,   // max # of entries in A to find k
    // output:
    int64_t *ks                 // k value for each pfirst:plast-1
)
{

    //--------------------------------------------------------------------------
    // determine the chunk for this threadblock and its slope
    //--------------------------------------------------------------------------

    int64_t my_chunk_size, anvec1, kfirst, klast ;
    float slope ;
    GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, max_pchunk,
        &kfirst, &klast, &my_chunk_size, &anvec1, &slope) ;

    //--------------------------------------------------------------------------
    // find the kth vector that contains each entry p = pfirst:plast-1
    //--------------------------------------------------------------------------

    for (int64_t pdelta = threadIdx.x ;
                 pdelta < my_chunk_size ;
                 pdelta += blockDim.x)
    {

        //----------------------------------------------------------------------
        // determine the kth vector that contains the pth entry
        //----------------------------------------------------------------------

        int64_t p ;     // unused, p = pfirst + pdelta
        int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap, anvec1,
            kfirst, slope) ;

        //----------------------------------------------------------------------
        // save the result in ks
        //----------------------------------------------------------------------

        // the pth entry of the matrix is in vector k
        ks [pdelta] = k ;
    }

    //--------------------------------------------------------------------------
    // sync all threads and return result
    //--------------------------------------------------------------------------

    this_thread_block().sync() ;
    return (my_chunk_size) ;
}
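
// Note on usage: the ks array is provided by the caller; as described at the
// top of this file it is intended to be a threadblock-shared array, and it
// must be able to hold my_chunk_size entries (at most max_pchunk); see the
// kernel sketch near the top of this file.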