//------------------------------------------------------------------------------
// GraphBLAS/CUDA/template/GB_cuda_ek_slice.cuh
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// This file: Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// The GB_cuda_ek_slice* methods provide an efficient method for the
// threadblocks to work in parallel on a single sparse or hypersparse matrix A.
// Let Ap = A->p be an array of "pointers" of size anvec+1.
// Let Ai = A->i be the array of row indices of the sparse/hypersparse matrix.
// Let Ax = A->x be the array of values of the sparse/hypersparse matrix.
// Let anz be the # of entries in the Ai and Ax arrays.
//
// Then the matrix A can be traversed as follows (suppose it is stored by col):
//
// for (k = 0 ; k < anvec ; k++)
// j = k if A is sparse, or j = Ah [k] if A is hypersparse
// for (p = Ap [k] ; p < Ap [k+1] ; p++)
// i = Ai [p] ;
// atype aij = Ax [p] ;
// the entry A(i,j) has value aij, row index i, column index j, and
// column j is the kth column in the data structure of A,
// ...
//
// However, parallelizing that loop across the GPU threadblocks is difficult.
// Instead, consider a single loop:
//
// for (p = 0 ; p < anz ; p++)
// find k so that Ap [k] <= p < Ap [k+1]
// j = k if A is sparse, or j = Ah [k] if A is hypersparse
// i = Ai [p]
// aij = Ax [p]
// ...
//
// The above loop can be easily parallelized on the GPU, and the
// GB_cuda_ek_slice* methods provide a way for each thread to find k for each
// entry at position p. The methods assume a single threadblock is given the
// task to compute iterations p = pfirst : plast-1, where plast = min (anz,
// pfirst + max_pchunk) for the loop above.
//
// First, all threads call GB_cuda_ek_slice_setup, which does two binary
// searches.
// This determines the slope, and gives a way to estimate the vector k that
// contains any entry p in the given range pfirst:plast-1. This estimate is
// then used in GB_cuda_ek_slice_entry to determine the k for any given p in
// that range.
//
// If the thread that computes k for a given p is also the thread that uses k,
// then there is no need for the threadblock to share that computation.
// Otherwise, the set of k values can be computed in a shared array ks, using
// the single method GB_cuda_ek_slice.
//
// Grid-stride iteration and chunking: a kernel gives each threadblock a
// chunk of at most max_pchunk entries, typically iterating over the chunks
// with a grid-stride loop. Within a chunk, the threads of the block
// cooperate via the pdelta loop, with pdelta starting at threadIdx.x and
// advancing by blockDim.x until pdelta reaches my_chunk_size.
//
// Why are chunks necessary? Why not do ek_slice_setup across all entries in
// one go? Because the slope is only a useful estimate for a small range of
// entries; for a large range, a non-uniform distribution of entries across
// the vectors distorts the slope, and the linear search that refines the
// estimate can degenerate into an exhaustive scan.
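//
// For illustration only (a hypothetical sketch, not an existing kernel in
// this library): a kernel using GB_cuda_ek_slice over all chunks of A might
// look like the following, where KS_SIZE is a hypothetical compile-time
// bound with KS_SIZE >= max_pchunk:
//
//      __shared__ int64_t ks [KS_SIZE] ;
//      for (int64_t pfirst = blockIdx.x * max_pchunk ; pfirst < anz ;
//           pfirst += ((int64_t) gridDim.x) * max_pchunk)
//      {
//          // find the vector k of each entry in this chunk
//          int64_t my_chunk_size = GB_cuda_ek_slice (Ap, anvec, anz,
//              pfirst, max_pchunk, ks) ;
//          for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ;
//               pdelta += blockDim.x)
//          {
//              int64_t p = pfirst + pdelta ;   // entry is Ai [p], Ax [p]
//              int64_t k = ks [pdelta] ;       // entry is in vector k
//              // ... work on the entry ...
//          }
//          // ensure all threads are done with ks before the next chunk
//          // overwrites it
//          this_thread_block().sync() ;
//      }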
//------------------------------------------------------------------------------
// GB_cuda_ek_slice_setup
//------------------------------------------------------------------------------
static __device__ __inline__ void GB_cuda_ek_slice_setup
(
// inputs, not modified:
const GB_Ap_TYPE *Ap, // array of size anvec+1
const int64_t anvec, // # of vectors in the matrix A
const int64_t anz, // # of entries in the sparse/hyper matrix A
const int64_t pfirst, // first entry in A to find k
const int64_t max_pchunk, // max # of entries in A to find k
// output:
int64_t *kfirst, // first vector of the slice for this chunk
int64_t *klast, // last vector of the slice for this chunk
int64_t *my_chunk_size, // size of the chunk for this threadblock
int64_t *anvec1, // anvec-1
float *slope // slope of vectors from kfirst to klast
)
{
//--------------------------------------------------------------------------
// determine the range of entries pfirst:plast-1 for this chunk
//--------------------------------------------------------------------------
// The slice for each threadblock contains entries pfirst:plast-1 of A.
// The threadblock works on a chunk of entries in Ai/Ax [pfirst...plast-1].
ASSERT (pfirst < anz) ;
ASSERT (max_pchunk > 0) ;
int64_t plast = pfirst + max_pchunk ;
plast = GB_IMIN (plast, anz) ;
(*my_chunk_size) = plast - pfirst ;
ASSERT ((*my_chunk_size) > 0) ;
//--------------------------------------------------------------------------
// estimate the first and last vectors for this chunk
//--------------------------------------------------------------------------
// find kfirst, the first vector of the slice for this chunk. kfirst is
// the vector that owns the entry Ai [pfirst] and Ax [pfirst]. The search
// does not need to be exact, so kfirst is an estimate.
(*kfirst) = 0 ;
int64_t kright = anvec ;
GB_trim_binary_search (pfirst, Ap, GB_Ap_IS_32, kfirst, &kright) ;
// find klast, the last vector of the slice for this chunk. klast is the
// vector that owns the entry Ai [plast-1] and Ax [plast-1]. The search
// does not have to be exact, so klast is an estimate.
(*klast) = (*kfirst) ;
kright = anvec ;
GB_trim_binary_search (plast, Ap, GB_Ap_IS_32, klast, &kright) ;
//--------------------------------------------------------------------------
// find slope of vectors in this chunk, and return result
//--------------------------------------------------------------------------
// number of vectors in A for this chunk, where
// Ap [kfirst:klast] will be searched.
int64_t nk = (*klast) - (*kfirst) + 1 ;
// slope is the estimated # of vectors in this chunk, divided by the
// chunk size.
(*slope) = ((float) nk) / ((float) (*my_chunk_size)) ;
(*anvec1) = anvec - 1 ;
}
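// A worked example (for illustration only): let Ap = [0, 2, 2, 7, 10], so
// anvec = 4 and anz = 10. With pfirst = 4 and max_pchunk = 4: plast = 8 and
// my_chunk_size = 4. The trimmed binary searches estimate kfirst near 2
// (Ap [2] <= 4 < Ap [3]) and klast near 3 (Ap [3] <= 8 <= Ap [4]), giving
// nk = 2 and slope = 2/4 = 0.5. For pdelta = 3 (p = 7),
// GB_cuda_ek_slice_entry then estimates k = 2 + (int64_t) (0.5 * 3) = 3,
// which the linear search confirms: Ap [3] <= 7 < Ap [4].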
//------------------------------------------------------------------------------
// GB_cuda_ek_slice_entry
//------------------------------------------------------------------------------
// Let p = pfirst + pdelta, where pdelta ranges from 0:my_chunk_size-1, so
// p ranges from pfirst:(pfirst+my_chunk_size-1), and where my_chunk_size is
// normally equal to max_pchunk, unless this is the last chunk in the entire
// matrix. GB_cuda_ek_slice_entry computes k for this entry, so that the kth
// vector contains the entry aij with row index i = Ai [p] and value aij = Ax
// [p] (assuming that A is a sparse or hypersparse matrix held by column).
// That is, Ap [k] <= p < Ap [k+1] will hold. If A is sparse and held by
// column, then aij is in column j = k. If A is hypersparse, then aij is in
// column j = Ah [k].
// The method returns the index k of the vector in A that contains the pth
// entry in A, at position p = pfirst + pdelta.
static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
(
// output:
int64_t *p_handle, // p = pfirst + pdelta
// inputs, not modified:
const int64_t pdelta, // find the k value of the pfirst+pdelta entry
const int64_t pfirst, // first entry in A to find k (for which
// pdelta=0)
const GB_Ap_TYPE *Ap, // array of size anvec+1
const int64_t anvec1, // anvec-1
const int64_t kfirst, // estimate of first vector in the chunk
const float slope // estimate # vectors in chunk / my_chunk_size
)
{
// get a rough estimate of k for the pfirst + pdelta entry
int64_t k = kfirst + (int64_t) (slope * ((float) pdelta)) ;
// The estimate of k cannot be smaller than kfirst, but it might be bigger
// than anvec-1, so ensure it is in the valid range, kfirst to anvec-1.
k = GB_IMIN (k, anvec1) ;
// look for p in Ap, where p is in the range pfirst:plast-1,
// with pfirst >= 0 and plast <= anz
int64_t p = pfirst + pdelta ;
(*p_handle) = p ;
// linear search, starting from the estimate, for the k value of the pth entry
while (Ap [k+1] <= p) k++ ;
while (Ap [k ] > p) k-- ;
// the pth entry of A is contained in the kth vector of A
ASSERT (Ap [k] <= p && p < Ap [k+1]) ;
// return the result k
return (k) ;
}
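// For example (a sketch, not code from this file): if the thread that
// computes k for a given p is also the thread that uses it, the threadblock
// needs no shared ks array, and each thread can simply do:
//
//      for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ;
//           pdelta += blockDim.x)
//      {
//          int64_t p ;
//          int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap,
//              anvec1, kfirst, slope) ;
//          // ... use entry Ai [p], Ax [p] in vector k ...
//      }
//
// where kfirst, anvec1, and slope come from a prior call to
// GB_cuda_ek_slice_setup.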
//------------------------------------------------------------------------------
// GB_cuda_ek_slice
//------------------------------------------------------------------------------
// GB_cuda_ek_slice finds the vector k that owns each entry in the sparse or
// hypersparse matrix A, in Ai/Ax [pfirst:plast-1], where plast = min (anz,
// pfirst+max_pchunk). Returns my_chunk_size = plast - pfirst, which is the
// size of the chunk operated on by this threadblock.
// The function GB_cuda_ek_slice behaves somewhat like GB_ek_slice used on the
// CPU. The latter is for OpenMP parallelism on the CPU only; it does not
// need to compute ks.
static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
(
// inputs, not modified:
const GB_Ap_TYPE *Ap, // array of size anvec+1
const int64_t anvec, // # of vectors in the matrix A
const int64_t anz, // # of entries in the sparse/hyper matrix A
const int64_t pfirst, // first entry in A to find k
const int64_t max_pchunk, // max # of entries in A to find k
// output:
int64_t *ks // k value for each pfirst:plast-1
)
{
//--------------------------------------------------------------------------
// determine the chunk for this threadblock and its slope
//--------------------------------------------------------------------------
int64_t my_chunk_size, anvec1, kfirst, klast ;
float slope ;
GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, max_pchunk,
&kfirst, &klast, &my_chunk_size, &anvec1, &slope) ;
//--------------------------------------------------------------------------
// find the kth vector that contains each entry p = pfirst:plast-1
//--------------------------------------------------------------------------
for (int64_t pdelta = threadIdx.x ;
pdelta < my_chunk_size ;
pdelta += blockDim.x)
{
//----------------------------------------------------------------------
// determine the kth vector that contains the pth entry
//----------------------------------------------------------------------
int64_t p ; // unused, p = pfirst + pdelta
int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap, anvec1,
kfirst, slope) ;
//----------------------------------------------------------------------
// save the result in ks
//----------------------------------------------------------------------
// the pth entry of the matrix is in vector k
ks [pdelta] = k ;
}
//--------------------------------------------------------------------------
// sync all threads and return result
//--------------------------------------------------------------------------
this_thread_block().sync() ;
return (my_chunk_size) ;
}