1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383
|
//------------------------------------------------------------------------------
// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels
//------------------------------------------------------------------------------
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
// This file is #include'd into all CUDA kernels for GraphBLAS. It provides
// a
#pragma once
#undef ASSERT
#define ASSERT(x)
//------------------------------------------------------------------------------
// TODO: this will be in the jit code:
#define chunksize 128
//------------------------------------------------------------------------------
// GETA, GETB: get entries from input matrices A and B
//------------------------------------------------------------------------------
// The entries are typecasted to the type of the inputs to the operator f(x,y),
// which is either the multiplicative operator of a semiring, or a binary
// operator for eWise operations. GETA and GETB can also be used for loading
// values to be passed to the binary accumulator operator.
#if GB_FLIPXY
// The operator is "flipped", so that f(b,a) is to be computed.
// In this case, aval must be typecasted to the ytype of f, which is
// T_Y, and bval to the xtype of f (that is, T_X).
// aval = (T_Y) A (i,j)
#if GB_A_IS_PATTERN
#define GB_DECLAREA(aval)
#define GB_SHAREDA(aval)
#define GB_GETA( aval, ax, p)
#else
#define GB_DECLAREA(aval) T_Y aval
#define GB_SHAREDA(aval) __shared__ T_Y aval
#if GB_A_ISO
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ;
#else
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ;
#endif
#endif
// bval = (T_X) B (i,j)
#if GB_B_IS_PATTERN
#define GB_DECLAREB(bval)
#define GB_SHAREDB(bval)
#define GB_GETB( bval, bx, p)
#else
#define GB_DECLAREB(bval) T_X bval
#define GB_SHAREDB(bval) __shared__ T_X bval
#if GB_B_ISO
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ;
#else
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ;
#endif
#endif
#else
// The operator is not "flipped", so that f(a,b) is to be computed.
// In this case, aval must be typecasted to the xtype of f, which is
// T_X, and bval to the xtype of f (that is, T_Y).
// aval = (T_X) A (i,j)
#if GB_A_IS_PATTERN
#define GB_DECLAREA(aval)
#define GB_SHAREDA(aval)
#define GB_GETA( aval, ax, p)
#else
#define GB_DECLAREA(aval) T_X aval
#define GB_SHAREDA(aval) __shared__ T_X aval
#if GB_A_ISO
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ;
#else
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ;
#endif
#endif
// bval = (T_Y) B (i,j)
#if GB_B_IS_PATTERN
#define GB_DECLAREB(bval)
#define GB_SHAREDB(bval)
#define GB_GETB( bval, bx, p)
#else
#define GB_DECLAREB(bval) T_Y bval
#define GB_SHAREDB(bval) __shared__ T_Y bval
#if GB_B_ISO
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ;
#else
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ;
#endif
#endif
#endif
//------------------------------------------------------------------------------
// operators
//------------------------------------------------------------------------------
#if GB_C_ISO
#define GB_MULTADD( c, a ,b, i, k, j)
#define GB_DOT_TERMINAL( c ) break
#define GB_DOT_MERGE(pA,pB) \
{ \
cij_exists = true ; \
}
#define GB_CIJ_EXIST_POSTCHECK
#else
// the result the multiply must be typecast to ztype of the add.
#define GB_MULTADD( c, a, b, i, k, j ) \
{ \
T_Z x_op_y ; \
GB_MULT (x_op_y, a, b, i, k, j) ; /* x_op_y = a*b */ \
GB_ADD (c, c, x_op_y) ; /* c += x_op_y */ \
}
#define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, z )
#if GB_IS_PLUS_PAIR_REAL_SEMIRING
// cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring)
#if GB_ZTYPE_IGNORE_OVERFLOW
// plus_pair for int64, uint64, float, or double
#define GB_DOT_MERGE(pA,pB) cij++ ;
#define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ;
#else
// plus_pair semiring for small integers
#define GB_DOT_MERGE(pA,pB) \
{ \
cij_exists = true ; \
cij++ ; \
}
#define GB_CIJ_EXIST_POSTCHECK
#endif
#else
// cij += A(k,i) * B(k,j), for merge operation (general case)
#define GB_DOT_MERGE(pA,pB) \
{ \
GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ \
GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ \
cij_exists = true ; \
GB_MULTADD (cij, aki, bkj, i, k, j) ; /* cij += aki * bkj */ \
}
#define GB_CIJ_EXIST_POSTCHECK
#endif
#endif
//------------------------------------------------------------------------------
// subset of GraphBLAS.h
//------------------------------------------------------------------------------
#ifndef GRAPHBLAS_H
#define GRAPHBLAS_H
#undef restrict
#undef GB_restrict
#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ )
#define GB_restrict __restrict__
#else
#define GB_restrict
#endif
#define restrict GB_restrict
#include <stdint.h>
//#include <stdbool.h>
#include <stddef.h>
#include <string.h>
// GB_STR: convert the content of x into a string "x"
#define GB_XSTR(x) GB_STR(x)
#define GB_STR(x) #x
#undef GB_PUBLIC
#define GB_PUBLIC extern
#undef GxB_MAX_NAME_LEN
#define GxB_MAX_NAME_LEN 128
typedef uint64_t GrB_Index ;
typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
typedef struct GB_Type_opaque *GrB_Type ;
typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
typedef struct GB_SelectOp_opaque *GxB_SelectOp ;
typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
typedef struct GB_Monoid_opaque *GrB_Monoid ;
typedef struct GB_Semiring_opaque *GrB_Semiring ;
typedef struct GB_Scalar_opaque *GrB_Scalar ;
typedef struct GB_Vector_opaque *GrB_Vector ;
typedef struct GB_Matrix_opaque *GrB_Matrix ;
#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form
#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector)
#define GxB_BITMAP 4 // store matrix as a bitmap
#define GxB_FULL 8 // store matrix as full; all entries must be present
typedef void (*GxB_unary_function) (void *, const void *) ;
typedef void (*GxB_binary_function) (void *, const void *, const void *) ;
typedef bool (*GxB_select_function) // return true if A(i,j) is kept
(
GrB_Index i, // row index of A(i,j)
GrB_Index j, // column index of A(i,j)
const void *x, // value of A(i,j)
const void *thunk // optional input for select function
) ;
typedef void (*GxB_index_unary_function)
(
void *z, // output value z, of type ztype
const void *x, // input value x of type xtype; value of v(i) or A(i,j)
GrB_Index i, // row index of A(i,j)
GrB_Index j, // column index of A(i,j), or zero for v(i)
const void *y // input scalar y
) ;
typedef enum
{
// for all GrB_Descriptor fields:
GxB_DEFAULT = 0, // default behavior of the method
// for GrB_OUTP only:
GrB_REPLACE = 1, // clear the output before assigning new values to it
// for GrB_MASK only:
GrB_COMP = 2, // use the structural complement of the input
GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead)
GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values
// for GrB_INP0 and GrB_INP1 only:
GrB_TRAN = 3, // use the transpose of the input
// for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
GxB_GPU_ALWAYS = 2001,
GxB_GPU_NEVER = 2002,
// for GxB_AxB_METHOD only:
GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method
GxB_AxB_DOT = 1003, // dot product
GxB_AxB_HASH = 1004, // hash-based saxpy method
GxB_AxB_SAXPY = 1005 // saxpy method (any kind)
}
GrB_Desc_Value ;
#include "GB_opaque.h"
#endif
//------------------------------------------------------------------------------
// subset of GB.h
//------------------------------------------------------------------------------
//#include GB_iceil.h
#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b))
//#include GB_imin.h
#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y))
#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y))
//#include GB_zombie.h
#define GB_FLIP(i) (-(i)-2)
#define GB_IS_FLIPPED(i) ((i) < 0)
#define GB_IS_ZOMBIE(i) ((i) < 0)
#define GB_IS_NOT_FLIPPED(i) ((i) >= 0)
#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i))
#define GBI_UNFLIP(Ai,p,avlen) \
((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p]))
#include "GB_nnz.h"
#include "GB_partition.h"
// version for the GPU, with fewer branches
#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright) \
{ \
/* binary search of X [pleft ... pright] for integer i */ \
while (pleft < pright) \
{ \
int64_t pmiddle = (pleft + pright) >> 1 ; \
bool less = (X [pmiddle] < i) ; \
pleft = less ? (pmiddle+1) : pleft ; \
pright = less ? pright : pmiddle ; \
} \
/* binary search is narrowed down to a single item */ \
/* or it has found the list is empty */ \
ASSERT (pleft == pright || pleft == pright + 1) ; \
}
#define GB_BINARY_SEARCH(i,X,pleft,pright,found) \
{ \
GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ; \
found = (pleft == pright && X [pleft] == i) ; \
}
#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found) \
{ \
GB_BINARY_SEARCH (i, X, pleft, pright, found) \
if (!found && (pleft == pright)) \
{ \
if (i > X [pleft]) \
{ \
pleft++ ; \
} \
else \
{ \
pright++ ; \
} \
} \
}
__device__
static inline int64_t GB_search_for_vector_device
(
const int64_t p, // search for vector k that contains p
const int64_t *restrict Ap, // vector pointers to search
int64_t kleft, // left-most k to search
int64_t anvec, // Ap is of size anvec+1
int64_t avlen // A->vlen
)
{
//--------------------------------------------------------------------------
// check inputs
//--------------------------------------------------------------------------
if (Ap == NULL)
{
// A is full or bitmap
ASSERT (p >= 0 && p < avlen * anvec) ;
return ((avlen == 0) ? 0 : (p / avlen)) ;
}
// A is sparse
ASSERT (p >= 0 && p < Ap [anvec]) ;
//--------------------------------------------------------------------------
// search for k
//--------------------------------------------------------------------------
int64_t k = kleft ;
int64_t kright = anvec ;
bool found ;
GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ;
if (found)
{
// Ap [k] == p has been found, but if k is an empty vector, then the
// next vector will also contain the entry p. In that case, k needs to
// be incremented until finding the first non-empty vector for which
// Ap [k] == p.
ASSERT (Ap [k] == p) ;
while (k < anvec-1 && Ap [k+1] == p)
{
k++ ;
}
}
else
{
// p has not been found in Ap, so it appears in the middle of Ap [k-1]
// ... Ap [k], as computed by the binary search. This is the range of
// entries for the vector k-1, so k must be decremented.
k-- ;
}
//--------------------------------------------------------------------------
// return result
//--------------------------------------------------------------------------
// The entry p must reside in a non-empty vector.
ASSERT (k >= 0 && k < anvec) ;
ASSERT (Ap [k] <= p && p < Ap [k+1]) ;
return (k) ;
}
|