File: GB_cuda_kernel.h

package info (click to toggle)
suitesparse-graphblas 7.4.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 67,112 kB
  • sloc: ansic: 1,072,243; cpp: 8,081; sh: 512; makefile: 506; asm: 369; python: 125; awk: 10
file content (383 lines) | stat: -rw-r--r-- 14,515 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
//------------------------------------------------------------------------------
// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// This file is #include'd into all CUDA kernels for GraphBLAS.  It provides
// a

#pragma once
#undef  ASSERT
#define ASSERT(x)

//------------------------------------------------------------------------------
// TODO: this will be in the jit code:
#define chunksize 128 

//------------------------------------------------------------------------------
// GETA, GETB: get entries from input matrices A and B
//------------------------------------------------------------------------------

// The entries are typecasted to the type of the inputs to the operator f(x,y),
// which is either the multiplicative operator of a semiring, or a binary
// operator for eWise operations.  GETA and GETB can also be used for loading
// values to be passed to the binary accumulator operator.

#if GB_FLIPXY

    // The operator is "flipped", so that f(b,a) is to be computed.
    // In this case, aval must be typecasted to the ytype of f, which is
    // T_Y, and bval to the xtype of f (that is, T_X).

    // aval = (T_Y) A (i,j)
    #if GB_A_IS_PATTERN
        #define GB_DECLAREA(aval)
        #define GB_SHAREDA(aval)
        #define GB_GETA( aval, ax, p)
    #else
        #define GB_DECLAREA(aval) T_Y aval
        #define GB_SHAREDA(aval) __shared__ T_Y aval
        #if GB_A_ISO
            #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ;
        #else
            #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ;
        #endif
    #endif

    // bval = (T_X) B (i,j)
    #if GB_B_IS_PATTERN
        #define GB_DECLAREB(bval)
        #define GB_SHAREDB(bval)
        #define GB_GETB( bval, bx, p)
    #else
        #define GB_DECLAREB(bval) T_X bval
        #define GB_SHAREDB(bval) __shared__ T_X bval
        #if GB_B_ISO
            #define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ;
        #else
            #define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ;
        #endif
    #endif

#else

    // The operator is not "flipped", so that f(a,b) is to be computed.
    // In this case, aval must be typecasted to the xtype of f, which is
    // T_X, and bval to the xtype of f (that is, T_Y).

    // aval = (T_X) A (i,j)
    #if GB_A_IS_PATTERN
        #define GB_DECLAREA(aval)
        #define GB_SHAREDA(aval)
        #define GB_GETA( aval, ax, p)
    #else
        #define GB_DECLAREA(aval) T_X aval
        #define GB_SHAREDA(aval) __shared__ T_X aval
        #if GB_A_ISO
            #define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ;
        #else
            #define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ;
        #endif
    #endif

    // bval = (T_Y) B (i,j)
    #if GB_B_IS_PATTERN
        #define GB_DECLAREB(bval)
        #define GB_SHAREDB(bval)
        #define GB_GETB( bval, bx, p)
    #else
        #define GB_DECLAREB(bval) T_Y bval
        #define GB_SHAREDB(bval) __shared__ T_Y bval
        #if GB_B_ISO
            #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ;
        #else
            #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ;
        #endif
    #endif

#endif

//------------------------------------------------------------------------------
// operators
//------------------------------------------------------------------------------

#if GB_C_ISO

    #define GB_MULTADD( c, a ,b, i, k, j)
    #define GB_DOT_TERMINAL( c ) break
    #define GB_DOT_MERGE(pA,pB)                                         \
    {                                                                   \
        cij_exists = true ;                                             \
    }
    #define GB_CIJ_EXIST_POSTCHECK

#else

    // the result the multiply must be typecast to ztype of the add.
    #define GB_MULTADD( c, a, b, i, k, j )                              \
    {                                                                   \
        T_Z x_op_y ;                                                    \
        GB_MULT (x_op_y, a, b, i, k, j) ;   /* x_op_y = a*b */          \
        GB_ADD (c, c, x_op_y) ;             /* c += x_op_y  */          \
    }

    #define GB_DOT_TERMINAL( c ) GB_IF_TERMINAL_BREAK ( c, z )

    #if GB_IS_PLUS_PAIR_REAL_SEMIRING

        // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring)
        #if GB_ZTYPE_IGNORE_OVERFLOW
            // plus_pair for int64, uint64, float, or double
            #define GB_DOT_MERGE(pA,pB) cij++ ;
            #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ;
        #else
            // plus_pair semiring for small integers
            #define GB_DOT_MERGE(pA,pB)                                     \
            {                                                               \
                cij_exists = true ;                                         \
                cij++ ;                                                     \
            }
            #define GB_CIJ_EXIST_POSTCHECK
        #endif

    #else

        // cij += A(k,i) * B(k,j), for merge operation (general case)
        #define GB_DOT_MERGE(pA,pB)                                         \
        {                                                                   \
            GB_GETA (aki, Ax, pA) ;         /* aki = A(k,i) */              \
            GB_GETB (bkj, Bx, pB) ;         /* bkj = B(k,j) */              \
            cij_exists = true ;                                             \
            GB_MULTADD (cij, aki, bkj, i, k, j) ;  /* cij += aki * bkj */   \
        }
        #define GB_CIJ_EXIST_POSTCHECK

    #endif

#endif

//------------------------------------------------------------------------------
// subset of GraphBLAS.h
//------------------------------------------------------------------------------

#ifndef GRAPHBLAS_H
#define GRAPHBLAS_H

#undef restrict
#undef GB_restrict
#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ )
    #define GB_restrict __restrict__
#else
    #define GB_restrict
#endif
#define restrict GB_restrict

#include <stdint.h>
//#include <stdbool.h>
#include <stddef.h>
#include <string.h>

// GB_STR: convert the content of x into a string "x"
#define GB_XSTR(x) GB_STR(x)
#define GB_STR(x) #x

#undef  GB_PUBLIC
#define GB_PUBLIC extern
#undef  GxB_MAX_NAME_LEN
#define GxB_MAX_NAME_LEN 128

typedef uint64_t GrB_Index ;
typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
typedef struct GB_Type_opaque *GrB_Type ;
typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
typedef struct GB_SelectOp_opaque *GxB_SelectOp ;
typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
typedef struct GB_Monoid_opaque *GrB_Monoid ;
typedef struct GB_Semiring_opaque *GrB_Semiring ;
typedef struct GB_Scalar_opaque *GrB_Scalar ;
typedef struct GB_Vector_opaque *GrB_Vector ;
typedef struct GB_Matrix_opaque *GrB_Matrix ;

#define GxB_HYPERSPARSE 1   // store matrix in hypersparse form
#define GxB_SPARSE      2   // store matrix as sparse form (compressed vector)
#define GxB_BITMAP      4   // store matrix as a bitmap
#define GxB_FULL        8   // store matrix as full; all entries must be present

typedef void (*GxB_unary_function)  (void *, const void *) ;
typedef void (*GxB_binary_function) (void *, const void *, const void *) ;

typedef bool (*GxB_select_function)      // return true if A(i,j) is kept
(
    GrB_Index i,                // row index of A(i,j)
    GrB_Index j,                // column index of A(i,j)
    const void *x,              // value of A(i,j)
    const void *thunk           // optional input for select function
) ;

typedef void (*GxB_index_unary_function)
(
    void *z,            // output value z, of type ztype
    const void *x,      // input value x of type xtype; value of v(i) or A(i,j)
    GrB_Index i,        // row index of A(i,j)
    GrB_Index j,        // column index of A(i,j), or zero for v(i)
    const void *y       // input scalar y
) ;

typedef enum
{
    // for all GrB_Descriptor fields:
    GxB_DEFAULT = 0,    // default behavior of the method

    // for GrB_OUTP only:
    GrB_REPLACE = 1,    // clear the output before assigning new values to it

    // for GrB_MASK only:
    GrB_COMP = 2,       // use the structural complement of the input
    GrB_SCMP = 2,       // same as GrB_COMP (historical; use GrB_COMP instead)
    GrB_STRUCTURE = 4,  // use the only pattern of the mask, not its values

    // for GrB_INP0 and GrB_INP1 only:
    GrB_TRAN = 3,       // use the transpose of the input

    // for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
    GxB_GPU_ALWAYS  = 2001,
    GxB_GPU_NEVER   = 2002,

    // for GxB_AxB_METHOD only:
    GxB_AxB_GUSTAVSON = 1001,   // gather-scatter saxpy method
    GxB_AxB_DOT       = 1003,   // dot product
    GxB_AxB_HASH      = 1004,   // hash-based saxpy method
    GxB_AxB_SAXPY     = 1005    // saxpy method (any kind)
}
GrB_Desc_Value ;

#include "GB_opaque.h"
#endif

//------------------------------------------------------------------------------
// subset of GB.h
//------------------------------------------------------------------------------

//#include GB_iceil.h
#define GB_ICEIL(a,b) (((a) + (b) - 1) / (b))
//#include GB_imin.h
#define GB_IMAX(x,y) (((x) > (y)) ? (x) : (y))
#define GB_IMIN(x,y) (((x) < (y)) ? (x) : (y))
//#include GB_zombie.h
#define GB_FLIP(i)             (-(i)-2)
#define GB_IS_FLIPPED(i)       ((i) < 0)
#define GB_IS_ZOMBIE(i)        ((i) < 0)
#define GB_IS_NOT_FLIPPED(i)   ((i) >= 0)
#define GB_UNFLIP(i)           (((i) < 0) ? GB_FLIP(i) : (i))
#define GBI_UNFLIP(Ai,p,avlen)      \
    ((Ai == NULL) ? ((p) % (avlen)) : GB_UNFLIP (Ai [p]))

#include "GB_nnz.h"
#include "GB_partition.h"

// version for the GPU, with fewer branches
#define GB_TRIM_BINARY_SEARCH(i,X,pleft,pright)                             \
{                                                                           \
    /* binary search of X [pleft ... pright] for integer i */               \
    while (pleft < pright)                                                  \
    {                                                                       \
        int64_t pmiddle = (pleft + pright) >> 1 ;                           \
        bool less = (X [pmiddle] < i) ;                                     \
        pleft  = less ? (pmiddle+1) : pleft ;                               \
        pright = less ? pright : pmiddle ;                                  \
    }                                                                       \
    /* binary search is narrowed down to a single item */                   \
    /* or it has found the list is empty */                                 \
    ASSERT (pleft == pright || pleft == pright + 1) ;                       \
}

#define GB_BINARY_SEARCH(i,X,pleft,pright,found)                            \
{                                                                           \
    GB_TRIM_BINARY_SEARCH (i, X, pleft, pright) ;                           \
    found = (pleft == pright && X [pleft] == i) ;                           \
}

#define GB_SPLIT_BINARY_SEARCH(i,X,pleft,pright,found)                      \
{                                                                           \
    GB_BINARY_SEARCH (i, X, pleft, pright, found)                           \
    if (!found && (pleft == pright))                                        \
    {                                                                       \
        if (i > X [pleft])                                                  \
        {                                                                   \
            pleft++ ;                                                       \
        }                                                                   \
        else                                                                \
        {                                                                   \
            pright++ ;                                                      \
        }                                                                   \
    }                                                                       \
}

__device__
static inline int64_t GB_search_for_vector_device
(
    const int64_t p,                // search for vector k that contains p
    const int64_t *restrict Ap,  // vector pointers to search
    int64_t kleft,                  // left-most k to search
    int64_t anvec,                  // Ap is of size anvec+1
    int64_t avlen                   // A->vlen
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    if (Ap == NULL)
    { 
        // A is full or bitmap
        ASSERT (p >= 0 && p < avlen * anvec) ;
        return ((avlen == 0) ? 0 : (p / avlen)) ;
    }

    // A is sparse
    ASSERT (p >= 0 && p < Ap [anvec]) ;

    //--------------------------------------------------------------------------
    // search for k
    //--------------------------------------------------------------------------

    int64_t k = kleft ;
    int64_t kright = anvec ;
    bool found ;
    GB_SPLIT_BINARY_SEARCH (p, Ap, k, kright, found) ;
    if (found)
    {
        // Ap [k] == p has been found, but if k is an empty vector, then the
        // next vector will also contain the entry p.  In that case, k needs to
        // be incremented until finding the first non-empty vector for which
        // Ap [k] == p.
        ASSERT (Ap [k] == p) ;
        while (k < anvec-1 && Ap [k+1] == p)
        { 
            k++ ;
        }
    }
    else
    { 
        // p has not been found in Ap, so it appears in the middle of Ap [k-1]
        // ... Ap [k], as computed by the binary search.  This is the range of
        // entries for the vector k-1, so k must be decremented.
        k-- ;
    }

    //--------------------------------------------------------------------------
    // return result
    //--------------------------------------------------------------------------

    // The entry p must reside in a non-empty vector.
    ASSERT (k >= 0 && k < anvec) ;
    ASSERT (Ap [k] <= p && p < Ap [k+1]) ;

    return (k) ;
}