using namespace cooperative_groups ;
#include "GB_cuda_ek_slice.cuh"
#define log2_chunk_size 10
#define chunk_size 1024
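
// GB_cuda_apply_unop_kernel: apply the operator to each entry of A, Cx = op (A).
// Bitmap/full matrices use a flat grid-stride loop over all anz entries.
// Sparse/hypersparse matrices whose operator needs the column index j use the
// ek_slice method, in which each threadblock works on chunks of
// chunk_size = 2^log2_chunk_size entries of A.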
__global__ void GB_cuda_apply_unop_kernel
(
    GB_void *Cx_out,            // output array, of type GB_C_TYPE
    const GB_void *thunk,       // scalar y, of type GB_Y_TYPE (if the op uses it)
    GrB_Matrix A                // input matrix
)
{
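
    // get the contents of A, the output array Cx, and the thunk scalar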
    GB_A_NHELD (anz) ;      // anz = # of entries held in A

    #if ( GB_DEPENDS_ON_X )
    const GB_A_TYPE *__restrict__ Ax = (GB_A_TYPE *) A->x ;
    #endif

    #if ( GB_A_IS_SPARSE || GB_A_IS_HYPER )
        #if ( GB_DEPENDS_ON_I )
        const GB_Ai_TYPE *__restrict__ Ai = (GB_Ai_TYPE *) A->i ;
        #endif
        #if ( GB_DEPENDS_ON_J )
            #if ( GB_A_IS_HYPER )
            const GB_Aj_TYPE *__restrict__ Ah = (GB_Aj_TYPE *) A->h ;
            #endif
            const GB_Ap_TYPE *__restrict__ Ap = (GB_Ap_TYPE *) A->p ;
        #endif
    #endif

    #if ( GB_A_IS_BITMAP )
    const int8_t *__restrict__ Ab = (int8_t *) A->b ;
    #endif

    GB_C_TYPE *__restrict__ Cx = (GB_C_TYPE *) Cx_out ;

    // A_iso is true if A is iso-valued
    #define A_iso GB_A_ISO

    // grid-stride loop bookkeeping
    int tid = blockDim.x * blockIdx.x + threadIdx.x ;
    int nthreads = blockDim.x * gridDim.x ;

    #if ( GB_DEPENDS_ON_Y )
    // get the thunk value (of type GB_Y_TYPE)
    GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ;
    #endif
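
    // apply the operator to each entry of A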
    #if ( GB_A_IS_BITMAP || GB_A_IS_FULL )

        // bitmap/full case: grid-stride loop over all anz entries of A
        for (int64_t p = tid ; p < anz ; p += nthreads)
        {
            // skip entries that are not present (bitmap case only)
            if (!GBb_A (Ab, p)) { continue ; }
            #if ( GB_DEPENDS_ON_I )
            int64_t row_idx = p % A->vlen ;
            #endif
            #if ( GB_DEPENDS_ON_J )
            int64_t col_idx = p / A->vlen ;
            #endif
            GB_UNOP (Cx, p, Ax, p, A_iso, row_idx, col_idx, thunk_value) ;
        }
    #else

        // sparse/hypersparse case
        #if ( GB_DEPENDS_ON_J )

        const int64_t anvec = A->nvec ;

        // need to do the ek_slice method
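
        // The operator needs the column index j, which for a sparse or
        // hypersparse matrix must be recovered from the vector pointers Ap.
        // Each threadblock takes a chunk of up to chunk_size consecutive
        // entries starting at pfirst.  GB_cuda_ek_slice_setup finds the range
        // of vectors [kfirst..klast] that overlap this chunk, and
        // GB_cuda_ek_slice_entry then maps each entry pfirst+pdelta to its
        // containing vector k (roughly: an interpolation guess from the
        // precomputed slope, refined against Ap; see GB_cuda_ek_slice.cuh).
        // The column index is then j = Ah [k] if A is hypersparse, or j = k
        // otherwise.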
        for (int64_t pfirst = blockIdx.x << log2_chunk_size ;
                     pfirst < anz ;
                     pfirst += gridDim.x << log2_chunk_size)
        {
            // determine the vectors [kfirst..klast] that cover this chunk
            int64_t my_chunk_size, anvec_sub1, kfirst, klast ;
            float slope ;
            GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, chunk_size,
                &kfirst, &klast, &my_chunk_size, &anvec_sub1, &slope) ;

            // each thread in the block handles its share of the chunk
            for (int64_t pdelta = threadIdx.x ;
                         pdelta < my_chunk_size ;
                         pdelta += blockDim.x)
            {
                // locate the entry p_final in the chunk and its vector k
                int64_t p_final ;
                int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst,
                    Ap, anvec_sub1, kfirst, slope) ;
                int64_t col_idx = GBh_A (Ah, k) ;
                #if ( GB_DEPENDS_ON_I )
                int64_t row_idx = GBi_A (Ai, p_final, A->vlen) ;
                #endif
                GB_UNOP (Cx, p_final, Ax, p_final,
                    A_iso, row_idx, col_idx, thunk_value) ;
            }
        }
        #else

        // the op does not depend on j, so a simple grid-stride loop over the
        // entries of A suffices
        const int64_t avlen = A->vlen ;
        for (int64_t p = tid ; p < anz ; p += nthreads)
        {
            #if ( GB_DEPENDS_ON_I )
            int64_t row_idx = GBi_A (Ai, p, avlen) ;
            #endif
            GB_UNOP (Cx, p, Ax, p, A_iso, row_idx, /* col_idx */, thunk_value) ;
        }

        #endif
    #endif
}
extern "C" {
GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel) ;
}
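
// GB_jit_kernel: host entry point called by the JIT framework.  It launches
// GB_cuda_apply_unop_kernel on the given stream, with grid and block
// dimensions (gridsz, blocksz) supplied by the caller.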
GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel)
{
    GB_GET_CALLBACKS ;
    dim3 grid (gridsz) ;
    dim3 block (blocksz) ;
    // launch the device kernel on the given stream
    GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, ythunk, A) ;
    return (GrB_SUCCESS) ;
}