File: GB_jit_reduceNonZombiesWarp.cuh

//------------------------------------------------------------------------------
// template/GB_jit_reduceNonZombiesWarp.cuh
//------------------------------------------------------------------------------

// The GB_jit_reduceNonZombiesWarp CUDA kernel reduces an array g_idata of size
// n, of any type T_A, to an array g_odata of size grid.x, or to a single
// scalar when atomic_reduce is true.  Each threadblock (blockIdx.x) reduces
// its portion of g_idata to a single scalar, g_odata [blockIdx.x].

// Both the grid and block are 1D, so blockDim.x is the # of threads in a
// threadblock, and the # of threadblocks is grid.x.

// Let b = blockIdx.x, and let s = blockDim.x.  The grid is sized so that each
// threadblock handles about s*8 items of the input.  Because the kernel walks
// the input with a grid-stride loop, the items a given threadblock touches
// are strided across g_idata rather than contiguous.  Each threadblock's job
// is to reduce its share of the data to a scalar and write it to g_odata [b].
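
// For example (illustrative numbers only): with s = 128 and grid.x = g,
// threadblock b covers the 128-item slices g_idata [(b + k*g)*128 ...
// (b + k*g + 1)*128 - 1] for k = 0, 1, ..., clipped to the array bound n.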

#define GB_CUDA_KERNEL
#include <limits>
#include <type_traits>
#include "GB_cuda_kernel.h"
#include "GB_cuda_atomics.cuh"
#include <cstdint>
#include <cooperative_groups.h>

using namespace cooperative_groups;

template< typename T_A, int tile_sz, int rcode>
__inline__ __device__ 
T_A warp_ReduceSum( thread_block_tile<tile_sz> g, T_A val)
{
    // Each iteration halves the number of active lanes: lane i folds in the
    // partial sum shuffled down from lane i+offset.  The loop is fully
    // unrolled at compile time, since tile_sz is a template parameter.
    #pragma unroll
    for (int offset = tile_sz >> 1 ; offset > 0 ; offset >>= 1)
    {
        T_A fold = g.shfl_down( val, offset) ;
        GB_ADD( val, val, fold ) ;
    }
    return val; // note: only lane 0 holds the full reduction
}
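
// A worked trace (illustrative only, assuming tile_sz == 8 and GB_ADD is
// plain addition): starting from lane values [1,2,3,4,5,6,7,8], the offsets
// 4, 2, 1 leave the low lanes holding [6,8,10,12], then [16,20], then [36],
// so lane 0 ends with 36, the sum of all eight inputs.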


template<typename T_A, int warpSize, int rcode>
__inline__ __device__
T_A block_ReduceSum(thread_block g, T_A val)
{
  static __shared__ T_A shared [warpSize] ;    // one partial sum per warp
  int lane = threadIdx.x & (warpSize-1) ;      // lane id within this warp
  int wid  = threadIdx.x / warpSize ;          // warp id within this block
  thread_block_tile<warpSize> tile = tiled_partition<warpSize>( g );

  // TODO: figure out how to use the graphblas-specific INFINITY macro
  #ifndef INFINITY
  #define INFINITY std::numeric_limits<double>::infinity()
  #endif

  // each warp performs its own partial reduction
  val = warp_ReduceSum<T_A, warpSize, rcode>( tile, val);

  // lane 0 of each warp writes its partial sum to shared memory
  if (lane == 0)
  {
     shared [wid] = val ;
  }
  this_thread_block().sync();   // wait for all warps to finish

  GB_DECLARE_MONOID_IDENTITY (identity) ;

  // the first warp loads the per-warp partial sums; lanes beyond the number
  // of warps in this block load the monoid identity instead
  val = (threadIdx.x < (blockDim.x / warpSize)) ? shared [lane] : identity ;

  // final reduction within the first warp
  val = warp_ReduceSum<T_A, warpSize, rcode>( tile, val);

  return val;
}
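
// For example (illustrative, assuming blockDim.x == 256): the block splits
// into 256/32 = 8 warps, each reducing its 32 values to one entry of
// shared [0..7]; the first warp then loads those 8 partials (its other 24
// lanes load the identity) and reduces them to the block's single scalar.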


template< typename T_A, typename T_Z, int rcode, bool atomic_reduce = true>
__global__ void reduceNonZombiesWarp
(
    GrB_Matrix A,
    GrB_Scalar R,      // R->x has size grid.x if atomic_reduce is false, size 1 if true
    int64_t N,         // number of edges for sparse, size of x array for full/bitmap
    bool is_sparse
)
{

    // TODO: Figure out how to use graphblas-specific INFINITY macro
    #ifndef INFINITY
    #define INFINITY std::numeric_limits<T_A>::max()
    #endif

    // set thread ID
    int tid = threadIdx.x ;

    const int64_t *__restrict__ index = A->i;
    const T_A *__restrict__ g_idata = (T_A*) A->x;  // T_A or GB_A_TYPENAME
    T_Z *g_odata = (T_Z*) R->x;

    // each thread reduces its strided share of the input into sum,
    // initialized here to the monoid identity
    GB_DECLARE_MONOID_IDENTITY (sum) ;

    // grid-stride loop: use int64_t so the index cannot overflow when N
    // exceeds INT_MAX
    for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x ;
         i < N ;
         i += (int64_t) blockDim.x * gridDim.x)
    {
        if (is_sparse && index [i] < 0) continue ;   // skip zombie entries
        T_A fold = g_idata [i] ;
        GB_ADD( sum, sum, fold ) ;
    }
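
    // (In SuiteSparse:GraphBLAS, deleted entries become "zombies": their row
    // index in A->i is flipped to a negative value, which is why the test
    // index [i] < 0 above identifies and skips them.)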
    this_thread_block().sync(); 

    //--------------------------------------------------------------------------
    // reduce the blockDim.x per-thread sums to a single scalar for this block
    //--------------------------------------------------------------------------
    // this assumes blockDim.x is a multiple of 32; sum has the monoid type
    // T_Z, so the block reduction is instantiated on T_Z
    sum = block_ReduceSum< T_Z, 32, rcode >( this_thread_block(), sum) ;
    this_thread_block().sync(); 

    // write this block's result to global memory
    if (tid == 0)
    {
        // TODO: assuming a sum monoid for now (like the rest of the kernel)
        if (atomic_reduce)
        {
            // all blocks accumulate into the single scalar g_odata [0]
            atomic_add<T_Z>(g_odata, sum);
        }
        else
        {
            // one partial per block; the grid.x partials are reduced later
            g_odata [blockIdx.x] = sum ;
        }
    }
}
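
//------------------------------------------------------------------------------
// A minimal host-side launch sketch (hypothetical: the real launch is
// generated by the GraphBLAS CUDA JIT, and the types and rcode below are
// placeholders).  The grid is sized so each block covers about 8*blockDim.x
// items, and blockDim.x is a multiple of 32 as the kernel requires:
//
//      int blocksz = 128 ;
//      int gridsz  = (int) ((N + 8*blocksz - 1) / (8*blocksz)) ;
//      // atomic path: R->x holds one scalar, pre-set to the monoid identity
//      reduceNonZombiesWarp <float, float, /*rcode*/ 0, true>
//          <<<gridsz, blocksz>>> (A, R, N, is_sparse) ;
//      // non-atomic path: R->x holds gridsz partials, reduced in a 2nd pass
//      reduceNonZombiesWarp <float, float, /*rcode*/ 0, false>
//          <<<gridsz, blocksz>>> (A, R, N, is_sparse) ;
//------------------------------------------------------------------------------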