File: reduce-kernel.cpp

package info (click to toggle)
gpaw 25.7.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 18,888 kB
  • sloc: python: 174,804; ansic: 17,564; cpp: 5,668; sh: 972; csh: 139; makefile: 45
file content (117 lines) | stat: -rw-r--r-- 4,205 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
// Device-global block-retirement counter for the "last block" pattern used at
// the end of INNAME(reduce_kernel): each y-block atomically increments it so
// the final surviving block can gather all per-vector results. Zero-initialized
// at load time; the kernel resets it to 0 after the final gather.
__device__ unsigned int INNAME(retirementCount) = {0};

/* Two-operand reduction kernel: combines g_idata1[i] and g_idata2[i] with
 * INFUNC and sums the results per vector.
 *
 * Launch layout (inferred from the indexing below — confirm against caller):
 *   gridDim.x  - number of partial-sum blocks along the reduced dimension
 *   gridDim.y  - number of independent vectors (presumably == nvec)
 *   blockDim.x - assumed equal to REDUCE_THREADS (sdata is indexed up to
 *                REDUCE_THREADS elements) -- TODO confirm at launch site
 *   dynamic shared memory: REDUCE_THREADS * sizeof(Tgpu)
 *
 * Arguments:
 *   g_idata1, g_idata2 - inputs, each vector starting at block_in * i_vec
 *   g_odata            - per-block partial sums, block_out stride per vector
 *   results            - final per-vector results (written only when
 *                        gridDim.x == 1, by the last block to retire)
 *   n                  - number of elements to reduce per vector
 *
 * NOTE: the tid < 32 tail keeps the classic volatile-shared-memory form but
 * adds __syncwarp() between dependent steps; since Volta (independent thread
 * scheduling) implicit warp-synchronous execution is no longer guaranteed and
 * the unsynchronized version is a data race.  Requires CUDA 9+ / a __syncwarp
 * equivalent on the HIP side.
 */
__global__ void INNAME(reduce_kernel)(
        const Tgpu *g_idata1, const Tgpu *g_idata2, Tgpu *g_odata,
        Tgpu *results, unsigned int n, unsigned int block_in,
        int block_out, int nvec)
{
    extern __shared__ Tgpu Zgpu(sdata)[];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int gridSize = REDUCE_THREADS * 2 * gridDim.x;

    unsigned int i_vec = blockIdx.y;
    unsigned int i = blockIdx.x * (REDUCE_THREADS * 2) + threadIdx.x;
    Tgpu mySum = MAKED(0);

    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n) {
        IADD(mySum, INFUNC(g_idata1[i + block_in * i_vec],
                           g_idata2[i + block_in * i_vec]));
        // ensure we don't read out of bounds
        if (i + REDUCE_THREADS < n) {
            IADD(mySum,
                 INFUNC(g_idata1[i + block_in * i_vec + REDUCE_THREADS],
                        g_idata2[i + block_in * i_vec + REDUCE_THREADS]));
        }
        i += gridSize;
    }
    Zgpu(sdata)[tid] = mySum;
    __syncthreads();

    // tree reduction in shared memory; REDUCE_THREADS is a compile-time
    // constant, so these branches are uniform across the block
    if (REDUCE_THREADS >= 512) {
        if (tid < 256) {
            Zgpu(sdata)[tid] = mySum = ADD(mySum, Zgpu(sdata)[tid + 256]);
        }
        __syncthreads();
    }
    if (REDUCE_THREADS >= 256) {
        if (tid < 128) {
            Zgpu(sdata)[tid] = mySum = ADD(mySum, Zgpu(sdata)[tid + 128]);
        }
        __syncthreads();
    }
    if (REDUCE_THREADS >= 128) {
        if (tid <  64) {
            Zgpu(sdata)[tid] = mySum = ADD(mySum, Zgpu(sdata)[tid + 64]);
        }
        __syncthreads();
    }

    // final warp: all 32 lanes execute every step; __syncwarp() makes each
    // lane's shared-memory write visible before the next step's read (the
    // volatile qualifier alone is not sufficient on Volta+)
    if (tid < 32) {
        volatile Tgpu *smem = Zgpu(sdata);
#ifdef GPU_USE_COMPLEX
        if (REDUCE_THREADS >= 64) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 32].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 32].y;
        }
        __syncwarp();
        if (REDUCE_THREADS >= 32) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 16].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 16].y;
        }
        __syncwarp();
        if (REDUCE_THREADS >= 16) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 8].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 8].y;
        }
        __syncwarp();
        if (REDUCE_THREADS >= 8) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 4].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 4].y;
        }
        __syncwarp();
        if (REDUCE_THREADS >= 4) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 2].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 2].y;
        }
        __syncwarp();
        if (REDUCE_THREADS >= 2) {
            smem[tid].x = mySum.x = mySum.x + smem[tid + 1].x;
            smem[tid].y = mySum.y = mySum.y + smem[tid + 1].y;
        }
#else
        if (REDUCE_THREADS >= 64)
            smem[tid] = mySum = ADD(mySum, smem[tid + 32]);
        __syncwarp();
        if (REDUCE_THREADS >= 32)
            smem[tid] = mySum = ADD(mySum, smem[tid + 16]);
        __syncwarp();
        if (REDUCE_THREADS >= 16)
            smem[tid] = mySum = ADD(mySum, smem[tid + 8]);
        __syncwarp();
        if (REDUCE_THREADS >= 8)
            smem[tid] = mySum = ADD(mySum, smem[tid + 4]);
        __syncwarp();
        if (REDUCE_THREADS >= 4)
            smem[tid] = mySum = ADD(mySum, smem[tid + 2]);
        __syncwarp();
        if (REDUCE_THREADS >= 2)
            smem[tid] = mySum = ADD(mySum, smem[tid + 1]);
#endif
    }

    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x + block_out * i_vec] = Zgpu(sdata)[0];

    // when there is one block per vector, g_odata already holds the final
    // per-vector sums; the last y-block to retire gathers them into results
    // (threadFenceReduction pattern: fence publishes our g_odata write before
    // the atomic ticket makes it observable)
    if (gridDim.x == 1) {
        __shared__ bool amLast;
        __threadfence();
        if (tid == 0) {
            // atomicInc wraps at gridDim.y, leaving the counter ready for
            // reuse; ticket == gridDim.y - 1 identifies the last block
            unsigned int ticket = atomicInc(&INNAME(retirementCount),
                                            gridDim.y);
            amLast = (ticket == gridDim.y - 1);
        }
        __syncthreads();
        if (amLast) {
            for (int k = tid; k < nvec; k += blockDim.x)
                results[k] = g_odata[k * block_out];
            // benign write-write race: every thread stores the same zero
            INNAME(retirementCount) = 0;
        }
    }
}