File: awkward_reduce_min_complex_tree_reduction.py

package info (click to toggle)
python-awkward 2.8.10-2
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 25,144 kB
  • sloc: python: 182,845; cpp: 33,828; sh: 432; makefile: 21; javascript: 8
file content (98 lines) | stat: -rw-r--r-- 4,338 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import cupy as cp

cuda_kernel = """
extern "C" {
    __global__ void awkward_reduce_min_complex_a(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
       int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

       if (thread_id < outlength) {
          toptr[thread_id * 2] = identity;
          toptr[thread_id * 2 + 1] = 0;
       }
    }
}
    
extern "C" {
    __global__ void awkward_reduce_min_complex_b(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
        extern __shared__ float shared[];

        int idx = threadIdx.x;
        int thread_id = blockIdx.x * blockDim.x + idx;

        if (thread_id < lenparents) {
            shared[idx * 2] = fromptr[thread_id * 2];
            shared[idx * 2 + 1] = fromptr[thread_id * 2 + 1];
        }
        __syncthreads();

        for (int stride = 1; stride < blockDim.x; stride *= 2) {
            float real = identity;
            float imag = 0;
            if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
                real = shared[(idx - stride) * 2];
                imag = shared[(idx - stride) * 2 + 1];
            }
            __syncthreads();
            if (shared[idx * 2] > real || shared[idx * 2] == real && shared[idx * 2 + 1] > imag) {
                shared[idx * 2] = real;
                shared[idx * 2 + 1] = imag;
            }
            __syncthreads();
        }

        if (thread_id < lenparents) {
            int parent = parents[thread_id];
            if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
                partial[(blockIdx.x * outlength + parent) * 2 ] = shared[idx * 2];
                partial[(blockIdx.x * outlength + parent) * 2 + 1] = shared[idx * 2 + 1];
            }
        }
    }
}

extern "C" {
    __global__ void awkward_reduce_min_complex_c(float* toptr, float* fromptr, int* parents, int lenparents, int outlength, int identity, int* partial) {
        int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

        if (thread_id < outlength) {
            float real = identity;
            float imag = 0;
            int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
            for (int i = 0; i < blocks; ++i) {
            if (real > partial[2 * (i * outlength + thread_id)] ||
                (partial[2 * (i * outlength + thread_id)] == real &&
                imag > partial[2 * (i * outlength + thread_id) + 1])) {
                    real = partial[(i * outlength + thread_id) * 2];
                    imag = partial[(i * outlength + thread_id) * 2 + 1];
                }
            }
            toptr[thread_id * 2] = real;
            toptr[thread_id * 2 + 1] = imag;
        }
    }
}
"""

parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
fromptr = cp.array([1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 0], dtype=cp.float32)
lenparents = len(parents)
outlength = int(cp.max(parents)) + 1
identity = 111111
toptr = cp.full(outlength * 2, identity, dtype=cp.float32)
block_size = 2
partial = cp.full((outlength * ((lenparents + block_size - 1) // block_size)), identity, dtype=cp.float32)
grid_size = (lenparents + block_size - 1) // block_size
shared_mem_size = block_size * cp.float32().nbytes

raw_module = cp.RawModule(code=cuda_kernel)

awkward_reduce_min_complex_a = raw_module.get_function('awkward_reduce_min_complex_a')
awkward_reduce_min_complex_b = raw_module.get_function('awkward_reduce_min_complex_b')
awkward_reduce_min_complex_c = raw_module.get_function('awkward_reduce_min_complex_c')

awkward_reduce_min_complex_a((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))
awkward_reduce_min_complex_b((grid_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial), shared_mem=shared_mem_size)
awkward_reduce_min_complex_c(((outlength + block_size - 1) // block_size,), (block_size,), (toptr, fromptr, parents, lenparents, outlength, identity, partial))

toptr_host = toptr[0::2] + 1j * toptr[1::2]
print("tree reduction toptr:", toptr_host)