File: standard_parallel_algorithms.h

package info (click to toggle)
python-awkward 2.6.5-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 23,088 kB
  • sloc: python: 148,689; cpp: 33,562; sh: 432; makefile: 21; javascript: 8
file content (137 lines) | stat: -rw-r--r-- 3,543 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>

#pragma once

static void
HandleError(cudaError_t err, const char* file, int line) {
  if (err != cudaSuccess) {
    int aa = 0;
    printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    scanf("%d", &aa);
    exit(EXIT_FAILURE);
  }
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))

inline dim3
threads(int64_t length) {
  if (length > 1024) {
    return dim3(1024);
  }
  return dim3(length);
}

inline dim3
blocks(int64_t length) {
  if (length > 1024) {
    return dim3(ceil((length) / 1024.0));
  }
  return dim3(1);
}

inline dim3
threads_2d(int64_t length_x, int64_t length_y) {
  if (length_x > 32 && length_y > 32) {
    return dim3(32, 32);
  } else if (length_x > 32 && length_y <= 32) {
    return dim3(32, length_y);
  } else if (length_x <= 32 && length_y > 32) {
    return dim3(length_x, 32);
  } else {
    return dim3(length_x, length_y);
  }
}

inline dim3
blocks_2d(int64_t length_x, int64_t length_y) {
  if (length_x > 32 && length_y > 32) {
    return dim3(ceil(length_x / 32.0), ceil(length_y / 32.0));
  } else if (length_x > 32 && length_y <= 32) {
    return dim3(ceil(length_x / 32.0), 1);
  } else if (length_x <= 32 && length_y > 32) {
    return dim3(1, ceil(length_y / 32.0));
  } else {
    return dim3(1, 1);
  }
}

template <typename T>
__global__ void
exclusive_scan_kernel(T* d_in,
                      T* d_out,
                      T* d_final,
                      int64_t curr_step,
                      int64_t total_steps,
                      int64_t stride,
                      bool in_out_flag,
                      int64_t length) {
  int64_t block_id =
      blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;

  int64_t thread_id = block_id * blockDim.x + threadIdx.x;
  int64_t sum = 0;

  if (thread_id < length) {
    if (!in_out_flag) {
      if (thread_id < stride) {
        sum = d_out[thread_id];
        d_in[thread_id] = sum;
      } else {
        sum = d_out[thread_id] + d_out[thread_id - stride];
        d_in[thread_id] = sum;
      }
    } else {
      if (thread_id < stride) {
        sum = d_in[thread_id];
        d_out[thread_id] = sum;
      } else {
        sum = d_in[thread_id] + d_in[thread_id - stride];
        d_out[thread_id] = sum;
      }
    }

    if (curr_step == total_steps) {
      d_final[thread_id] = sum;
    }
  }
}

template <typename T, typename F>
__global__ void
copy(T* out, F* in, int64_t length) {
  int64_t block_id =
      blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;

  int64_t thread_id = block_id * blockDim.x + threadIdx.x;

  if (thread_id < length) {
    out[thread_id] = (T)(in[thread_id]);
  }
}

template <typename T, typename F>
void
exclusive_scan(T* out, const F* in, int64_t length) {
  T* d_in;
  T* d_out;

  dim3 blocks_per_grid = blocks(length);
  dim3 threads_per_block = threads(length);

  HANDLE_ERROR(cudaMalloc((void**)&d_in, length * sizeof(T)));
  HANDLE_ERROR(cudaMalloc((void**)&d_out, length * sizeof(T)));

  copy<<<blocks_per_grid, threads_per_block>>>(d_in, in, length);

  int64_t stride = 1;
  int64_t total_steps = ceil(log2(static_cast<float>(length)));

  for (int64_t curr_step = 1; curr_step <= total_steps; curr_step++) {
    bool in_out_flag = (curr_step % 2) != 0;
    exclusive_scan_kernel<<<blocks_per_grid, threads_per_block>>>(
        d_in, d_out, out, curr_step, total_steps, stride, in_out_flag, length);
    stride = stride * 2;
  }
}