File: cuda_std_scan.dox

package info (click to toggle)
taskflow 3.9.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 45,948 kB
  • sloc: cpp: 39,058; xml: 35,572; python: 12,935; javascript: 1,732; makefile: 59; sh: 16
file content (195 lines) | stat: -rw-r--r-- 5,925 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
namespace tf {

/** @page CUDASTDScan Parallel Scan

%Taskflow provides standard template methods for scanning a range of items on 
a CUDA GPU.

@tableofcontents

@section CUDASTDParallelScanIncludeTheHeader Include the Header

You need to include the header file, `%taskflow/cuda/algorithm/scan.hpp`, 
for using the parallel-scan algorithm.

@code{.cpp}
#include <taskflow/cuda/algorithm/find.hpp>
@endcode

@section CUDASTDWhatIsAScanOperation What is a Scan Operation?

A parallel scan task
performs the cumulative sum, also known as <i>prefix sum</i> or @em scan, 
of the input range and writes the result to the output range.
Each element of the output range contains the
running total of all earlier elements using the given binary operator
for summation.

@image html images/scan.png

@section CUDASTDScanItems Scan a Range of Items

tf::cuda_inclusive_scan computes an inclusive prefix sum operation using
the given binary operator over a range of elements specified by <tt>[first, last)</tt>.
The term &quot;inclusive&quot; means that the i-th input element is included 
in the i-th sum.
The following code computes the inclusive prefix sum over an input array and
stores the result in an output array.

@code{.cpp}
const size_t N = 1000000;
int* input  = tf::cuda_malloc_shared<int>(N);  // input  vector
int* output = tf::cuda_malloc_shared<int>(N);  // output vector

// initializes the data
for(size_t i=0; i<N; input[i++] = rand()); 

// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);

// queries the required buffer size to scan N elements using the given policy
auto bytes  = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);

// computes inclusive scan over input and stores the result in output
tf::cuda_inclusive_scan(policy, 
  input, input + N, output, [] __device__ (int a, int b) {return a + b;}, buffer
);

// synchronizes and verifies the result
stream.synchronize();

for(size_t i=1; i<N; i++) {
  assert(output[i] == output[i-1] + input[i]);
}

// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode

The scan algorithm runs @em asynchronously through the stream specified
in the execution policy. You need to synchronize the stream to
obtain correct results.
Since the GPU scan algorithm may require extra buffer to store the 
temporary results, you need to provide a buffer of size at least larger or equal
to the value returned from <tt>tf::cudaDefaultExecutionPolicy::scan_bufsz</tt>.

@attention
You must keep the buffer alive before the scan call completes.

On the other hand, tf::cuda_exclusive_scan computes an exclusive prefix sum operation.
The term &quot;exclusive&quot; means that the i-th input element is @em NOT included 
in the i-th sum.

@code{.cpp}
// computes exclusive scan over input and stores the result in output
tf::cuda_exclusive_scan(policy, 
  input, input + N, output, [] __device__ (int a, int b) {return a + b;}, buffer
);

// synchronizes the execution and verifies the result
stream.synchronize();
for(size_t i=1; i<N; i++) {
  assert(output[i] == output[i-1] + input[i-1]);
}
@endcode

@section CUDASTDScanTransformedItems Scan a Range of Transformed Items

tf::cuda_transform_inclusive_scan transforms each item in the range <tt>[first, last)</tt>
and computes an inclusive prefix sum over these transformed items.
The following code multiplies each item by 10 and then compute the inclusive prefix sum
over 1000000 transformed items.

@code{.cpp}
const size_t N = 1000000;
int* input  = tf::cuda_malloc_shared<int>(N);  // input  vector
int* output = tf::cuda_malloc_shared<int>(N);  // output vector

// initializes the data
for(size_t i=0; i<N; input[i++] = rand()); 

// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);

// queries the required buffer size to scan N elements using the given policy
auto bytes  = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);

// computes inclusive scan over transformed input and stores the result in output
tf::cuda_transform_inclusive_scan(policy, 
  input, input + N, output, 
  [] __device__ (int a, int b) { return a + b; },  // binary scan operator
  [] __device__ (int a) { return a*10; },          // unary transform operator
  buffer
);

// wait for the scan to complete
stream.synchronize();

// verifies the result
for(size_t i=1; i<N; i++) {
  assert(output[i] == output[i-1] + input[i] * 10);
}

// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode

Similarly, tf::cuda_transform_exclusive_scan performs an exclusive prefix sum
over a range of transformed items.
The following code computes the exclusive prefix sum over 1000000 transformed items
each multiplied by 10.

@code{.cpp}
const size_t N = 1000000;
int* input  = tf::cuda_malloc_shared<int>(N);  // input  vector
int* output = tf::cuda_malloc_shared<int>(N);  // output vector

// initializes the data
for(size_t i=0; i<N; input[i++] = rand()); 

// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);

// queries the required buffer size to scan N elements using the given policy
auto bytes  = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);

// computes exclusive scan over transformed input and stores the result in output
tf::cuda_transform_exclusive_scan(policy, 
  input, input + N, output, 
  [] __device__ (int a, int b) { return a + b; },  // binary scan operator
  [] __device__ (int a) { return a*10; },          // unary transform operator
  buffer
);

// wait for the scan to complete
stream.synchronize();

// verifies the result
for(size_t i=1; i<N; i++) {
  assert(output[i] == output[i-1] + input[i-1] * 10);
}

// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode

*/
}