namespace tf {
/** @page CUDASTDScan Parallel Scan
%Taskflow provides standard template methods for scanning a range of items on
a CUDA GPU.
@tableofcontents
@section CUDASTDParallelScanIncludeTheHeader Include the Header
You need to include the header file, `%taskflow/cuda/algorithm/scan.hpp`,
for using the parallel-scan algorithm.
@code{.cpp}
#include <taskflow/cuda/algorithm/scan.hpp>
@endcode
@section CUDASTDWhatIsAScanOperation What is a Scan Operation?
A parallel scan task
performs the cumulative sum, also known as <i>prefix sum</i> or @em scan,
of the input range and writes the result to the output range.
Each element of the output range contains the running total of all earlier
elements, accumulated with the given binary operator.
@image html images/scan.png
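For example, an inclusive scan of <tt>{1, 2, 3, 4}</tt> under the plus operator
yields <tt>{1, 3, 6, 10}</tt>, whereas an exclusive scan yields <tt>{0, 1, 3, 6}</tt>.
The following host-side snippet uses the C++ standard library (not %Taskflow)
purely to illustrate the two flavors of scan:
@code{.cpp}
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in  = {1, 2, 3, 4};
  std::vector<int> out(in.size());
  // inclusive scan: out[i] is the sum of in[0..i]
  std::inclusive_scan(in.begin(), in.end(), out.begin());
  assert((out == std::vector<int>{1, 3, 6, 10}));
  // exclusive scan: out[i] is the sum of in[0..i-1], starting from the initial value 0
  std::exclusive_scan(in.begin(), in.end(), out.begin(), 0);
  assert((out == std::vector<int>{0, 1, 3, 6}));
}
@endcode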
@section CUDASTDScanItems Scan a Range of Items
tf::cuda_inclusive_scan computes an inclusive prefix sum operation using
the given binary operator over a range of elements specified by <tt>[first, last)</tt>.
The term "inclusive" means that the i-th input element is included
in the i-th sum.
The following code computes the inclusive prefix sum over an input array and
stores the result in an output array.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
for(size_t i=0; i<N; i++) { input[i] = rand(); }
// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);
// queries the required buffer size to scan N elements using the given policy
auto bytes = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);
// computes inclusive scan over input and stores the result in output
tf::cuda_inclusive_scan(policy,
input, input + N, output, [] __device__ (int a, int b) {return a + b;}, buffer
);
// synchronizes and verifies the result
stream.synchronize();
for(size_t i=1; i<N; i++) {
assert(output[i] == output[i-1] + input[i]);
}
// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode
The scan algorithm runs @em asynchronously through the stream specified
in the execution policy. You need to synchronize the stream to
obtain correct results.
Since the GPU scan algorithm may require an extra buffer to store temporary
results, you need to provide a buffer whose size is at least equal to
the value returned by <tt>tf::cudaDefaultExecutionPolicy::scan_bufsz</tt>.
@attention
You must keep the buffer alive until the scan call completes.
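For example, the following sketch, adapted from the inclusive-scan example above,
makes the required ordering explicit: the temporary buffer is released only after
the stream has been synchronized and the scan is known to have completed.
@code{.cpp}
// query the required buffer size and allocate the temporary device buffer
auto bytes  = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);
// launch the scan asynchronously; the buffer must stay valid while the scan runs
tf::cuda_inclusive_scan(policy,
  input, input + N, output, [] __device__ (int a, int b) { return a + b; }, buffer
);
// wait for the scan to finish before releasing the buffer
stream.synchronize();
cudaFree(buffer);
@endcode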
On the other hand, tf::cuda_exclusive_scan computes an exclusive prefix sum operation.
The term "exclusive" means that the i-th input element is @em NOT included
in the i-th sum.
@code{.cpp}
// computes exclusive scan over input and stores the result in output
tf::cuda_exclusive_scan(policy,
input, input + N, output, [] __device__ (int a, int b) {return a + b;}, buffer
);
// synchronizes the execution and verifies the result
stream.synchronize();
for(size_t i=1; i<N; i++) {
assert(output[i] == output[i-1] + input[i-1]);
}
@endcode
@section CUDASTDScanTransformedItems Scan a Range of Transformed Items
tf::cuda_transform_inclusive_scan transforms each item in the range <tt>[first, last)</tt>
and computes an inclusive prefix sum over these transformed items.
The following code multiplies each item by 10 and then computes the inclusive prefix
sum over 1000000 transformed items.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
for(size_t i=0; i<N; i++) { input[i] = rand(); }
// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);
// queries the required buffer size to scan N elements using the given policy
auto bytes = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);
// computes inclusive scan over transformed input and stores the result in output
tf::cuda_transform_inclusive_scan(policy,
input, input + N, output,
[] __device__ (int a, int b) { return a + b; }, // binary scan operator
[] __device__ (int a) { return a*10; }, // unary transform operator
buffer
);
// wait for the scan to complete
stream.synchronize();
// verifies the result
for(size_t i=1; i<N; i++) {
assert(output[i] == output[i-1] + input[i] * 10);
}
// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode
Similarly, tf::cuda_transform_exclusive_scan performs an exclusive prefix sum
over a range of transformed items.
The following code multiplies each item by 10 and then computes the exclusive prefix
sum over 1000000 transformed items.
@code{.cpp}
const size_t N = 1000000;
int* input = tf::cuda_malloc_shared<int>(N); // input vector
int* output = tf::cuda_malloc_shared<int>(N); // output vector
// initializes the data
for(size_t i=0; i<N; i++) { input[i] = rand(); }
// create an execution policy
tf::cudaStream stream;
tf::cudaDefaultExecutionPolicy policy(stream);
// queries the required buffer size to scan N elements using the given policy
auto bytes = policy.scan_bufsz<int>(N);
auto buffer = tf::cuda_malloc_device<std::byte>(bytes);
// computes exclusive scan over transformed input and stores the result in output
tf::cuda_transform_exclusive_scan(policy,
input, input + N, output,
[] __device__ (int a, int b) { return a + b; }, // binary scan operator
[] __device__ (int a) { return a*10; }, // unary transform operator
buffer
);
// wait for the scan to complete
stream.synchronize();
// verifies the result
for(size_t i=1; i<N; i++) {
assert(output[i] == output[i-1] + input[i-1] * 10);
}
// delete the device memory
cudaFree(input);
cudaFree(output);
cudaFree(buffer);
@endcode
*/
}