File: async_reduce.cu

package info (click to toggle)
cccl 2.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 39,248 kB
  • sloc: cpp: 264,457; python: 6,421; sh: 2,762; perl: 460; makefile: 114; xml: 13
file content (78 lines) | stat: -rw-r--r-- 2,812 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#include <thrust/detail/config.h>

#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>

// This example demonstrates two ways to achieve algorithm invocations that are asynchronous with
// the calling thread.
//
// The first method wraps a call to thrust::reduce inside a __global__ function. Since __global__ function
// launches are asynchronous with the launching thread, this achieves asynchrony. The result of the reduction
// is stored to a pointer to CUDA global memory. The calling thread waits for the result of the reduction to
// be ready by synchronizing with the CUDA stream on which the __global__ function is launched.
// This device-side path is compiled only when THRUST_EXAMPLE_DEVICE_SIDE is defined; otherwise the
// reduction is invoked directly from the host.
//
// The second method uses the C++11 library function, std::async, to create concurrency. The lambda function
// given to std::async returns the result of thrust::reduce to a std::future. The calling thread can use the
// std::future to wait for the result of the reduction. This method requires a compiler which supports
// C++11-capable language and library constructs.

#ifdef THRUST_EXAMPLE_DEVICE_SIDE
// Kernel that runs an entire thrust::reduce from device code.
//
// Reduces the range [first, last) under the thrust::cuda::par policy, starting from `init` and
// combining elements with `binary_op`, then stores the reduced value through `result`.
//
// There is no thread-index guard: every thread that executes this kernel performs the full
// reduction and writes `result`, so it is meant to be launched with a single thread.
template <typename Iterator, typename T, typename BinaryOperation, typename Pointer>
__global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result)
{
  const T reduced = thrust::reduce(thrust::cuda::par, first, last, init, binary_op);
  *result         = reduced;
}
#endif

namespace
{
// Prints a diagnostic naming `what` and aborts if `status` is not cudaSuccess.
void check_cuda(cudaError_t status, const char* what)
{
  if (status != cudaSuccess)
  {
    std::fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    std::abort();
  }
}

// Prints a diagnostic naming `method` and aborts if `actual` differs from `expected`.
// Unlike assert, this check stays active when NDEBUG is defined.
void check_sum(unsigned int actual, size_t expected, const char* method)
{
  if (actual != expected)
  {
    std::fprintf(stderr, "%s: expected %zu, got %u\n", method, expected, actual);
    std::abort();
  }
}
} // namespace

// Sums a device vector of n ones in two different ways and verifies that each way yields n.
// Returns 0 on success; aborts with a message on stderr if a CUDA call fails or a sum is wrong.
int main()
{
  const size_t n = 1 << 20;
  thrust::device_vector<unsigned int> data(n, 1);
  thrust::device_vector<unsigned int> result(1, 0);

  // method 1: call thrust::reduce from an asynchronous CUDA kernel launch

  // create a CUDA stream
  cudaStream_t s;
  check_cuda(cudaStreamCreate(&s), "cudaStreamCreate");

  // The init value and the operator use unsigned int so that they match the element type of `data`.
#ifdef THRUST_EXAMPLE_DEVICE_SIDE
  // launch a CUDA kernel with only 1 thread on our stream
  reduce_kernel<<<1, 1, 0, s>>>(data.begin(), data.end(), 0u, thrust::plus<unsigned int>(), result.data());
  // a kernel launch returns no status itself; launch failures are reported through cudaGetLastError
  check_cuda(cudaGetLastError(), "reduce_kernel launch");
#else
  // host fallback: run the reduction on our stream directly from the calling thread
  result[0] = thrust::reduce(thrust::cuda::par.on(s), data.begin(), data.end(), 0u, thrust::plus<unsigned int>());
#endif

  // wait for the stream to finish; errors raised while the work was executing surface here
  check_cuda(cudaStreamSynchronize(s), "cudaStreamSynchronize");

  // our result should be ready; copy it to the host before checking it
  const unsigned int kernel_sum = result[0];
  check_sum(kernel_sum, n, "method 1 (stream)");

  check_cuda(cudaStreamDestroy(s), "cudaStreamDestroy");

  // reset the result
  result[0] = 0;

  // method 2: use std::async to create asynchrony
  // copy all the algorithm parameters
  auto begin        = data.begin();
  auto end          = data.end();
  unsigned int init = 0;
  auto binary_op    = thrust::plus<unsigned int>();

  // std::async captures the algorithm parameters by value
  // use std::launch::async to ensure the creation of a new thread
  std::future<unsigned int> future_result = std::async(std::launch::async, [=] {
    return thrust::reduce(begin, end, init, binary_op);
  });

  // Wait on the result outside of any assert so that the value is always retrieved and checked,
  // even in builds where NDEBUG is defined.
  const unsigned int async_sum = future_result.get();
  check_sum(async_sum, n, "method 2 (std::async)");

  return 0;
}