File: cuda_sort.cu

package info (click to toggle)
taskflow 3.9.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 45,948 kB
  • sloc: cpp: 39,058; xml: 35,572; python: 12,935; javascript: 1,732; makefile: 59; sh: 16
file content (67 lines) | stat: -rw-r--r-- 2,020 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// This program demonstrates how to perform parallel sort with CUDA.

#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/sort.hpp>

// Entry point: fills a GPU-accessible buffer and a host vector with the same
// N pseudo-random integers, sorts one with tf::cuda_sort and the other with
// std::sort, prints the wall-clock time of each, and compares the results.
//
// Usage: ./cuda_sort N      (N: positive decimal integer that fits in an int)
//
// Exits with EXIT_FAILURE on a malformed command line. Throws
// std::runtime_error when a CUDA runtime call reports an error or when the
// GPU-sorted keys differ from the CPU-sorted keys.
int main(int argc, char* argv[]) {

  if(argc != 2) {
    std::cerr << "usage: ./cuda_sort N\n";
    std::exit(EXIT_FAILURE);
  }

  // Parse N strictly. std::atoi maps garbage to 0 and, once stored in an
  // unsigned, turns negative input into a multi-gigabyte request; a zero-sized
  // managed allocation is rejected by the CUDA runtime as well.
  char* tail = nullptr;
  errno = 0;
  const long long parsed = std::strtoll(argv[1], &tail, 10);
  if(tail == argv[1] || *tail != '\0' || errno == ERANGE ||
     parsed <= 0 || parsed > std::numeric_limits<int>::max()) {
    std::cerr << "N must be a positive integer\n";
    std::exit(EXIT_FAILURE);
  }
  const unsigned N = static_cast<unsigned>(parsed);

  // Turns a failed CUDA runtime call into an exception carrying its message.
  auto check = [](cudaError_t status, const char* what) {
    if(status != cudaSuccess) {
      throw std::runtime_error(
        std::string(what) + ": " + cudaGetErrorString(status)
      );
    }
  };

  // gpu data (indexed directly from host code below)
  auto d_keys = tf::cuda_malloc_shared<int>(N);

  // cpu data
  std::vector<int> h_keys(N);

  // both containers receive the same keys so the results can be compared
  for(unsigned i=0; i<N; i++) {
    int k = rand() % 10000;
    d_keys[i] = k;
    h_keys[i] = k;
  }

  // --------------------------------------------------------------------------
  // Standard GPU sort
  // --------------------------------------------------------------------------

  auto p = tf::cudaDefaultExecutionPolicy{};

  // The timed region covers the scratch-buffer allocation, the sort itself,
  // and the wait for completion.
  auto beg = std::chrono::steady_clock::now();
  auto bufsz = tf::cuda_sort_buffer_size<decltype(p), int>(N);
  tf::cudaDeviceVector<std::byte> buf(bufsz);
  tf::cuda_sort(p, d_keys, d_keys+N, tf::cuda_less<int>{}, buf.data());
  // The sort is submitted through policy `p`, so waiting on some other,
  // separately created stream does not by itself guarantee completion.
  // Synchronize the whole device before stopping the clock or touching the
  // keys from the host, and surface any error raised during execution.
  check(cudaDeviceSynchronize(), "GPU sort failed");
  auto end = std::chrono::steady_clock::now();

  std::cout << "GPU sort: "
            << std::chrono::duration_cast<std::chrono::microseconds>(end-beg).count()
            << " us\n";

  // --------------------------------------------------------------------------
  // CPU sort
  // --------------------------------------------------------------------------
  beg = std::chrono::steady_clock::now();
  std::sort(h_keys.begin(), h_keys.end());
  end = std::chrono::steady_clock::now();

  std::cout << "CPU sort: "
            << std::chrono::duration_cast<std::chrono::microseconds>(end-beg).count()
            << " us\n";

  // --------------------------------------------------------------------------
  // verify the result
  // --------------------------------------------------------------------------

  const bool matches = std::equal(d_keys, d_keys + N, h_keys.begin());

  // Release the shared buffer before reporting so it is not leaked on either
  // outcome.
  check(cudaFree(d_keys), "failed to free GPU keys");

  if(!matches) {
    throw std::runtime_error("incorrect result");
  }

  std::cout << "correct result\n";
  return EXIT_SUCCESS;
}