File: test_cuda_scan.cu

package info (click to toggle)
taskflow 3.9.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 45,948 kB
  • sloc: cpp: 39,058; xml: 35,572; python: 12,935; javascript: 1,732; makefile: 59; sh: 16
file content (140 lines) | stat: -rw-r--r-- 4,191 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN

#include <doctest.h>
#include <taskflow/taskflow.hpp>
#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/scan.hpp>

// ----------------------------------------------------------------------------
// cuda_scan
// ----------------------------------------------------------------------------

template <typename T>
void cuda_scan() {

  tf::Taskflow taskflow;
  tf::Executor executor;
  
  for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {

    taskflow.emplace([n](){
  
      auto data1 = tf::cuda_malloc_shared<int>(n);
      auto data2 = tf::cuda_malloc_shared<int>(n);
      auto scan1 = tf::cuda_malloc_shared<int>(n);
      auto scan2 = tf::cuda_malloc_shared<int>(n);

      // --------------------------------------------------------------------------
      // inclusive/exclusive scan
      // --------------------------------------------------------------------------

      // initialize the data
      std::iota(data1, data1 + n, 0);
      std::iota(data2, data2 + n, 0);
      
      tf::cudaStream stream;
      tf::cudaDefaultExecutionPolicy policy(stream);

      // declare the buffer
      void* buff;
      cudaMalloc(&buff, policy.scan_bufsz<int>(n));
      
      // create inclusive and exclusive scan tasks
      tf::cuda_inclusive_scan(policy, data1, data1+n, scan1, tf::cuda_plus<int>{}, buff);
      tf::cuda_exclusive_scan(policy, data2, data2+n, scan2, tf::cuda_plus<int>{}, buff);

      stream.synchronize();
      
      // inspect 
      for(int i=1; i<n; i++) {
        REQUIRE(scan1[i] == (scan1[i-1] + data1[i]));
        REQUIRE(scan2[i] == (scan2[i-1] + data2[i-1]));
      }
  
      // deallocate the data
      REQUIRE(cudaFree(data1) == cudaSuccess);
      REQUIRE(cudaFree(data2) == cudaSuccess);
      REQUIRE(cudaFree(scan1) == cudaSuccess);
      REQUIRE(cudaFree(scan2) == cudaSuccess);
      REQUIRE(cudaFree(buff)  == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cuda_scan.int" * doctest::timeout(300)) {
  cuda_scan<int>();
}

// ----------------------------------------------------------------------------
// transform_scan
// ----------------------------------------------------------------------------

template <typename T>
void cuda_transform_scan() {

  tf::Taskflow taskflow;
  tf::Executor executor;
  
  for(int n=0; n<=1234567; n = (n<=100) ? n+1 : n*2 + 1) {
  
    taskflow.emplace([n](){

      auto data1 = tf::cuda_malloc_shared<int>(n);
      auto data2 = tf::cuda_malloc_shared<int>(n);
      auto scan1 = tf::cuda_malloc_shared<int>(n);
      auto scan2 = tf::cuda_malloc_shared<int>(n);

      // --------------------------------------------------------------------------
      // inclusive/exclusive scan
      // --------------------------------------------------------------------------

      tf::cudaStream stream;
      tf::cudaDefaultExecutionPolicy policy(stream);

      // declare the buffer
      void* buff;
      cudaMalloc(&buff, policy.scan_bufsz<int>(n));
      
      // initialize the data
      std::iota(data1, data1 + n, 0);
      std::iota(data2, data2 + n, 0);
      
      // transform inclusive scan
      tf::cuda_transform_inclusive_scan(policy,
        data1, data1+n, scan1, tf::cuda_plus<int>{},
        [] __device__ (int a) { return a*10; },
        buff
      );

      // transform exclusive scan
      tf::cuda_transform_exclusive_scan(policy,
        data2, data2+n, scan2, tf::cuda_plus<int>{},
        [] __device__ (int a) { return a*11; },
        buff
      );
      
      stream.synchronize();
  
      // inspect 
      for(int i=1; i<n; i++) {
        REQUIRE(scan1[i] == scan1[i-1] + data1[i] * 10);
        REQUIRE(scan2[i] == scan2[i-1] + data2[i-1] * 11);
      }
  
      // deallocate the data
      REQUIRE(cudaFree(data1) == cudaSuccess);
      REQUIRE(cudaFree(data2) == cudaSuccess);
      REQUIRE(cudaFree(scan1) == cudaSuccess);
      REQUIRE(cudaFree(scan2) == cudaSuccess);
      REQUIRE(cudaFree(buff)  == cudaSuccess);
    });
  }

  executor.run(taskflow).wait();
}

TEST_CASE("cuda_transform_scan.int" * doctest::timeout(300)) {
  cuda_transform_scan<int>();
}