File: cuda_transform.cu

package info (click to toggle)
taskflow 3.9.0%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 45,948 kB
  • sloc: cpp: 39,058; xml: 35,572; python: 12,935; javascript: 1,732; makefile: 59; sh: 16
file content (47 lines) | stat: -rw-r--r-- 1,009 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
// This program demonstrates how to perform a parallel transform
// using cudaFlow.

#include <cerrno>
#include <cstdlib>
#include <iostream>

#include <taskflow/cuda/cudaflow.hpp>
#include <taskflow/cuda/algorithm/transform.hpp>

// Entry point of the cudaFlow transform example.
//
// Usage: ./cuda_transform num_items
//
// Allocates two int buffers of num_items elements, fills the input with -1,
// runs a cudaFlow transform that adds 11 to every element, and verifies on
// the host that every output element equals 10.
//
// Returns EXIT_SUCCESS when every element is correct and EXIT_FAILURE when
// the arguments are invalid or the result is wrong.
int main(int argc, char* argv[]) {

  if(argc != 2) {
    std::cerr << "usage: ./cuda_transform num_items\n";
    std::exit(EXIT_FAILURE);
  }

  // Parse the item count strictly. A plain atoi would turn garbage into 0 and
  // let a negative value wrap around to an enormous size_t.
  errno = 0;
  char* end = nullptr;
  const long long parsed = std::strtoll(argv[1], &end, 10);
  if(end == argv[1] || *end != '\0' || errno == ERANGE || parsed <= 0) {
    std::cerr << "num_items must be a positive integer\n";
    std::exit(EXIT_FAILURE);
  }

  const size_t N = static_cast<size_t>(parsed);

  // Both buffers are written/read by the host loops below and are also handed
  // to the device-side transform.
  auto input  = tf::cuda_malloc_shared<int>(N);
  auto output = tf::cuda_malloc_shared<int>(N);

  // initialize the data
  for(size_t i=0; i<N; i++) {
    input [i] = -1;
    output[i] = 1;
  }

  // perform parallel transform
  tf::cudaFlow cudaflow;
  tf::cudaStream stream;

  // output[i] = input[i] + 11  (i.e. -1 + 11 == 10)
  cudaflow.transform(
    input, input + N, output, [] __device__ (int a) { return a + 11; }
  );

  cudaflow.run(stream);
  // wait for the run to finish before the host reads the output buffer
  stream.synchronize();

  // inspect the result
  bool correct = true;
  for(size_t i=0; i<N; i++) {
    if(output[i] != 10) {
      correct = false;
      break;
    }
  }

  // release the buffers on both the success and the failure path
  tf::cuda_free(input);
  tf::cuda_free(output);

  if(!correct) {
    // report and return instead of throwing: an exception escaping main
    // would end in std::terminate without a guaranteed diagnostic
    std::cerr << "incorrect result\n";
    return EXIT_FAILURE;
  }

  std::cout << "correct result\n";

  return EXIT_SUCCESS;
}