1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
|
// Copyright John Maddock 2016.
// Copyright Matt Borland 2024.
// Use, modification and distribution are subject to the
// Boost Software License, Version 1.0. (See accompanying file
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#include <iostream>
#include <iomanip>
#include <vector>
#include <boost/math/special_functions.hpp>
#include "cuda_managed_ptr.hpp"
#include "stopwatch.hpp"
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef double float_type;
/**
* CUDA Kernel Device code
*
*/
__global__ void cuda_test(const float_type *in1, const float_type *in2, float_type *out, int numElements)
{
using std::cos;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
out[i] = boost::math::heuman_lambda(in1[i], in2[i]);
}
}
/**
* Host main routine
*/
int main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size
int numElements = 50000;
std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
// Allocate the managed input vector A
cuda_managed_ptr<float_type> input_vector1(numElements);
// Allocate the managed input vector B
cuda_managed_ptr<float_type> input_vector2(numElements);
// Allocate the managed output vector C
cuda_managed_ptr<float_type> output_vector(numElements);
// Initialize the input vectors
for (int i = 0; i < numElements; ++i)
{
input_vector1[i] = rand()/(float_type)RAND_MAX;
input_vector2[i] = rand()/(float_type)RAND_MAX;
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
watch w;
cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector1.get(), input_vector2.get(), output_vector.get(), numElements);
cudaDeviceSynchronize();
std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
err = cudaGetLastError();
if (err != cudaSuccess)
{
std::cerr << "Failed to launch CUDA kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
return EXIT_FAILURE;
}
// Verify that the result vector is correct
std::vector<float_type> results;
results.reserve(numElements);
w.reset();
for(int i = 0; i < numElements; ++i)
results.push_back(boost::math::heuman_lambda(input_vector1[i], input_vector2[i]));
double t = w.elapsed();
// check the results
int fail_counter = 0;
for(int i = 0; i < numElements; ++i)
{
if (std::isfinite(results[i]))
{
if (boost::math::epsilon_difference(output_vector[i], results[i]) > 200)
{
std::cerr << "Result verification failed at element " << i << "!\n"
<< "Device: " << output_vector[i] << '\n'
<< " Host: " << results[i] << '\n'
<< " Eps: " << boost::math::epsilon_difference(output_vector[i], results[i]) << std::endl;
fail_counter++;
if (fail_counter > 100)
{
break;
}
}
}
}
if (fail_counter > 0)
{
return EXIT_FAILURE;
}
std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
std::cout << "Done\n";
return 0;
}
|