// This example shows how to use syclFlow to multiply two 2D matrices.
#include <taskflow/taskflow.hpp>
#include <taskflow/sycl/syclflow.hpp>
// Matrix multiplication on the GPU, driven by a tf::syclFlow.
//
// A is M x N with every element set to M+N, B is N x K with every element
// set to N+K. The M x K product is computed on the device and returned as a
// host vector indexed as [row * K + col].
auto gpu(int M, int N, int K) {

  // host-side matrices and the device pointers that mirror them
  std::vector<int> ha, hb, hc;
  int *da, *db, *dc;

  tf::Executor executor;
  tf::Taskflow taskflow("MatrixMultiplication");
  sycl::queue queue;

  // host + device storage for a
  auto alloc_a = taskflow.emplace([&](){
    ha.resize(M*N, M+N);
    da = sycl::malloc_device<int>(M*N, queue);
  });
  alloc_a.name("allocate_a");

  // host + device storage for b
  auto alloc_b = taskflow.emplace([&](){
    hb.resize(N*K, N+K);
    db = sycl::malloc_device<int>(N*K, queue);
  });
  alloc_b.name("allocate_b");

  // host + device storage for c (the result)
  auto alloc_c = taskflow.emplace([&](){
    hc.resize(M*K);
    dc = sycl::malloc_device<int>(M*K, queue);
  });
  alloc_c.name("allocate_c");

  // the syclFlow: upload a and b, run the kernel, download c
  auto sycl_task = taskflow.emplace_on([&](tf::syclFlow& sf){

    // host-to-device transfers of the two inputs
    auto upload_a = sf.copy(da, ha.data(), M*N);
    upload_a.name("H2D_a");

    auto upload_b = sf.copy(db, hb.data(), N*K);
    upload_b.name("H2D_b");

    // device-to-host transfer of the result
    auto download_c = sf.copy(hc.data(), dc, M*K);
    download_c.name("D2H_c");

    // the global range is rounded up to a multiple of the 16x16 work-group
    const auto pad16 = [](int extent) {
      return (extent % 16 == 0) ? extent : (extent + 16 - extent % 16);
    };
    const auto rows = pad16(M);
    const auto cols = pad16(K);

    auto matmul = sf.parallel_for(
      sycl::nd_range<2>{sycl::range<2>(rows, cols), sycl::range<2>(16, 16)},
      [=](sycl::nd_item<2> item) {
        int r = item.get_global_id(0);
        int c = item.get_global_id(1);

        // work-items in the padded region lie outside c and write nothing
        if(r >= M || c >= K) {
          return;
        }

        int acc = 0;
        for(int n = 0; n < N; n++) {
          acc += da[r * N + n] * db[n * K + c];
        }
        dc[r * K + c] = acc;
      }
    );
    matmul.name("matmul");

    // Alternative noted by the original author: launch over a plain
    // sycl::range<2>(M, K) with a sycl::id<2> kernel argument and let the
    // runtime pick the work-group partition; the note says that form is
    // less efficient.

    // kernel runs after both uploads and before the download
    matmul.succeed(upload_a, upload_b);
    matmul.precede(download_c);

  }, queue);
  sycl_task.name("syclFlow");

  // release the three device allocations
  auto release = taskflow.emplace([&](){
    sycl::free(da, queue);
    sycl::free(db, queue);
    sycl::free(dc, queue);
  });
  release.name("free");

  // allocations -> syclFlow -> free
  sycl_task.succeed(alloc_a, alloc_b, alloc_c);
  sycl_task.precede(release);

  executor.run(taskflow).wait();

  // You may uncomment the line below to dump the task graph
  //taskflow.dump(std::cout);

  return hc;
}
// Matrix multiplication on the CPU.
//
// Builds the same inputs as the GPU version (a is M x N filled with M+N,
// b is N x K filled with N+K) and computes c = a * b, with one
// for_each_index iteration per row of c. Returns c, indexed as [m * K + k].
auto cpu(int M, int N, int K) {

  std::vector<int> a, b, c;

  tf::Executor executor;
  tf::Taskflow taskflow;

  // three independent tasks that size and fill the matrices
  auto init_a = taskflow.emplace([&](){
    a.resize(M*N, M+N);
  });
  init_a.name("allocate_a");

  auto init_b = taskflow.emplace([&](){
    b.resize(N*K, N+K);
  });
  init_b.name("allocate_b");

  auto init_c = taskflow.emplace([&](){
    c.resize(M*K, 0);
  });
  init_c.name("allocate_c");

  // each index m fills row m of c
  auto row_loop = taskflow.for_each_index(0, M, 1, [&] (int m) {
    for(int k=0; k<K; k++) {
      // c was zero-filled by init_c, so accumulate straight into the cell
      int& cell = c[m*K+k];
      for(int n=0; n<N; n++) {
        cell += (a[m*N+n]*b[n*K+k]);
      }
    }
  });

  // the multiplication may only start once all three matrices exist
  row_loop.succeed(init_a, init_b, init_c);

  //taskflow.dump(std::cout);

  executor.run(taskflow).wait();

  return c;
}
// Function: main
//
// Usage: matrix-multiplication M N K
//
// Multiplies an M x N matrix by an N x K matrix once with gpu() and once
// with cpu(), prints the wall-clock time of each run in milliseconds, and
// reports the accumulated absolute difference between the two results.
// Exits with EXIT_FAILURE when the argument count is wrong or any dimension
// is not a positive integer.
int main(int argc, char *argv[]) {

  if(argc != 4) {
    std::cerr << "usage: matrix-multiplication M N K\n";
    std::exit(EXIT_FAILURE);
  }

  int M = std::atoi(argv[1]);
  int N = std::atoi(argv[2]);
  int K = std::atoi(argv[3]);

  // std::atoi yields 0 for non-numeric text, and a negative dimension would
  // be converted to a huge unsigned size by the vector resizes in gpu() and
  // cpu(); reject both before doing any work.
  if(M <= 0 || N <= 0 || K <= 0) {
    std::cerr << "error: M, N, and K must be positive integers\n"
              << "usage: matrix-multiplication M N K\n";
    std::exit(EXIT_FAILURE);
  }

  std::cout << "matrix A: " << M << 'x' << N << '\n'
            << "matrix B: " << N << 'x' << K << '\n'
            << "matrix C: " << M << 'x' << K << '\n';

  // matrix multiplication using gpu
  // (flush so the progress message is visible while the run is in flight)
  std::cout << "running gpu matrix multiplication ... " << std::flush;
  auto gbeg = std::chrono::steady_clock::now();
  auto gres = gpu(M, N, K);
  auto gend = std::chrono::steady_clock::now();
  std::cout << "completed with "
            << std::chrono::duration_cast<std::chrono::milliseconds>(gend-gbeg).count()
            << " ms\n";

  // matrix multiplication using cpu
  std::cout << "running cpu matrix multiplication ... " << std::flush;
  auto cbeg = std::chrono::steady_clock::now();
  auto cres = cpu(M, N, K);
  auto cend = std::chrono::steady_clock::now();
  std::cout << "completed with "
            << std::chrono::duration_cast<std::chrono::milliseconds>(cend-cbeg).count()
            << " ms\n";

  // verify the result
  int64_t error = 0;
  std::cout << "verifying results ... ";
  for(int i=0; i<M*K; ++i) {
    // widen before subtracting so a large mismatch cannot overflow int
    const int64_t diff = static_cast<int64_t>(gres[i]) - static_cast<int64_t>(cres[i]);
    error += (diff < 0) ? -diff : diff;
  }
  std::cout << "abs-error=" << error << '\n';

  return 0;
}
|