/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2009-2022 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
* Copyright (C) 2010 Mehdi Juhoor
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
/*
* This example shows a simple implementation of a blocked matrix
* multiplication. Note that this is NOT intended to be an efficient
* implementation of sgemm! In this example, we show:
* - how to declare dense matrices (starpu_matrix_data_register)
* - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
* - how to use filters to partition the matrices into blocks
* (starpu_data_partition and starpu_data_map_filters)
* - how to unpartition data (starpu_data_unpartition) and how to stop
* monitoring data (starpu_data_unregister)
* - how to manipulate subsets of data (starpu_data_get_sub_data)
* - how to construct an autocalibrated performance model (starpu_perfmodel)
* - how to submit asynchronous tasks
*/
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <signal.h>
#include <starpu.h>
#define THREADS_PER_BLOCK 256
/*
* That program should compute C = A * B
*
* A of size (z,y)
* B of size (x,z)
* C of size (x,y)
|---------------|
z | B |
|---------------|
z x
|----| |---------------|
| | | |
| | | |
| A | y | C |
| | | |
| | | |
|----| |---------------|
* Note: we use FORTRAN ordering.
*/
/*
* The codelet is passed 3 matrices, the "descr" union-type field gives a
* description of the layout of those 3 matrices in the local memory (ie. RAM
* in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
* registered data with the "matrix" data interface, we use the matrix macros.
*/
/* Kernel computing one element of the C block per thread.
 *
 * Thread mapping: a 1-D grid of 1-D blocks.  The flat thread id is
 * decomposed as id = i + j*nxC, so that thread id computes element
 * C(i, j) in FORTRAN (column-major) order.
 *
 * nxC, nyC : dimensions of the C block (rows / columns in FORTRAN order)
 * nyA      : shared dimension of the product (number of columns of A)
 * ldA/ldB/ldC : leading dimensions of the local copies (these may differ
 *               from the original matrices when filters were applied)
 * subA/subB/subC : device pointers to the local blocks
 */
static __global__ void cuda_mult_kernel(uint32_t nxC, uint32_t nyC, uint32_t nyA,
					uint32_t ldA, uint32_t ldB, uint32_t ldC,
					float * subA, float * subB, float * subC )
{
	uint32_t id, i, j, k;
	float sum;

	id = blockIdx.x * blockDim.x + threadIdx.x;
	i = id % nxC;
	j = id / nxC;

	/* The grid is rounded up to a multiple of the block size by the
	 * caller, so tail threads fall past the last column: bail out. */
	if (j >= nyC)
	{
		return;
	}

	/* Use a float literal: the original "0." was a double constant
	 * silently narrowed to float. */
	sum = 0.0f;
	for (k = 0 ; k < nyA ; k++)
	{
		sum += subA[i + k*ldA] * subB[k + j*ldB];
	}

	subC[i + j*ldC] = sum;
}
/* StarPU CUDA codelet implementation: launches cuda_mult_kernel on the
 * three matrix blocks described by descr, one thread per element of C.
 * The launch is asynchronous on StarPU's per-worker stream; any launch
 * error is reported through STARPU_CUDA_REPORT_ERROR. */
extern "C" void cuda_mult(void *descr[], void *arg)
{
	(void)arg;

	/* STARPU_MATRIX_GET_PTR gives the first element of the local copy
	 * of each block (device memory here). */
	float *blockA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	float *blockB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	float *blockC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	/*
	 * Note: STARPU_MATRIX_GET_NX/NY is different from X/Y of the FORTRAN
	 * ordering:
	 * - nx is the number of consecutive elements (the number of rows in
	 *   FORTRAN order)
	 * - ny is the number of series separated by ld elements (the number
	 *   of columns in FORTRAN order)
	 * - ld stands for leading dimension
	 *
	 * NB: when filters were used, the leading dimension is not
	 * guaranteed to match the one of the original matrix in main
	 * memory. */
	uint32_t rowsC = STARPU_MATRIX_GET_NX(descr[2]);
	uint32_t colsC = STARPU_MATRIX_GET_NY(descr[2]);
	uint32_t depth = STARPU_MATRIX_GET_NY(descr[0]);
	uint32_t lda   = STARPU_MATRIX_GET_LD(descr[0]);
	uint32_t ldb   = STARPU_MATRIX_GET_LD(descr[1]);
	uint32_t ldc   = STARPU_MATRIX_GET_LD(descr[2]);

	/* One thread per element of C; round the grid size up so the whole
	 * block is covered (the kernel guards against the overshoot). */
	uint32_t nelems  = rowsC * colsC;
	uint32_t ngrid   = (nelems + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

	cuda_mult_kernel
		<<< ngrid, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
		>>> (rowsC, colsC, depth, lda, ldb, ldc, blockA, blockB, blockC);

	/* Kernel launches do not return an error code directly: fetch any
	 * launch-configuration error explicitly. */
	cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(err);
}