File: mult_cuda.cu

package info (click to toggle)
starpu 1.4.5%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 38,692 kB
  • sloc: ansic: 324,396; sh: 7,181; makefile: 6,636; lisp: 6,055; xml: 5,031; f90: 4,994; python: 2,398; cpp: 1,353; java: 330; sed: 162; pascal: 57; fortran: 25
file content (131 lines) | stat: -rw-r--r-- 4,357 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2022  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 * Copyright (C) 2010       Mehdi Juhoor
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

/*
 * This example shows a simple implementation of a blocked matrix
 * multiplication. Note that this is NOT intended to be an efficient
 * implementation of sgemm! In this example, we show:
 *  - how to declare dense matrices (starpu_matrix_data_register)
 *  - how to manipulate matrices within codelets (eg. descr[0].blas.ld)
 *  - how to use filters to partition the matrices into blocks
 *    (starpu_data_partition and starpu_data_map_filters)
 *  - how to unpartition data (starpu_data_unpartition) and how to stop
 *    monitoring data (starpu_data_unregister)
 *  - how to manipulate subsets of data (starpu_data_get_sub_data)
 *  - how to construct an autocalibrated performance model (starpu_perfmodel)
 *  - how to submit asynchronous tasks
 */

#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <signal.h>
#include <starpu.h>

#define THREADS_PER_BLOCK 256

/*
 * That program should compute C = A * B
 *
 *   A of size (z,y)
 *   B of size (x,z)
 *   C of size (x,y)

              |---------------|
            z |       B       |
              |---------------|
       z              x
     |----|   |---------------|
     |    |   |               |
     |    |   |               |
     | A  | y |       C       |
     |    |   |               |
     |    |   |               |
     |----|   |---------------|

 * Note: we use FORTRAN ordering.
 */

/*
 * The codelet is passed 3 matrices, the "descr" union-type field gives a
 * description of the layout of those 3 matrices in the local memory (ie. RAM
 * in the case of CPU, GPU frame buffer in the case of GPU etc.). Since we have
 * registered data with the "matrix" data interface, we use the matrix macros.
 */
static __global__ void cuda_mult_kernel(uint32_t nxC, uint32_t nyC, uint32_t nyA,
				       uint32_t ldA, uint32_t ldB, uint32_t ldC,
				       float * subA, float * subB, float * subC )
{
	uint32_t id, i, j, k;
	float sum;
	id = blockIdx.x * blockDim.x + threadIdx.x;
	i = id % nxC;
	j = id / nxC;
	if (j >= nyC)
	{
		return;
	}
	sum = 0.;
	for (k = 0 ; k < nyA ; k++)
	{
		sum += subA[i + k*ldA] * subB[k + j*ldB];
	}
	subC[i + j*ldC] = sum;
}

extern "C" void cuda_mult(void *descr[], void *arg)
{
	(void)arg;
	float *d_subA, *d_subB, *d_subC;
	uint32_t nxC, nyC, nyA;
	uint32_t ldA, ldB, ldC;
	uint32_t nblocks;

	/* ptr gives a pointer to the first element of the local copy */
	d_subA = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
	d_subB = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
	d_subC = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

	/*
	 * Note: STARPU_MATRIX_GET_NX/NY is different from X/Y of the FORTRAN
	 * ordering:
	 * - nx is the number of consecutive elements (thus the number of rows
	 *   in FORTRAN order)
	 * - ny is the number of series that are separated by ld elements (thus
	 *   the number of columns in FORTRAN order)
	 * - ld stands for leading dimension
	 *
	 * NB: in case some filters were used, the leading dimension is not
	 * guaranteed to be the same in main memory (on the original matrix)
	 * and on the accelerator! */

	nxC = STARPU_MATRIX_GET_NX(descr[2]);
	nyC = STARPU_MATRIX_GET_NY(descr[2]);
	nyA = STARPU_MATRIX_GET_NY(descr[0]);

	ldA = STARPU_MATRIX_GET_LD(descr[0]);
	ldB = STARPU_MATRIX_GET_LD(descr[1]);
	ldC = STARPU_MATRIX_GET_LD(descr[2]);

	nblocks = (nxC * nyC + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
        cuda_mult_kernel
		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);

	cudaError_t status = cudaGetLastError();
	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
}