File: blas_h_v2.cpp

/*
    -- MAGMA (version 2.9.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2025

       @author Ahmad Abdelfattah
*/
#include <cuda.h>    // for CUDA_VERSION
#include "magma_internal.h"
#include "error.h"

// =============================================================================
// Level 1 BLAS

// =============================================================================
// Level 2 BLAS

// =============================================================================
// Level 3 BLAS

/***************************************************************************//**
    Perform FP16 matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$.
    This routine requires CUDA 7.5 or greater and a device of compute
    capability 5.3 or higher (a HIP build on a supported architecture also
    works).

    @param[in]
    transA  Operation op(A) to perform on matrix A.

    @param[in]
    transB  Operation op(B) to perform on matrix B.

    @param[in]
    m       Number of rows of C and op(A). m >= 0.

    @param[in]
    n       Number of columns of C and op(B). n >= 0.

    @param[in]
    k       Number of columns of op(A) and rows of op(B). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      HALF PRECISION array on GPU device.
            If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n
            otherwise,                 the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      HALF PRECISION array on GPU device.
            If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n
            otherwise,                 the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      HALF PRECISION array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    lddc    Leading dimension of dC.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_gemm
*******************************************************************************/
extern "C" void
magma_hgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaHalf alpha,
    magmaHalf_const_ptr dA, magma_int_t ldda,
    magmaHalf_const_ptr dB, magma_int_t lddb,
    magmaHalf beta,
    magmaHalf_ptr       dC, magma_int_t lddc,
    magma_queue_t queue )
{
#if CUDA_VERSION >= 7500
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 530 ) {
        #if CUDA_VERSION >= 9000
        // turn on tensor cores by default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_TENSOR_OP_MATH);
        #endif

        cublasHgemm(
            queue->cublas_handle(),
            cublas_trans_const( transA ),
            cublas_trans_const( transB ),
            int(m), int(n), int(k),
            &alpha, dA, int(ldda),
                    dB, int(lddb),
            &beta,  dC, int(lddc) );

        #if CUDA_VERSION >= 9000
        // roll back to default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_DEFAULT_MATH);
        #endif
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#elif defined(MAGMA_HAVE_HIP)
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 330 ) {
        hipblasGemmEx(
            queue->hipblas_handle(),
            hipblas_trans_const( transA ),
            hipblas_trans_const( transB ),
            int(m), int(n), int(k),
            (void*)&alpha, (void*)dA, HIPBLAS_R_16F, int(ldda),
                           (void*)dB, HIPBLAS_R_16F, int(lddb),
            (void*)&beta,  (void*)dC, HIPBLAS_R_16F, int(lddc),
            HIPBLAS_R_16F,
            HIPBLAS_GEMM_DEFAULT);
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#else
    printf("ERROR: unsupported architecture version for %s \n", __func__ );
#endif
}
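
/*
    Example (a minimal sketch, not part of this file): calling magma_hgemm on
    device buffers. Assumes a CUDA build and a device of compute capability
    >= 5.3; the helper name example_hgemm is hypothetical, and error checking
    is omitted for brevity.

    #include "magma_v2.h"

    void example_hgemm( magma_int_t m, magma_int_t n, magma_int_t k )
    {
        magma_init();
        magma_queue_t queue;
        magma_queue_create( 0, &queue );          // queue on device 0

        magmaHalf_ptr dA, dB, dC;
        magma_malloc( (void**)&dA, m*k*sizeof(magmaHalf) );
        magma_malloc( (void**)&dB, k*n*sizeof(magmaHalf) );
        magma_malloc( (void**)&dC, m*n*sizeof(magmaHalf) );
        // ... fill dA, dB, dC on the device ...

        magmaHalf alpha = 1.0f, beta = 0.0f;      // __half converts from float in C++
        magma_hgemm( MagmaNoTrans, MagmaNoTrans, m, n, k,
                     alpha, dA, m,                // ldda = m (no transpose)
                            dB, k,                // lddb = k
                     beta,  dC, m, queue );       // lddc = m
        magma_queue_sync( queue );                // wait for the GEMM to finish

        magma_free( dA ); magma_free( dB ); magma_free( dC );
        magma_queue_destroy( queue );
        magma_finalize();
    }
*/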

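/***************************************************************************//**
    Mixed-precision GEMM, \f$ C = \alpha op(A) op(B) + \beta C \f$, where
    A and B are HALF PRECISION and C is SINGLE PRECISION; the computation is
    carried out in single precision. Arguments follow magma_hgemm above,
    except that alpha, beta, and dC are single precision.

    @ingroup magma_gemm
*******************************************************************************/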
extern "C" void
magma_hgemmx(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    float alpha,
    magmaHalf_const_ptr dA, magma_int_t ldda,
    magmaHalf_const_ptr dB, magma_int_t lddb,
    float beta,
    float *dC, magma_int_t lddc,
    magma_queue_t queue )
{
#if defined(MAGMA_HAVE_HIP)
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 330 ) {
        hipblasGemmEx(
            queue->hipblas_handle(),
            hipblas_trans_const( transA ),
            hipblas_trans_const( transB ),
            int(m), int(n), int(k),
            (void*)&alpha, (void*)dA, HIPBLAS_R_16F, int(ldda),
                           (void*)dB, HIPBLAS_R_16F, int(lddb),
            (void*)&beta,  (void*)dC, HIPBLAS_R_32F, int(lddc),
            HIPBLAS_R_32F,
            HIPBLAS_GEMM_DEFAULT);
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#else
    #if CUDA_VERSION >= 7500
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 530 ) {
        #if CUDA_VERSION >= 9000
        // turn on tensor cores by default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_TENSOR_OP_MATH);
        #endif
        cublasGemmEx( queue->cublas_handle(),
                      cublas_trans_const( transA ), cublas_trans_const( transB ),
                      int(m), int(n), int(k),
                      &alpha, dA,        CUDA_R_16F, int(ldda),
                              dB,        CUDA_R_16F, int(lddb),
                      &beta,  dC,        CUDA_R_32F, int(lddc),
                      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
        #if CUDA_VERSION >= 9000
        // roll back to default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_DEFAULT_MATH);
        #endif
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
    #else
    printf("ERROR: %s requires CUDA 7.5+ or HIP \n", __func__ );
    #endif
#endif
}
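
/*
    Example (a minimal sketch, not part of this file): calling magma_hgemmx
    with FP16 inputs and an FP32 result. Assumes device buffers dA, dB
    (magmaHalf) and dC (float) were allocated and filled as in the sketch
    above, with a queue already created.

    float alpha = 1.0f, beta = 0.0f;
    magma_hgemmx( MagmaNoTrans, MagmaNoTrans, m, n, k,
                  alpha, dA, m,
                         dB, k,
                  beta,  dC, m, queue );
    magma_queue_sync( queue );
*/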