/*
    -- MAGMA (version 2.9.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2025

       @author Ahmad Abdelfattah
*/
#if defined(MAGMA_HAVE_CUDA)
#include <cuda.h>    // for CUDA_VERSION
#endif
#include "magma_internal.h"
#include "error.h"
// =============================================================================
// Level 1 BLAS

// =============================================================================
// Level 2 BLAS

// =============================================================================
// Level 3 BLAS

/***************************************************************************//**
    Perform FP16 matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$.
    With CUDA, this routine requires CUDA 7.5 or greater and a device of
    compute capability 5.3 or higher; with HIP, it dispatches to hipblasGemmEx.

    @param[in]
    transA  Operation op(A) to perform on matrix A.

    @param[in]
    transB  Operation op(B) to perform on matrix B.

    @param[in]
    m       Number of rows of C and op(A). m >= 0.

    @param[in]
    n       Number of columns of C and op(B). n >= 0.

    @param[in]
    k       Number of columns of op(A) and rows of op(B). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      HALF PRECISION array on GPU device.
            If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n
            otherwise, the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      HALF PRECISION array on GPU device.
            If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n
            otherwise, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      HALF PRECISION array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    lddc    Leading dimension of dC.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_gemm
*******************************************************************************/
extern "C" void
magma_hgemm(
magma_trans_t transA, magma_trans_t transB,
magma_int_t m, magma_int_t n, magma_int_t k,
magmaHalf alpha,
magmaHalf_const_ptr dA, magma_int_t ldda,
magmaHalf_const_ptr dB, magma_int_t lddb,
magmaHalf beta,
magmaHalf_ptr dC, magma_int_t lddc,
magma_queue_t queue )
{
#if CUDA_VERSION >= 7500
magma_int_t arch = magma_getdevice_arch();
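    // FP16 arithmetic requires a device of compute capability 5.3 (sm_53) or higher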
    if( arch >= 530 ) {
        #if CUDA_VERSION >= 9000
        // turn on tensor cores by default
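        // (note: CUBLAS_TENSOR_OP_MATH is deprecated as of CUDA 11, where
        //  tensor cores are used for FP16 GEMM by default)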
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_TENSOR_OP_MATH);
        #endif

        cublasHgemm(
            queue->cublas_handle(),
            cublas_trans_const( transA ),
            cublas_trans_const( transB ),
            int(m), int(n), int(k),
            &alpha, dA, int(ldda),
                    dB, int(lddb),
            &beta,  dC, int(lddc) );

        #if CUDA_VERSION >= 9000
        // roll back to default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_DEFAULT_MATH);
        #endif
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#elif defined(MAGMA_HAVE_HIP)
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 330 ) {
        hipblasGemmEx(
            queue->hipblas_handle(),
            hipblas_trans_const( transA ),
            hipblas_trans_const( transB ),
            int(m), int(n), int(k),
            (void*)&alpha, (void*)dA, HIPBLAS_R_16F, int(ldda),
                           (void*)dB, HIPBLAS_R_16F, int(lddb),
            (void*)&beta,  (void*)dC, HIPBLAS_R_16F, int(lddc),
            HIPBLAS_R_16F,
            HIPBLAS_GEMM_DEFAULT);
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#else
    printf("ERROR: unsupported CUDA version for %s \n", __func__ );
#endif
}
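
/*
    A minimal usage sketch for magma_hgemm (illustrative only; not compiled as
    part of the library). It assumes a device with FP16 support, and the FP16
    scalar conversion is an assumption: __float2half is host-callable in
    recent CUDA toolkits, but older ones may need a different conversion.

        magma_init();
        magma_queue_t queue;
        magma_queue_create( 0, &queue );

        magma_int_t m = 1024, n = 1024, k = 1024;
        magma_int_t ldda = m, lddb = k, lddc = m;
        magmaHalf_ptr dA, dB, dC;
        magma_malloc( (void**)&dA, ldda*k*sizeof(magmaHalf) );
        magma_malloc( (void**)&dB, lddb*n*sizeof(magmaHalf) );
        magma_malloc( (void**)&dC, lddc*n*sizeof(magmaHalf) );
        // ... fill dA and dB with FP16 data ...

        // C = 1*A*B + 0*C, all operands FP16
        magmaHalf alpha = __float2half( 1.0f );
        magmaHalf beta  = __float2half( 0.0f );
        magma_hgemm( MagmaNoTrans, MagmaNoTrans, m, n, k,
                     alpha, dA, ldda,
                            dB, lddb,
                     beta,  dC, lddc, queue );

        magma_queue_sync( queue );
        magma_free( dA ); magma_free( dB ); magma_free( dC );
        magma_queue_destroy( queue );
        magma_finalize();
*/

/***************************************************************************//**
    Perform a mixed-precision matrix-matrix product,
    \f$ C = \alpha op(A) op(B) + \beta C \f$, where A and B are HALF PRECISION
    and C is SINGLE PRECISION. The product is accumulated in FP32.
    Arguments follow magma_hgemm, except that alpha, beta, and dC are
    single precision.

    @ingroup magma_gemm
*******************************************************************************/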
extern "C" void
magma_hgemmx(
magma_trans_t transA, magma_trans_t transB,
magma_int_t m, magma_int_t n, magma_int_t k,
float alpha,
magmaHalf_const_ptr dA, magma_int_t ldda,
magmaHalf_const_ptr dB, magma_int_t lddb,
float beta,
float *dC, magma_int_t lddc,
magma_queue_t queue )
{
#if defined(MAGMA_HAVE_HIP)
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 330 ) {
        hipblasGemmEx(
            queue->hipblas_handle(),
            hipblas_trans_const( transA ),
            hipblas_trans_const( transB ),
            int(m), int(n), int(k),
            (void*)&alpha, (void*)dA, HIPBLAS_R_16F, int(ldda),
                           (void*)dB, HIPBLAS_R_16F, int(lddb),
            (void*)&beta,  (void*)dC, HIPBLAS_R_32F, int(lddc),
            HIPBLAS_R_32F,
            HIPBLAS_GEMM_DEFAULT);
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#else
    #if CUDA_VERSION >= 7500
    magma_int_t arch = magma_getdevice_arch();
    if( arch >= 530 ) {
        #if CUDA_VERSION >= 9000
        // turn on tensor cores by default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_TENSOR_OP_MATH);
        #endif

        cublasGemmEx( queue->cublas_handle(),
                      cublas_trans_const( transA ), cublas_trans_const( transB ),
                      int(m), int(n), int(k),
                      &alpha, dA, CUDA_R_16F, int(ldda),
                              dB, CUDA_R_16F, int(lddb),
                      &beta,  dC, CUDA_R_32F, int(lddc),
                      CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);

        #if CUDA_VERSION >= 9000
        // roll back to default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_DEFAULT_MATH);
        #endif
    }
    else {
        // match magma_hgemm: report devices without FP16 support
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
    #else
    printf("ERROR: unsupported CUDA version for %s \n", __func__ );
    #endif
#endif
}
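
/*
    A minimal usage sketch for magma_hgemmx (illustrative only): FP16 inputs
    dA and dB, FP32 output, accumulation in FP32. Queue and input setup follow
    the magma_hgemm sketch above; dC32 is a hypothetical FP32 device buffer.

        float alpha = 1.0f, beta = 0.0f;
        float *dC32;
        magma_smalloc( &dC32, lddc*n );   // lddc*n floats on the device
        magma_hgemmx( MagmaNoTrans, MagmaNoTrans, m, n, k,
                      alpha, dA, ldda,
                             dB, lddb,
                      beta,  dC32, lddc, queue );
        magma_free( dC32 );
*/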