File: blas_h_v2.cpp

/*
    -- MAGMA (version 2.5.4) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date October 2020

       @author Ahmad Abdelfattah
*/
#include <cuda.h>    // for CUDA_VERSION
#include "magma_internal.h"
#include "error.h"

#ifdef HAVE_CUBLAS

// =============================================================================
// Level 1 BLAS

// =============================================================================
// Level 2 BLAS

// =============================================================================
// Level 3 BLAS

/***************************************************************************//**
    Perform FP16 matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$.
    This routine requires CUDA 7.5 or later and a GPU of compute capability 5.3 (sm_53) or higher.

    @param[in]
    transA  Operation op(A) to perform on matrix A.

    @param[in]
    transB  Operation op(B) to perform on matrix B.

    @param[in]
    m       Number of rows of C and op(A). m >= 0.

    @param[in]
    n       Number of columns of C and op(B). n >= 0.

    @param[in]
    k       Number of columns of op(A) and rows of op(B). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      HALF PRECISION array on GPU device.
            If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n
            otherwise,                 the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      HALF PRECISION array on GPU device.
            If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n
            otherwise,                 the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      HALF PRECISION array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    lddc    Leading dimension of dC.

    @param[in]
    queue   magma_queue_t
            Queue to execute in.

    @ingroup magma_gemm
*******************************************************************************/
extern "C" void
magma_hgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaHalf alpha,
    magmaHalf_const_ptr dA, magma_int_t ldda,
    magmaHalf_const_ptr dB, magma_int_t lddb,
    magmaHalf beta,
    magmaHalf_ptr       dC, magma_int_t lddc,
    magma_queue_t queue )
{
#if CUDA_VERSION >= 7500
    magma_int_t arch = magma_getdevice_arch();
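    // FP16 arithmetic requires a GPU of compute capability 5.3 (sm_53) or newer.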
    if( arch >= 530 ) {
        #if CUDA_VERSION >= 9000
        // turn on tensor cores by default
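        // (Tensor Cores are used on sm_70+ hardware; on earlier GPUs this is effectively a no-op.)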
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_TENSOR_OP_MATH);
        #endif

        cublasHgemm(
            queue->cublas_handle(),
            cublas_trans_const( transA ),
            cublas_trans_const( transB ),
            int(m), int(n), int(k),
            &alpha, dA, int(ldda),
                    dB, int(lddb),
            &beta,  dC, int(lddc) );
        
        #if CUDA_VERSION >= 9000
        // roll back to default
        cublasSetMathMode(queue->cublas_handle(), CUBLAS_DEFAULT_MATH);
        #endif
    }
    else {
        printf("ERROR: unsupported architecture for %s \n", __func__ );
    }
#else
    printf("ERROR: unsupported CUDA version for %s \n", __func__ );
#endif    // CUDA_VERSION >= 7500
}
#endif // HAVE_CUBLAS
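
Usage sketch (not part of the upstream file): a minimal host program that calls magma_hgemm
on column-major device buffers. It assumes magmaHalf maps to CUDA's __half under HAVE_CUBLAS
and that the host-side __float2half from <cuda_fp16.h> is available (CUDA 9+); matrix fill
and error checking are elided.

// example_hgemm.cpp -- illustrative only; link against MAGMA, cuBLAS, and the CUDA runtime.
#include <cuda_fp16.h>     // __float2half (host-callable in recent CUDA)
#include "magma_v2.h"

int main()
{
    magma_init();

    magma_int_t m = 128, n = 128, k = 128;

    magma_device_t dev;
    magma_queue_t  queue;
    magma_getdevice( &dev );
    magma_queue_create( dev, &queue );

    // Allocate device matrices (column-major; leading dimension = number of rows).
    magmaHalf *dA, *dB, *dC;
    magma_malloc( (void**) &dA, m*k*sizeof(magmaHalf) );
    magma_malloc( (void**) &dB, k*n*sizeof(magmaHalf) );
    magma_malloc( (void**) &dC, m*n*sizeof(magmaHalf) );
    // ... fill dA, dB, dC on the device (omitted) ...

    magmaHalf alpha = __float2half( 1.0f );
    magmaHalf beta  = __float2half( 0.0f );

    // C = alpha*A*B + beta*C in FP16
    magma_hgemm( MagmaNoTrans, MagmaNoTrans, m, n, k,
                 alpha, dA, m,
                        dB, k,
                 beta,  dC, m,
                 queue );
    magma_queue_sync( queue );

    magma_free( dA ); magma_free( dB ); magma_free( dC );
    magma_queue_destroy( queue );
    magma_finalize();
    return 0;
}

Note that MAGMA follows the BLAS column-major convention, so the leading dimensions above are
simply the row counts of the unallocated-padding case; larger values are valid as long as they
satisfy the constraints documented for ldda, lddb, and lddc.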