File: dsetmatrix_transpose.cpp

package info (click to toggle)

magma 2.9.0%2Bds-2

links: PTS, VCS
area: contrib
in suites: forky, sid, trixie
size: 83,212 kB
sloc: cpp: 709,115; fortran: 121,916; ansic: 32,343; python: 25,603; f90: 15,208; makefile: 942; xml: 253; csh: 232; sh: 203; perl: 104

file content (77 lines) | stat: -rw-r--r-- 2,992 bytes

/*
    -- MAGMA (version 2.9.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2025

       @generated from magmablas/zsetmatrix_transpose.cpp, normal z -> d, Wed Jan 22 14:41:49 2025

*/
#include "magma_internal.h"

/***************************************************************************//**
    Copy and transpose matrix hA on CPU host to dAT on GPU device.

    @param[in]  m       Number of rows    of input matrix hA. m >= 0.
    @param[in]  n       Number of columns of input matrix hA. n >= 0.
    @param[in]  nb      Block size. nb >= 0.
    @param[in]  hA      The m-by-n matrix A on the CPU, of dimension (lda,n).
    @param[in]  lda     Leading dimension of matrix hA. lda >= m.
    @param[out] dAT     The n-by-m matrix A^T on the GPU, of dimension (ldda,m).
    @param[in]  ldda    Leading dimension of matrix dAT. ldda >= n.
    @param[out] dwork   Workspace on the GPU, of dimension (2*lddw*nb).
    @param[in]  lddw    Leading dimension of dwork. lddw >= m.
    @param[in]  queues  Array of two queues, to pipeline operation.

    @ingroup magma_setmatrix_transpose
*******************************************************************************/
extern "C" void
magmablas_dsetmatrix_transpose(
    magma_int_t m, magma_int_t n, magma_int_t nb,
    const double     *hA, magma_int_t lda,
    magmaDouble_ptr       dAT, magma_int_t ldda,
    magmaDouble_ptr     dwork, magma_int_t lddw,
    magma_queue_t queues[2] )
{
#define    hA(i_, j_)    (hA + (i_) + (j_)*lda)
#define   dAT(i_, j_)   (dAT + (i_) + (j_)*ldda)
#define dwork(i_, j_) (dwork + (i_) + (j_)*lddw)

    magma_int_t i = 0, j = 0, ib;

    /* Quick return */
    if ( (m == 0) || (n == 0) )
        return;

    // TODO standard check arguments
    if (lda < m || ldda < n || lddw < m) {
        fprintf( stderr, "%s: wrong arguments.\n", __func__ );
        return;
    }

    /* Move data from CPU to GPU in the first panel in the dwork buffer */
    ib = min(n-i, nb);
    magma_dsetmatrix_async( m, ib,
                            hA(0,i), lda,
                            dwork(0,(j%2)*nb), lddw, queues[j%2] );
    j++;

    for (i=nb; i < n; i += nb) {
        /* Move data from CPU to GPU in the second panel in the dwork buffer */
        ib = min(n-i, nb);
        magma_dsetmatrix_async( m, ib,
                                hA(0,i), lda,
                                dwork(0,(j%2)*nb), lddw, queues[j%2] );
        j++;
        
        /* Note that the previous panel (i.e., j%2) comes through the queue
           for the kernel so there is no need to synchronize.             */
        // TODO should this be ib not nb?
        magmablas_dtranspose( m, nb, dwork(0,(j%2)*nb), lddw, dAT(i-nb,0), ldda, queues[j%2] );
    }

    /* Transpose the last part of the matrix.                            */
    j++;
    magmablas_dtranspose( m, ib, dwork(0,(j%2)*nb), lddw, dAT(i-nb,0), ldda, queues[j%2] );
}