1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
|
/*
-- MAGMA (version 2.9.0) --
Univ. of Tennessee, Knoxville
Univ. of California, Berkeley
Univ. of Colorado, Denver
@date January 2025
@generated from magmablas/zsetmatrix_transpose_mgpu.cpp, normal z -> d, Wed Jan 22 14:41:55 2025
@author Ichitaro Yamazaki
*/
#include "magma_internal.h"
/***************************************************************************//**
Copy and transpose matrix hA on CPU host to dAT, which is distributed
row block cyclic over multiple GPUs.
@param[in] ngpu Number of GPUs over which dAT is distributed.
@param[in] m Number of rows of input matrix hA. m >= 0.
@param[in] n Number of columns of input matrix hA. n >= 0.
@param[in] nb Block size. nb >= 0.
@param[out] hA The m-by-n matrix A on the CPU, of dimension (lda,n).
@param[in] lda Leading dimension of matrix hA. lda >= m.
@param[in] dAT Array of ngpu pointers, one per GPU, that store the
disributed n-by-m matrix A^T on the GPUs, each of dimension (ldda,m).
@param[in] ldda Leading dimension of each matrix dAT on each GPU. ngpu*ldda >= n.
@param[out] dwork Array of ngpu pointers, one per GPU, that store the
workspaces on each GPU, each of dimension (2*lddw*nb).
@param[in] lddw Leading dimension of dwork. lddw >= m.
@param[in] queues 2D array of dimension (ngpu,2), with two queues per GPU.
@ingroup magma_setmatrix_transpose
*******************************************************************************/
extern "C" void
magmablas_dsetmatrix_transpose_mgpu(
magma_int_t ngpu,
magma_int_t m, magma_int_t n, magma_int_t nb,
const double *hA, magma_int_t lda,
magmaDouble_ptr dAT[], magma_int_t ldda,
magmaDouble_ptr dwork[], magma_int_t lddw,
magma_queue_t queues[][2] )
{
#define hA(j) (hA + (j)*lda)
#define dwork(d, j) (dwork[(d)] + (j)*nb*lddw)
#define dAT(d, j) (dAT[(d)] + (j)*nb)
magma_int_t nqueues = 2, d, j, j_local, id, ib;
/* Quick return */
if ( (m == 0) || (n == 0) )
return;
if (lda < m || ngpu*ldda < n || lddw < m) {
fprintf( stderr, "%s: wrong arguments (%lld < %lld), (%lld*%lld < %lld), or (%lld < %lld).\n",
__func__,
(long long) lda, (long long) m,
(long long) ngpu, (long long) ldda, (long long) n,
(long long) lddw, (long long) m );
return;
}
/* Move data from CPU to GPU by block columns and transpose it */
for (j=0; j < n; j += nb) {
d = (j/nb)%ngpu;
j_local = (j/nb)/ngpu;
id = j_local%nqueues;
magma_setdevice(d);
ib = min(n-j, nb);
magma_dsetmatrix_async( m, ib,
hA(j), lda,
dwork(d, id), lddw,
queues[d][id] );
magmablas_dtranspose( m, ib, dwork(d,id), lddw, dAT(d,j_local), ldda, queues[d][id] );
}
}
|