1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
|
SUBROUTINE PSROW2COL( ICTXT, M, N, NB, VS, LDVS, VD, LDVD,
$ RSRC, CSRC, RDEST, CDEST, WORK)
*
* -- ScaLAPACK tools routine (version 1.7) --
* University of Tennessee, Knoxville, Oak Ridge National Laboratory,
* and University of California, Berkeley.
* May 1, 1997
*
* .. Scalar Arguments ..
INTEGER CDEST, CSRC, ICTXT, LDVD, LDVS, M, N, NB,
$ RDEST, RSRC
* ..
* .. Array Arguments ..
REAL VD( LDVD, * ), VS( LDVS, * ), WORK( * )
* ..
*
* Purpose
* =======
*
* Take a block of vectors with M total rows which are distributed over
* a row of processes, and distribute those rows over a column of
* processes. This routine minimizes communication by sending all
* information it has that a given process in the CDEST needs at once.
* To do this it uses the least common multiple (LCM) concept. This is
* simply the realization that if I have part of a vector split over a
* process row consisting of Q processes, and I want to send all of that
* vector that I own to a new vector distributed over P processes within
* a process column, that after I find the process in RDEST that owns
* the row of the vector I'm currently looking at, he will want every
* ( (LCM(P,Q)/Q ) block of my vector (the block being of size NB x N).
*
* Arguments
* =========
*
* Rem: MP, resp. NQ, denotes the number of local rows, resp. local
* ==== columns, necessary to store a global vector of dimension M
* across P processes, resp. N over Q processes.
*
* ICTXT (global input) INTEGER
* The BLACS context handle, indicating the global context of
* the operation. The context itself is global.
*
* M (global input) INTEGER
* The number of global rows each vector has.
*
* N (global input) INTEGER
* The number of vectors in the vector block
*
* NB (global input) INTEGER
* The blocking factor used to divide the rows of the vector
* amongst the processes of a row.
*
* VS (local input) REAL
* Array of dimension (LDVS,N), the block of vectors stored on
* process row RSRC to be put into memory VD, and stored on
* process column CDEST.
*
* LDVS (local input) INTEGER
* The leading dimension of VS.
*
* VD (local output) REAL
* Array of dimension (LDVD,N), on output, the contents of VD
* stored on process column CDEST will be here.
*
* LDVD (local input) INTEGER
* The leading dimension of VD.
*
* RSRC (global input) INTEGER
* The process row VS is distributed over.
*
* CSRC (global input) INTEGER
* The process column the distributed block of vectors VS
* begins on.
*
* RDEST (global input) INTEGER
* The process row that VD begins on.
*
* CDEST (global input) INTEGER
* The process column to distribute VD over.
*
* WORK (local workspace) REAL
* Array, dimension (LDW). The required size of work varies:
* if( nprow.eq.npcol ) then
* LDW = 0; WORK not accessed.
* else
* lcm = least common multiple of process rows and columns.
* Mq = number of rows of VS on my process.
* npcol = number of process columns
* CEIL = the ceiling of given operation
* LDW = NB*N*CEIL( CEIL( Mq/NB )/(LCM/npcol) )
* end if
*
* =====================================================================
*
* .. Local Scalars ..
INTEGER CBLKSKIP, ICPY, II, ISTART, ICSRC, IRDEST, JB,
$ JJ, K, LCM, MP, MQ, MYCOL, MYDIST, MYROW,
$ NBLOCKS, NPCOL, NPROW, RBLKSKIP
* ..
* .. External Subroutines ..
EXTERNAL BLACS_GRIDINFO, SGESD2D, SGERV2D, SLACPY
* ..
* .. External Functions ..
INTEGER ILCM, NUMROC
EXTERNAL ILCM, NUMROC
* ..
* .. Executable Statements ..
*
*
* .. Initialize Variables ..
*
ICPY = 0
*
CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*
* If we are not in special case for NPROW = NPCOL where there is no
* copying required
*
IF( NPROW .NE. NPCOL ) THEN
LCM = ILCM( NPROW, NPCOL )
RBLKSKIP = LCM / NPCOL
CBLKSKIP = LCM / NPROW
*
* If I have part of VS, the source vector(s)
*
IF( MYROW.EQ.RSRC ) THEN
*
ISTART = 1
*
* Figure my distance from CSRC: the process in CDEST the same
* distance from RDEST will want my first block
*
MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
IRDEST = MOD( RDEST+MYDIST, NPROW )
*
* Loop over all possible destination processes
*
DO 20 K = 1, RBLKSKIP
JJ = 1
*
* If I am not destination process
*
IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
*
* Pack all data I own that destination needs
*
DO 10 II = ISTART, MQ, NB*RBLKSKIP
JB = MIN( NB, MQ-II+1 )
CALL SLACPY( 'G', JB, N, VS(II,1), LDVS,
$ WORK(JJ), JB )
JJ = JJ + NB*N
10 CONTINUE
*
* Figure how many rows are to be sent and send them if
* necessary, NOTE: will send extra if NB > JB
*
JJ = JJ - 1
IF( JJ.GT.0 )
$ CALL SGESD2D( ICTXT, JJ, 1, WORK, JJ, IRDEST,
$ CDEST )
*
* I am both source and destination, save where to start
* copying from for later use
*
ELSE
ICPY = ISTART
END IF
*
ISTART = ISTART + NB
IRDEST = MOD( IRDEST+NPCOL, NPROW )
20 CONTINUE
END IF
*
* If I should receive info into VD
*
IF( MYCOL.EQ.CDEST ) THEN
*
ISTART = 1
*
* Figure my distance from CDEST: the process in CSRC the same
* distance from RSRC will have my first block
*
MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
ICSRC = MOD( CSRC+MYDIST, NPCOL )
*
* Loop over all sending processes
*
DO 50 K = 1, CBLKSKIP
*
* If I don't already possess the required data
*
IF( (MYROW.NE.RSRC).OR.(MYCOL.NE.ICSRC) ) THEN
*
* Figure how many rows to receive, and receive them
* NOTE: may receive to much -- NB instead of JB
*
NBLOCKS = (MP - ISTART + NB) / NB
JJ = ((NBLOCKS+CBLKSKIP-1) / CBLKSKIP)*NB
IF( JJ.GT.0 )
$ CALL SGERV2D( ICTXT, JJ, N, WORK, JJ, RSRC, ICSRC )
*
* Copy data to destination vector
*
JJ = 1
DO 30 II = ISTART, MP, NB*CBLKSKIP
JB = MIN( NB, MP-II+1 )
CALL SLACPY( 'G', JB, N, WORK(JJ), JB, VD(II,1),
$ LDVD )
JJ = JJ + NB*N
30 CONTINUE
*
* If I am both source and destination
*
ELSE
JJ = ICPY
DO 40 II = ISTART, MP, NB*CBLKSKIP
JB = MIN( NB, MP-II+1 )
CALL SLACPY( 'G', JB, N, VS(JJ,1), LDVS, VD(II,1),
$ LDVD )
JJ = JJ + NB*RBLKSKIP
40 CONTINUE
END IF
ISTART = ISTART + NB
ICSRC = MOD( ICSRC+NPROW, NPCOL )
50 CONTINUE
END IF
*
* if NPROW = NPCOL, there is a one-to-one correspondance between
* process rows and columns, so no work space or copying required
*
ELSE
*
IF( MYROW.EQ.RSRC ) THEN
*
* Figure my distance from CSRC: the process in CDEST the same
* distance from RDEST will want my piece of the vector
*
MYDIST = MOD( NPCOL+MYCOL-CSRC, NPCOL )
MQ = NUMROC( M, NB, MYCOL, CSRC, NPCOL )
IRDEST = MOD( RDEST+MYDIST, NPROW )
IF( (MYROW.NE.IRDEST).OR.(MYCOL.NE.CDEST) ) THEN
CALL SGESD2D( ICTXT, MQ, N, VS, LDVS, IRDEST, CDEST )
ELSE
CALL SLACPY( 'G', MQ, N, VS, LDVS, VD, LDVD )
END IF
END IF
IF( MYCOL.EQ.CDEST ) THEN
*
* Figure my distance from RDEST: the process in RSRC the same
* distance from CSRC will have my piece of the vector
*
MYDIST = MOD( NPROW+MYROW-RDEST, NPROW )
MP = NUMROC( M, NB, MYROW, RDEST, NPROW )
ICSRC = MOD( CSRC+MYDIST, NPCOL )
IF( (MYCOL.NE.ICSRC).OR.(MYROW.NE. RSRC) )
$ CALL SGERV2D( ICTXT, MP, N, VD, LDVD, RSRC, ICSRC )
END IF
END IF
*
RETURN
*
* End of PSROW2COL
*
END
|