/*
* Copyright (c) 2001-2003 The Trustees of Indiana University.
* All rights reserved.
* Copyright (c) 1998-2001 University of Notre Dame.
* All rights reserved.
* Copyright (c) 1994-1998 The Ohio State University.
* All rights reserved.
*
* This file is part of the LAM/MPI software package. For license
* information, see the LICENSE file in the top level directory of the
* LAM/MPI source distribution.
*
* $HEADER$
*
* $Id: local_queue.cc,v 1.3 2003/02/04 17:51:30 jsquyres Exp $
*
* Function: - queues up MPI messages to send to the local LAM
* rank. If there are messages ahead of the message being queued
* (including long messages that have not finished being received
* from the remote host yet), queue it up for later sending.
* This is on a destination/cid/tag basis. i.e., if the
* beginning of a long message for local rank 2, cid 3, tag 4
* arrives, and then a short message arrives for local rank 2,
* cid 17, tag 45, this short message will be allowed to pass.
* If, however, another message (short or long) arrives for local
* rank 2, cid 3, tag 4 before the first message is released from
* the queue, that message will not be sent until the first one
* completes (we don't even allow the ping to go if it's a long
* message).
*/
#include <lam_config.h>
#if LAM_WANT_IMPI
#include <iostream>
#include <map>
#include <list>
#include <impi.h>
#include <longbuf_mgmt.h>
#include <lamdebug-cc.h>
#include <impid-cc.h>
using std::list;
using std::map;
using std::endl;
/*
* private variables
*/
// One queued message (or long-message ping) bound for a local LAM rank.
// Entries are allocated with new in local_enqueue() and deleted when
// they are dequeued in local_enqueue_finish_long() / progress_queue().
struct entry {
// For quick reference, put the drqid in the top level
// (copied from pk.pk_drqid; used to match the tail of a long message
// back to its ping in local_enqueue_finish_long())
IMPI_Uint8 drqid;
// Copy of the packet header, taken when the entry is enqueued
IMPI_Packet pk;
// Data buffer pointer exactly as handed to local_enqueue(); only the
// pointer is stored here, the data is not copied
char* buffer;
// MPI datatype handed to local_req_send() together with buffer
MPI_Datatype type;
// SYNCACK packet handed to local_req_send(), or 0 if there is none
IMPI_Packet *syncack;
// Flags as passed to local_enqueue(); an entry with both
// is_datasync_ping and is_long set is the ping of a long message
bool is_datasync_ping;
// true once the ping has been passed to local_req_send(); false while
// the entry is still waiting behind earlier entries
bool ping_sent;
bool is_long;
// 0 until local_enqueue_finish_long() attaches the received long
// message to this entry
longbuf_mgmt* longbuf;
};
// FIFO of pending entries that share the same src/dest/cid/tag
typedef list<entry*> local_list_t;
// The maps below nest, outermost first, as
//   source rank -> destination rank -> base cid -> tag -> FIFO
// which is the order used when indexing local_queue:
//   local_queue[src_rank][dest_rank][cid][tag]
typedef map<IMPI_Uint8, local_list_t> tag_map_t;
typedef map<IMPI_Uint8, tag_map_t> cid_map_t;
typedef map<int, cid_map_t> dest_map_t;
typedef map<int, dest_map_t> src_map_t;
// All pending local deliveries.  operator[] creates missing levels on
// first use; nothing in this file removes a level once it exists.
static src_map_t local_queue;
// Diagnostic output stream for this file
// NOTE(review): constructed with false -- presumably that disables the
// output; confirm against the Debug class.
static Debug debug(false);
/*
* private functions
*/
// Sends queued entries from the head of q until the queue is empty or
// the ping of a long message has been sent (defined at end of file).
static void progress_queue(local_list_t& q);
/*
 * local_enqueue
 *
 * Function: - hand a message to a local LAM rank, or park it
 *             until the messages ahead of it have gone out
 *           - the queue is selected by the packet's pk_lsrank,
 *             the rank resolved from pk_dest, the base cid
 *             (pk_cid rounded down to a multiple of 3) and
 *             pk_tag
 *           - if that queue is empty, the message is passed to
 *             local_req_send() right away; it is then kept in
 *             the queue only when it is the ping of a long
 *             message, so that later messages wait behind it
 *             until local_enqueue_finish_long() completes it
 *           - if that queue is not empty, the message is
 *             appended unsent, preserving MPI's message
 *             ordering guarantee
 * Accepts:  - ptr to IMPI_Packet header of message (copied
 *             into the queue entry when queued)
 *           - ptr to data buffer (the pointer is stored, the
 *             data is not copied)
 *           - MPI datatype passed on to local_req_send()
 *           - ptr to a SYNCACK IMPI_Packet passed on to
 *             local_req_send() (or 0 if there is no syncack)
 *           - boolean indicating whether it is a ping or a
 *             data message
 *           - boolean indicating whether the message is long
 * Returns:  - 0 on success, otherwise the non-zero value
 *             returned by local_req_send()
 */
int
local_enqueue(IMPI_Packet* pk, char* buffer, MPI_Datatype type,
              IMPI_Packet* syncack, bool is_datasync_ping, bool is_long)
{
  const int src_rank = pk->pk_lsrank;
  const int dest_rank = proc_resolver(&pk->pk_dest);
  const int tag = (int) pk->pk_tag;

  // Out-of-band messages travel on CIDs related to the base one, so
  // fold the CID down to its base before using it as a key.
  const int cid = (((int) pk->pk_cid) / 3) * 3;

  local_list_t &pending = local_queue[src_rank][dest_rank][cid][tag];
  bool ping_already_sent;

  if (!pending.empty()) {
    // Something is ahead of us.  The tail of a long message never comes
    // through here (local_enqueue_finish_long() handles that), so a new
    // arrival always goes to the back of a non-empty queue.
    debug << "local_enqueue: queueing up the message (drqid "
          << pk->pk_drqid << ", src " << src_rank << " dest " << dest_rank
          << " tag " << tag << " cid " << cid << " msglen "
          << pk->pk_msglen << ")" << endl;
    ping_already_sent = false;
  } else {
    // Nothing ahead of us: send straight away.
    debug << "local_enqueue: queue is empty, sending immediaty" << endl;
    const int status = local_req_send(pk, buffer, type, syncack);
    if (status != 0)
      return status;

    // Anything other than the ping of a long message is finished now.
    const bool long_ping = is_datasync_ping && is_long;
    if (!long_ping)
      return 0;

    // The ping of a long message stays in the queue, marked as sent,
    // until the rest of the message arrives from the remote host
    // (which in turn only happens after the local LAM rank ACKs).
    debug << "local_enqueue: this was a datasync ping, enqueued (drqid "
          << pk->pk_drqid << ", src " << src_rank << " dest " << dest_rank
          << " tag " << tag << " cid " << cid << ")" << endl;
    ping_already_sent = true;
  }

  // Both queueing cases record the same information; only ping_sent
  // differs.
  struct entry *item = new entry;
  item->ping_sent = ping_already_sent;
  debug << "local_enqeue: queueing up entry: drqid " << pk->pk_drqid << endl;
  item->drqid = pk->pk_drqid;
  item->pk = *pk;
  item->buffer = buffer;
  item->type = type;
  item->syncack = syncack;
  item->is_datasync_ping = is_datasync_ping;
  item->is_long = is_long;
  item->longbuf = 0;
  pending.push_back(item);

  return 0;
}
/*
 * local_enqueue_finish_long
 *
 * Function: - called with the data for a long message, once
 *             the last IMPI_Packet of that message has been
 *             received from the remote host
 *           - i.e., the ping has already been queued (and
 *             possibly sent already)
 *           - finds the ping's entry in the queue (matched by
 *             drqid) and attaches the longbuf_mgmt to it
 *           - if that entry is at the head of the queue, sends
 *             the rest of the message with MPI_Send, removes
 *             the entry, deletes the longbuf, and (if the send
 *             succeeded) progresses the remainder of the queue
 * Accepts:  - ptr to longbuf_mgmt
 * Returns:  - 0 on success
 *           - LAMERROR if no queued entry has the message's
 *             drqid, or if MPI_Send reports a failure
 */
int
local_enqueue_finish_long(longbuf_mgmt* longbuf)
{
  int ret = 0;
  IMPI_Packet *pk = longbuf->get_packet();
  int src_rank = pk->pk_lsrank;
  int dest_rank = proc_resolver(&pk->pk_dest);
  int tag = (int) pk->pk_tag;
  int cid = (int) pk->pk_cid;

  // Since we can send oob messages on related CIDs, we'll just use
  // the base CID here.
  cid = (cid / 3) * 3;

  local_list_t &q = local_queue[src_rank][dest_rank][cid][tag];
  local_list_t::iterator i;
  IMPI_Uint8 drqid = pk->pk_drqid;

  debug << "local_enqueue_finish_long: got rest of long message (drqid "
        << drqid << ", src " << src_rank << " dest " << dest_rank
        << " tag " << tag << " cid " << cid << ")" << endl;
  debug << "local_enqueue_finish_log: queue.empty(): " << q.empty() << endl;

  // Locate the ping that announced this long message
  for (i = q.begin(); i != q.end(); ++i)
    if ((*i)->drqid == drqid)
      break;
  if (i == q.end()) {
    // NOTE(review): longbuf is not released on this path; who owns it
    // after a failure is not visible from this file.
    debug << "local_enqeue_finish_long didn't find the drqid in the list!"
          << endl;
    return LAMERROR;
  }

  // We found the entry in the list, so attach the longbuf to it
  (*i)->longbuf = longbuf;

  // Are we at the head of the queue?  If so, send it immediately,
  // dequeue this entry, and then try to progress the rest of the
  // queue.
  if (i == q.begin()) {
    debug << "local_enqueue_finish_long: long is at head of queue, sending"
          << endl;
    struct entry *e = (*i);

    //
    // ***** WARNING *****
    //
    // See the big warning in the local_req.cc.
    //
    MPI_Comm comm = comm_make(pk->pk_cid, src_rank,
                              dest_rank,
                              proc_resolver(&(pk->pk_dest)));

    debug << "send_long()>>>>>> the pk_tag is " << (int) pk->pk_tag
          << " the pk_lsrank is " << src_rank
          << " dest_rank is " << dest_rank
          << " about to do an MPI_Send wiht the rest of the msg" << endl;
    debug << "send_long()>>>>>> proc_resolver of pk_src is "
          << proc_resolver(&pk->pk_src)
          << " of pk_dest is " << proc_resolver(&pk->pk_dest) << endl;

    // Keep the send status in its own variable and fold it into the
    // function's ret below.  Declaring a second "ret" here would hide
    // the outer one and make this function report success even when
    // the send failed.
    const int send_status = MPI_Send(longbuf->get_buffer(),
                                     longbuf->get_received(),
                                     MPI_BYTE, dest_rank, pk->pk_tag, comm);
    debug << "send_long()>>>>>> MPI_Send is done" << endl;
    comm_free(comm);

    // All done; dequeue the front entry and try to progress the queue.
    // pk points into longbuf, so it must not be used past this point.
    delete e;
    q.erase(q.begin());
    delete longbuf;

    if (send_status == 0)
      progress_queue(q);
    else
      ret = LAMERROR;
  }

  debug << "local_enque_finish_long: done" << endl;
  return ret;
}
//
// progress_queue
//
// Drains the queue from the front.  The caller guarantees that the
// entry currently at the head may be sent to the local LAM rank.
// Each head entry is passed to local_req_send() and then removed,
// except for the ping of a long message: that entry is marked as sent
// and left at the head, and draining stops there.
//
static void
progress_queue(local_list_t& q)
{
  debug << "local_progress_queue: progressing queue" << endl;

  while (!q.empty()) {
    struct entry *head = q.front();

    debug << "local_progress_queue: got entry -- drqid \n"
          << head->drqid << endl;
    local_req_send(&head->pk, head->buffer, head->type, head->syncack);

    // A long message's ping has to sit at the head until the LAM rank
    // ACKs it; that ACK makes the remote host send the rest of the
    // message, and local_enqueue_finish_long() then completes the
    // delivery and resumes the queue.
    const bool awaiting_long_body = head->is_datasync_ping && head->is_long;
    if (awaiting_long_body) {
      debug << "local_progress_queue: got a datasync ping; stopping" << endl;
      head->ping_sent = true;
      break;
    }

    // Fully delivered: release the entry and move on to the next one
    delete head;
    q.pop_front();
  }

  debug << "local_progress_queue: done" << endl;
}
#endif
|