File: local_queue.cc

package info (click to toggle)
lam 7.1.4-8
  • links: PTS
  • area: main
  • in suites: forky, sid
  • size: 56,404 kB
  • sloc: ansic: 156,541; sh: 9,991; cpp: 7,699; makefile: 5,621; perl: 488; fortran: 260; asm: 83
file content (335 lines) | stat: -rw-r--r-- 11,046 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/*
 * Copyright (c) 2001-2003 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: local_queue.cc,v 1.3 2003/02/04 17:51:30 jsquyres Exp $
 *
 *	Function: - queues up MPI messages to send to the local LAM
 *	rank.  If there are messages ahead of the message being queued
 *	(including long messages that have not finished being received
 *	from the remote host yet), queue it up for later sending.
 *	This is on a destination/cid/tag basis.  i.e., if the
 *	beginning of a long message for local rank 2, cid 3, tag 4
 *	arrives, and then a short message arrives for local rank 2,
 *	cid 17, tag 45, this short message will be allowed to pass.
 *	If, however, another message (short or long) arrives for local
 *	rank 2, cid 3, tag 4 before the first message is released from
 *	the queue, that message will not be sent until the first one
 *	completes (we don't even allow the ping to go if it's a long
 *	message).
 */

#include <lam_config.h>

#if LAM_WANT_IMPI

#include <iostream>
#include <map>
#include <list>

#include <impi.h>
#include <longbuf_mgmt.h>
#include <lamdebug-cc.h>
#include <impid-cc.h>

using std::list;
using std::map;
using std::endl;


/*
 * private types and variables
 */
// One pending item in a local queue: either a data message waiting
// its turn, or the ping of a long message.  Entries are allocated
// with new in local_enqueue() and deleted when they are dequeued in
// local_enqueue_finish_long() or progress_queue().
struct entry {
  // For quick reference, put the drqid in the top level
  // (copied from pk.pk_drqid; local_enqueue_finish_long() searches
  // the queue by this value)

  IMPI_Uint8 drqid;

  IMPI_Packet pk;        // by-value copy of the header given to local_enqueue()
  char* buffer;          // data buffer pointer, stored as given (not copied here)
  MPI_Datatype type;     // passed through to local_req_send()
  IMPI_Packet *syncack;  // passed through to local_req_send(); 0 if there is none

  bool is_datasync_ping; // local_enqueue()'s is_datasync_ping argument
  bool ping_sent;        // true once this entry has been handed to local_req_send() as a long-message ping
  bool is_long;          // local_enqueue()'s is_long argument
  longbuf_mgmt* longbuf; // 0 until local_enqueue_finish_long() attaches the rest of the message
};
// The queue is a four-level nested map whose innermost value is the
// list of pending entries; new entries are pushed on the back and
// sent from the front.  It is always indexed as
// local_queue[src_rank][dest_rank][cid][tag], where cid is the base
// CID (rounded down to a multiple of 3).
typedef list<entry*> local_list_t;
typedef map<IMPI_Uint8, local_list_t> tag_map_t;  // keyed by tag
typedef map<IMPI_Uint8, tag_map_t> cid_map_t;     // keyed by base cid
typedef map<int, cid_map_t> dest_map_t;           // keyed by destination rank
typedef map<int, dest_map_t> src_map_t;           // keyed by source rank

// Inner maps and lists are created on first access by operator[];
// nothing in this file removes them again.
static src_map_t local_queue;
// NOTE(review): presumably the false argument disables debug output
// -- confirm against lamdebug-cc.h.
static Debug debug(false);


/*
 * private functions
 */
// Defined at the end of this file: sends entries from the front of q
// until q is empty or a long-message ping has been sent.
static void progress_queue(local_list_t& q);


/*
 *	local_enqueue
 *
 *	Function:	- deliver a message (or the ping of a long message)
 *                        to a local LAM rank while preserving ordering
 *                        per (source rank, destination rank, base cid,
 *                        tag) slot
 *                      - the source rank is taken from pk_lsrank and
 *                        the destination rank from
 *                        proc_resolver(&pk->pk_dest)
 *                      - if nothing is pending for the slot, the
 *                        message is passed to local_req_send() right
 *                        away; only the ping of a long message is then
 *                        kept in the queue (marked as sent) so that
 *                        later messages wait until the long message
 *                        has been completed by
 *                        local_enqueue_finish_long()
 *                      - if something is already pending for the
 *                        slot, the message is appended to the end of
 *                        the queue and sent later
 *      Accepts:        - ptr to IMPI_Packet header of message (copied
 *                        into the queue entry when queued)
 *                      - ptr to data buffer
 *                      - datatype passed on to local_req_send()
 *                      - ptr to a SYNCACK IMPI_Packet passed on to
 *                        local_req_send() (or 0 if there is no syncack)
 *                      - boolean indicating whether it is a ping or a
 *                        data message
 *                      - boolean indicating whether the message is a
 *                        long message
 *      Returns:        - 0 on success, otherwise the non-zero value
 *                        returned by local_req_send()
 */
int
local_enqueue(IMPI_Packet* pk, char* buffer, MPI_Datatype type,
	      IMPI_Packet* syncack, bool is_datasync_ping, bool is_long)
{
  const int src_rank = pk->pk_lsrank;
  const int dest_rank = proc_resolver(&pk->pk_dest);
  const int tag = (int) pk->pk_tag;

  // OOB messages may travel on related CIDs, so fold every CID onto
  // its base CID before using it as a key.

  const int cid = (((int) pk->pk_cid) / 3) * 3;

  local_list_t &pending = local_queue[src_rank][dest_rank][cid][tag];
  bool ping_already_sent;

  if (!pending.empty()) {

    // Something is ahead of us for this slot, so this message has to
    // wait at the end of the queue.  (The remainder of a long message
    // never comes through here -- it arrives via
    // local_enqueue_finish_long() -- so appending is always right.)

    debug << "local_enqueue: queueing up the message (drqid "
	  << pk->pk_drqid << ", src " << src_rank << " dest " << dest_rank
	  << " tag " << tag << " cid " << cid << " msglen "
	  << pk->pk_msglen << ")" << endl;
    ping_already_sent = false;
  } else {

    // Nothing pending: send straight away.

    debug << "local_enqueue: queue is empty, sending immediaty" << endl;
    const int rc = local_req_send(pk, buffer, type, syncack);
    if (rc != 0)
      return rc;

    // Anything other than the ping of a long message is finished now.

    if (!(is_datasync_ping && is_long))
      return 0;

    // The ping of a long message stays in the queue until the rest of
    // the message shows up (which in turn only happens after the
    // local LAM rank has ACKed the ping).

    debug << "local_enqueue: this was a datasync ping, enqueued (drqid "
	  << pk->pk_drqid << ", src " << src_rank << " dest " << dest_rank
	  << " tag " << tag << " cid " << cid << ")" << endl;
    ping_already_sent = true;
  }

  // Both queueing cases record the same information; only ping_sent
  // differs.

  entry *item = new entry;
  item->ping_sent = ping_already_sent;

  debug << "local_enqeue: queueing up entry: drqid " << pk->pk_drqid << endl;
  item->drqid = pk->pk_drqid;
  item->pk = *pk;
  item->buffer = buffer;
  item->type = type;
  item->syncack = syncack;
  item->is_datasync_ping = is_datasync_ping;
  item->is_long = is_long;
  item->longbuf = 0;
  pending.push_back(item);

  return 0;
}



/*
 *	local_enqueue_finish_long
 *
 *	Function:	
 *                      - called with the data for a long message,
 *                        invoked when we receive the last IMPI_Packet
 *                        of a long message from a remote host
 *                      - i.e., the ping has already been queued (and
 *                        possibly sent already)
 *                      - this function finds the ping in the queue
 *                        (matching on drqid) and attaches the
 *                        longbuf_mgmt to it.
 *                      - if the entry is at the head of the queue (in
 *                        which case the ping will have been sent
 *                        already), send the rest of the message with
 *                        MPI_Send, because we know that the local LAM
 *                        rank is already waiting to receive it
 *                        (otherwise we wouldn't have sent the SYNCACK
 *                        to the remote IMPI host and gotten the rest
 *                        of the long message).  The entry and the
 *                        longbuf are then deleted, and the rest of
 *                        the queue is progressed if the send
 *                        succeeded.
 *                      - if the entry is not at the head of the
 *                        queue, the longbuf is only attached and
 *                        nothing is sent.
 *      Accepts:        - ptr to longbuf_mgmt (deleted here once the
 *                        data has been passed to MPI_Send)
 *      Returns:        - 0 on success, LAMERROR if the drqid is not
 *                        in the queue or if MPI_Send fails
 */
int 
local_enqueue_finish_long(longbuf_mgmt* longbuf)
{
  int ret = 0;
  IMPI_Packet *pk = longbuf->get_packet();
  int src_rank = pk->pk_lsrank;
  int dest_rank = proc_resolver(&pk->pk_dest);
  int tag = (int) pk->pk_tag;
  int cid = (int) pk->pk_cid;

  // Since we can send oob messages on related CIDs, we'll just use
  // the base CID here.

  cid = (cid / 3) * 3;
  local_list_t &q = local_queue[src_rank][dest_rank][cid][tag];
  local_list_t::iterator i;
  IMPI_Uint8 drqid = pk->pk_drqid;

  debug << "local_enqueue_finish_long: got rest of long message (drqid " 
	<< drqid << ", src " << src_rank << " dest " << dest_rank 
	<< " tag " << tag << " cid " << cid << ")" << endl;
  debug << "local_enqueue_finish_log: queue.empty(): " << q.empty() << endl;
  for (i = q.begin(); i != q.end(); ++i)
    if ((*i)->drqid == drqid)
      break;

  if (i == q.end()) {
    debug << "local_enqeue_finish_long didn't find the drqid in the list!" 
	  << endl;
    return LAMERROR;
  }

  // We found the entry in the list, so attach the longbuf to it

  (*i)->longbuf = longbuf;

  // Are we at the head of the queue?  If so, send it immediately,
  // dequeue this entry, and then try to progress the rest of the
  // queue.

  if (i == q.begin()) {
    debug << "local_enqueue_finish_long: long is at head of queue, sending" 
	  << endl;
    struct entry *e = (*i);

    //
    // ***** WARNING *****
    //
    // See the big warning in the local_req.cc.
    //

    // pk, src_rank and dest_rank from the top of the function are
    // reused here.  Earlier revisions re-declared them (and ret) in
    // this scope; the shadowed ret meant that the value returned at
    // the end of the function was always 0, even if MPI_Send failed.

    MPI_Comm comm = comm_make(pk->pk_cid, src_rank,
			      dest_rank, 
			      proc_resolver(&(pk->pk_dest)));
    
    debug << "send_long()>>>>>> the pk_tag is " << (int) pk->pk_tag 
	  << " the pk_lsrank is " << src_rank
	  << " dest_rank is " << dest_rank
	  << " about to do an MPI_Send wiht the rest of the msg" << endl;
    debug << "send_long()>>>>>> proc_resolver of pk_src is " 
	  << proc_resolver(&pk->pk_src)
	  << " of pk_dest is " << proc_resolver(&pk->pk_dest) << endl;
    
    const int send_rc = MPI_Send(longbuf->get_buffer(), 
				 longbuf->get_received(), 
				 MPI_BYTE, dest_rank, pk->pk_tag, comm); 
    debug << "send_long()>>>>>> MPI_Send is done" << endl;
    
    comm_free(comm);

    // All done; dequeue the front entry and try to progress the
    // queue.  pk was obtained from longbuf, so it must not be used
    // after longbuf has been deleted.

    delete e;
    q.erase(i);
    delete longbuf;
    if (send_rc == 0)
      progress_queue(q);
    else
      ret = LAMERROR;
  }
  debug << "local_enque_finish_long: done" << endl;

  return ret;
}


//
// progress_queue
//
// Sends queued entries to the local LAM rank, starting at the front
// of q.  The caller guarantees that the front entry may be sent now.
// Each entry is passed to local_req_send() and then deleted and
// removed from the queue, except for the ping of a long message:
// that entry is marked as sent and left at the front, and the
// function stops there.  The remainder of that long message is
// handled later by local_enqueue_finish_long().
//
static void
progress_queue(local_list_t& q)
{
  debug << "local_progress_queue: progressing queue" << endl;

  // The entry being processed is always the front one, because every
  // entry handled before it has already been removed.

  while (!q.empty()) {
    entry *head = q.front();

    debug << "local_progress_queue: got entry -- drqid \n"
	  << head->drqid << endl;

    // NOTE(review): the result of local_req_send() is not checked, so
    // a failed send still removes the entry below -- confirm whether
    // that is intended.

    local_req_send(&head->pk, head->buffer, head->type, head->syncack);

    // The ping of a long message blocks the queue: the LAM rank has
    // to ACK it, then the remote host sends the rest of the message,
    // and only then can this entry be completed and removed.

    if (head->is_datasync_ping && head->is_long) {
      debug << "local_progress_queue: got a datasync ping; stopping" << endl;
      head->ping_sent = true;
      break;
    }

    // Fully delivered; drop the entry and move on to the next one.

    delete head;
    q.pop_front();
  }

  debug << "local_progress_queue: done" << endl;
}

#endif