1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
|
/*
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2008 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2008 Institut National de Recherche en Informatique
* et Automatique. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Interface for waitpid / async notification of child death with the
* libevent runtime system.
*/
#ifndef ORTE_WAIT_H
#define ORTE_WAIT_H
#include "orte_config.h"
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#include "opal/dss/dss.h"
#include "opal/util/output.h"
#include "opal/sys/atomic.h"
#include "opal/event/event.h"
#include "opal/runtime/opal_progress.h"
#include "orte/types.h"
#include "orte/mca/rml/rml_types.h"
BEGIN_C_DECLS
typedef struct {
opal_object_t super;
char *name;
int channel;
opal_atomic_lock_t lock;
} orte_trigger_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_trigger_event_t);
/** typedef for callback function used in \c ompi_rte_wait_cb */
typedef void (*orte_wait_fn_t)(pid_t wpid, int status, void *data);
/**
* Disable / re-Enable SIGCHLD handler
*
* These functions have to be used after orte_wait_init was called.
*/
ORTE_DECLSPEC void orte_wait_enable(void);
ORTE_DECLSPEC void orte_wait_disable(void);
/**
* Wait for process terminiation
*
* Similar to \c waitpid, \c orte_waitpid utilizes the run-time
* event library for process terminiation notification. The \c
* WUNTRACED option is not supported, but the \c WNOHANG option is
* supported.
*
* \note A \c wpid value of \c -1 is not currently supported and will
* return an error.
*/
ORTE_DECLSPEC pid_t orte_waitpid(pid_t wpid, int *status, int options);
/**
* Register a callback for process termination
*
* Register a callback for notification when \c wpid causes a SIGCHLD.
* \c waitpid() will have already been called on the process at this
* time.
*
* If a thread is already blocked in \c ompi_rte_waitpid for \c wpid,
* this function will return \c ORTE_ERR_EXISTS. It is illegal for
* multiple callbacks to be registered for a single \c wpid
* (OMPI_EXISTS will be returned in this case).
*
* \warning It is not legal for \c wpid to be -1 when registering a
* callback.
*/
ORTE_DECLSPEC int orte_wait_cb(pid_t wpid, orte_wait_fn_t callback, void *data);
ORTE_DECLSPEC int orte_wait_cb_cancel(pid_t wpid);
ORTE_DECLSPEC int orte_wait_cb_disable(void);
ORTE_DECLSPEC int orte_wait_cb_enable(void);
/**
* Setup to wait for an event
*
* This function is used to setup a trigger event that can be used elsewhere
* in the code base where we want to wait for some event to
* happen. For example, orterun uses this function to setup an event
* that is used to notify orterun of abnormal and normal termination
* so it can wakeup and exit cleanly.
*
* The event will be defined so that firing the provided trigger
* will cause the event to trigger and callback to the provided
* function
*/
ORTE_DECLSPEC int orte_wait_event(opal_event_t **event,
orte_trigger_event_t *trig,
char *trigger_name,
void (*cbfunc)(int, short, void*));
/**
* In a number of places in the code, we need to wait for something
* to complete - for example, waiting for all launched procs to
* report into the HNP. In such cases, we want to just call progress
* so that any messages get processed, but otherwise "hold" the
* program at this spot until the counter achieves the specified
* value. We also want to provide a boolean flag, though, so that
* we break out of the loop should something go wrong.
*/
#define ORTE_PROGRESSED_WAIT(failed, counter, limit) \
do { \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"progressed_wait: %s %d", \
__FILE__, __LINE__)); \
while (!(failed) && (counter) < (limit)) { \
opal_progress(); \
} \
} while(0); \
/**
* Trigger a defined event
*
* This function will trigger a previously-defined event - as setup
* by orte_wait_event - by firing the provided trigger
*/
ORTE_DECLSPEC void orte_trigger_event(orte_trigger_event_t *trig);
/**
* Setup an event to process a message
*
* If we are in an OOB recv callback, we frequently cannot process
* the received message until after we return from the callback to
* avoid a potential loopback situation - i.e., where processing
* the message can result in a message being sent somewhere that
* subsequently causes the recv we are in to get called again.
* This is typically the problem facing the daemons and HNP.
*
* To resolve this problem, we place the message to be processed on
* a list, and create a zero-time event that calls the function
* that will process the received message. The event library kindly
* does not trigger this event until after we return from the recv
* since the recv itself is considered an "event"! Thus, we will
* always execute the specified event cb function -after- leaving
* the recv.
*/
typedef struct {
opal_object_t super;
opal_event_t *ev;
orte_process_name_t sender;
opal_buffer_t *buffer;
orte_rml_tag_t tag;
#if OPAL_ENABLE_DEBUG
char *file;
int line;
#endif
} orte_message_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_message_event_t);
#define ORTE_MESSAGE_EVENT_DELAY(delay, mev) \
do { \
struct timeval now; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining message event delay: %s %d", \
__FILE__, __LINE__)); \
now.tv_sec = delay/1000000; \
now.tv_usec = delay%1000000; \
opal_evtimer_add(mev->ev, &now); \
} while(0);
#if OPAL_ENABLE_DEBUG
#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc) \
do { \
orte_message_event_t *mev; \
struct timeval now; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining message event: %s %d", \
__FILE__, __LINE__)); \
mev = OBJ_NEW(orte_message_event_t); \
mev->sender.jobid = (sndr)->jobid; \
mev->sender.vpid = (sndr)->vpid; \
opal_dss.copy_payload(mev->buffer, (buf)); \
mev->tag = (tg); \
mev->file = strdup((buf)->parent.cls_init_file_name); \
mev->line = (buf)->parent.cls_init_lineno; \
opal_evtimer_set(mev->ev, (cbfunc), mev); \
now.tv_sec = 0; \
now.tv_usec = 0; \
opal_evtimer_add(mev->ev, &now); \
} while(0);
#else
#define ORTE_MESSAGE_EVENT(sndr, buf, tg, cbfunc) \
do { \
orte_message_event_t *mev; \
struct timeval now; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining message event: %s %d", \
__FILE__, __LINE__)); \
mev = OBJ_NEW(orte_message_event_t); \
mev->sender.jobid = (sndr)->jobid; \
mev->sender.vpid = (sndr)->vpid; \
opal_dss.copy_payload(mev->buffer, (buf)); \
mev->tag = (tg); \
opal_evtimer_set(mev->ev, (cbfunc), mev); \
now.tv_sec = 0; \
now.tv_usec = 0; \
opal_evtimer_add(mev->ev, &now); \
} while(0);
#endif
/* Sometimes, we just need to get out of the event library so
* we can progress - and we need to pass a little info. For those
* cases, we define a zero-time event that passes info to a cbfunc
*/
typedef struct {
opal_object_t super;
opal_event_t *ev;
orte_process_name_t proc;
} orte_notify_event_t;
ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_notify_event_t);
#define ORTE_NOTIFY_EVENT(cbfunc, data) \
do { \
struct timeval now; \
orte_notify_event_t *tmp; \
tmp = OBJ_NEW(orte_notify_event_t); \
tmp->proc.jobid = (data)->jobid; \
tmp->proc.vpid = (data)->vpid; \
opal_evtimer_set(tmp->ev, (cbfunc), tmp); \
now.tv_sec = 0; \
now.tv_usec = 0; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining notify event at %s:%d", \
__FILE__, __LINE__)); \
opal_evtimer_add(tmp->ev, &now); \
} while(0); \
/**
* In a number of places within the code, we want to setup a timer
* to detect when some procedure failed to complete. For example,
* when we launch the daemons, we frequently have no way to directly
* detect that a daemon failed to launch. Setting a timer allows us
* to automatically fail out of the launch if we don't hear from a
* daemon in some specified time window.
*
* Computing the amount of time to wait takes a few lines of code, but
* this macro encapsulates those lines along with the timer event
* definition just as a convenience. It also centralizes the
* necessary checks to ensure that the microsecond field is always
* less than 1M since some systems care about that, and to ensure
* that the computed wait time doesn't exceed the desired max
* wait
*/
#define ORTE_DETECT_TIMEOUT(event, n, deltat, maxwait, cbfunc) \
do { \
struct timeval now; \
opal_event_t *tmp; \
int timeout; \
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
opal_evtimer_set(tmp, (cbfunc), NULL); \
timeout = (deltat) * (n); \
if ((maxwait) > 0 && timeout > (maxwait)) { \
timeout = (maxwait); \
} \
now.tv_sec = timeout/1000000; \
now.tv_usec = timeout%1000000; \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining timeout: %ld sec %ld usec at %s:%d", \
(long)now.tv_sec, (long)now.tv_usec, \
__FILE__, __LINE__)); \
opal_evtimer_add(tmp, &now); \
*(event) = tmp; \
}while(0); \
/**
* There are places in the code where we just want to periodically
* wakeup to do something, and then go back to sleep again. Setting
* a timer allows us to do this
*/
#define ORTE_TIMER_EVENT(sec, usec, cbfunc) \
do { \
struct timeval now; \
opal_event_t *tmp; \
tmp = (opal_event_t*)malloc(sizeof(opal_event_t)); \
opal_evtimer_set(tmp, (cbfunc), tmp); \
now.tv_sec = (sec); \
now.tv_usec = (usec); \
OPAL_OUTPUT_VERBOSE((1, orte_debug_output, \
"defining timer event: %ld sec %ld usec at %s:%d", \
(long)now.tv_sec, (long)now.tv_usec, \
__FILE__, __LINE__)); \
opal_evtimer_add(tmp, &now); \
}while(0); \
/**
* \internal
*
* Initialize the wait system (allocate mutexes, etc.)
*/
ORTE_DECLSPEC int orte_wait_init(void);
/**
* Kill all processes we are waiting on.
*/
ORTE_DECLSPEC int orte_wait_kill(int sig);
/**
* \internal
*
* Finalize the wait system (deallocate mutexes, etc.)
*/
ORTE_DECLSPEC int orte_wait_finalize(void);
END_C_DECLS
#endif /* #ifndef ORTE_WAIT_H */
|