1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "opal/dss/dss.h"
#include "opal/util/argv.h"
#include "opal/util/proc.h"
#include "opal/util/opal_getcwd.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/threads/threads.h"
#include "opal/class/opal_list.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/state/state.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_data_server.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
/* Declaration only: the ORTE RTE component object itself is defined in
 * another translation unit. */
extern ompi_rte_orte_component_t mca_rte_orte_component;
/*
 * Abort this process, optionally emitting a printf-style message first.
 *
 * error_code  status passed through to orte_ess.abort()
 * fmt         printf-style format string for the message, or NULL to
 *             print nothing
 * ...         arguments consumed by fmt
 *
 * This function does not return.
 */
void ompi_rte_abort(int error_code, char *fmt, ...)
{
    /* If there was a message, output it */
    if (NULL != fmt) {
        va_list arglist;
        char *buffer = NULL;

        va_start(arglist, fmt);
        if (vasprintf(&buffer, fmt, arglist) >= 0 && NULL != buffer) {
            opal_output(0, "%s", buffer);
            free(buffer);
        } else {
            /* Formatting failed (e.g. out of memory).  The contents of
             * buffer are not reliable in that case, so neither print nor
             * free it; show the raw format string instead so the reason
             * for the abort is not lost entirely. */
            opal_output(0, "%s", fmt);
        }
        va_end(arglist);
    }

    /* if I am a daemon or the HNP... */
    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
        /* whack my local procs */
        orte_odls.kill_local_procs(NULL);
        /* whack any session directories */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
    } else {
        /* cleanup my session directory */
        orte_session_dir_finalize(ORTE_PROC_MY_NAME);
    }

    /* if a critical connection failed, or a sensor limit was exceeded,
     * exit without dropping a core */
    if (ORTE_ERR_CONNECTION_FAILED == error_code ||
        ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
        orte_ess.abort(error_code, false);
    } else {
        orte_ess.abort(error_code, true);
    }

    /*
     * We must exit in orte_ess.abort; all implementations of orte_ess.abort
     * contain __opal_attribute_noreturn__
     */
    /* No way to reach here, but put an exit() here a) just to cover
       for bugs, and b) to let the compiler know we're honoring the
       __opal_attribute_noreturn__. */
    exit(-1);
}
/*
 * Wait for a debugger if asked.  We support two ways of waiting for
 * attaching debuggers -- see the big comment in
 * orte/tools/orterun/debuggers.c explaining the two scenarios.
 *
 * Returns immediately when no debugger is involved and the
 * ORTE_TEST_DEBUGGER_ATTACH environment variable is not set.  When
 * ORTE_TEST_DEBUGGER_SLEEP is set, its value is read as a base-10 number
 * of seconds to sleep instead of waiting for a release; negative values
 * are treated as 0 and values too large for sleep() are saturated.
 */
void ompi_rte_wait_for_debugger(void)
{
    int debugger;
    orte_rml_recv_cb_t xfer;
    char *evar;

    /* See lengthy comment in orte/tools/orterun/debuggers.c about
       orte_in_parallel_debugger */
    debugger = orte_in_parallel_debugger;
    if (1 == MPIR_being_debugged) {
        debugger = 1;
    }

    if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
        /* if not, just return */
        return;
    }

    /* if we are being debugged, then we need to find
     * the correct plug-ins
     */
    ompi_debugger_setup_dlls();

    if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
        long requested = strtol(evar, NULL, 10);
        unsigned int seconds = 0;

        /* sleep() takes an unsigned count: a negative request must not be
         * allowed to wrap around into an enormous delay. */
        if (requested > 0) {
            seconds = (unsigned int) requested;
            if ((long) seconds != requested) {
                /* value did not fit in unsigned int; saturate */
                seconds = (unsigned int) -1;
            }
        }
        sleep(seconds);
        return;
    }

    if (orte_standalone_operation) {
        /* spin until debugger attaches and releases us */
        while (MPIR_debug_gate == 0) {
#if defined(HAVE_USLEEP)
            usleep(100000); /* microseconds */
#else
            sleep(1);       /* seconds */
#endif
        }
    } else {
        /* only the rank=0 proc waits for either a message from the
         * HNP or for the debugger to attach - everyone else will just
         * spin in the grpcomm barrier in ompi_mpi_init until rank=0
         * joins them.
         */
        if (0 != ORTE_PROC_MY_NAME->vpid) {
            return;
        }

        /* VPID 0 waits for a message from the HNP */
        /* NOTE(review): xfer is constructed here but never passed to
         * OBJ_DESTRUCT in this function - confirm whether that is
         * intentional. */
        OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
        xfer.active = true;
        orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
                                ORTE_RML_TAG_DEBUGGER_RELEASE,
                                ORTE_RML_NON_PERSISTENT,
                                orte_rml_recv_callback, &xfer);
        /* let the MPI progress engine run while we wait */
        OMPI_WAIT_FOR_COMPLETION(xfer.active);
    }
}
|