1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
/*
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2012-2014 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*/
#include "ompi_config.h"
#include "ompi/constants.h"
#include <string.h>
#include <stdio.h>
#include <ctype.h>
#include "opal/dss/dss.h"
#include "opal/util/argv.h"
#include "opal/util/proc.h"
#include "opal/util/opal_getcwd.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/threads/threads.h"
#include "opal/class/opal_list.h"
#include "opal/dss/dss.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/ess/ess.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/odls/odls.h"
#include "orte/mca/plm/plm.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/rml/rml_types.h"
#include "orte/mca/rmaps/rmaps.h"
#include "orte/mca/rmaps/rmaps_types.h"
#include "orte/mca/rmaps/base/base.h"
#include "orte/mca/rml/base/rml_contact.h"
#include "orte/mca/state/state.h"
#include "orte/mca/routed/routed.h"
#include "orte/util/name_fns.h"
#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_wait.h"
#include "orte/runtime/orte_data_server.h"
#include "ompi/mca/rte/base/base.h"
#include "ompi/mca/rte/rte.h"
#include "ompi/debuggers/debuggers.h"
#include "ompi/proc/proc.h"
#include "ompi/runtime/params.h"
#include "ompi/communicator/communicator.h"
extern ompi_rte_component_t mca_rte_orte_component;
void ompi_rte_abort(int error_code, char *fmt, ...)
{
va_list arglist;
/* If there was a message, output it */
va_start(arglist, fmt);
if( NULL != fmt ) {
char* buffer = NULL;
vasprintf( &buffer, fmt, arglist );
opal_output( 0, "%s", buffer );
free( buffer );
}
va_end(arglist);
/* if I am a daemon or the HNP... */
if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
/* whack my local procs */
orte_odls.kill_local_procs(NULL);
/* whack any session directories */
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
} else {
/* cleanup my session directory */
orte_session_dir_finalize(ORTE_PROC_MY_NAME);
}
/* if a critical connection failed, or a sensor limit was exceeded, exit without dropping a core */
if (ORTE_ERR_CONNECTION_FAILED == error_code ||
ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
orte_ess.abort(error_code, false);
} else {
orte_ess.abort(error_code, true);
}
/*
* We must exit in orte_ess.abort; all implementations of orte_ess.abort
* contain __opal_attribute_noreturn__
*/
/* No way to reach here, but put an exit() here a) just to cover
for bugs, and b) to let the compiler know we're honoring the
__opal_attribute_noreturn__. */
exit(-1);
}
static size_t handler = SIZE_MAX;
static bool debugger_register_active = true;
static bool debugger_event_active = true;
static void _release_fn(int status,
const opal_process_name_t *source,
opal_list_t *info, opal_list_t *results,
opal_pmix_notification_complete_fn_t cbfunc,
void *cbdata)
{
/* must let the notifier know we are done */
if (NULL != cbfunc) {
cbfunc(ORTE_SUCCESS, NULL, NULL, NULL, cbdata);
}
debugger_event_active = false;
}
static void _register_fn(int status,
size_t evhandler_ref,
void *cbdata)
{
opal_list_t *codes = (opal_list_t*)cbdata;
handler = evhandler_ref;
OPAL_LIST_RELEASE(codes);
debugger_register_active = false;
}
/*
* Wait for a debugger if asked. We support two ways of waiting for
* attaching debuggers -- see big comment in
* orte/tools/orterun/debuggers.c explaining the two scenarios.
*/
void ompi_rte_wait_for_debugger(void)
{
int debugger;
opal_list_t *codes, directives;
opal_value_t *kv;
char *evar;
int time;
/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
debugger = orte_in_parallel_debugger;
if (1 == MPIR_being_debugged) {
debugger = 1;
}
if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
/* if not, just return */
return;
}
/* if we are being debugged, then we need to find
* the correct plug-ins
*/
ompi_debugger_setup_dlls();
if (NULL != (evar = getenv("ORTE_TEST_DEBUGGER_SLEEP"))) {
time = strtol(evar, NULL, 10);
sleep(time);
return;
}
if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
#if defined(HAVE_USLEEP)
usleep(100000); /* microseconds */
#else
sleep(1); /* seconds */
#endif
}
} else {
/* register an event handler for the ORTE_ERR_DEBUGGER_RELEASE event */
codes = OBJ_NEW(opal_list_t);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup("errorcode");
kv->type = OPAL_INT;
kv->data.integer = ORTE_ERR_DEBUGGER_RELEASE;
opal_list_append(codes, &kv->super);
OBJ_CONSTRUCT(&directives, opal_list_t);
kv = OBJ_NEW(opal_value_t);
kv->key = strdup(OPAL_PMIX_EVENT_HDLR_NAME);
kv->type = OPAL_STRING;
kv->data.string = strdup("MPI-DEBUGGER-ATTACH");
opal_list_append(&directives, &kv->super);
opal_pmix.register_evhandler(codes, &directives, _release_fn, _register_fn, codes);
/* let the MPI progress engine run while we wait for registration to complete */
OMPI_WAIT_FOR_COMPLETION(debugger_register_active);
OPAL_LIST_DESTRUCT(&directives);
/* let the MPI progress engine run while we wait for debugger release */
OMPI_WAIT_FOR_COMPLETION(debugger_event_active);
/* deregister the event handler */
opal_pmix.deregister_evhandler(handler, NULL, NULL);
}
}
bool ompi_rte_connect_accept_support(const char *port)
{
char *ptr, *tmp;
orte_process_name_t name;
/* were we launched by mpirun, or are we calling
* without a defined port? */
if (NULL == orte_process_info.my_hnp_uri ||
NULL == port || 0 == strlen(port)) {
return true;
}
/* is the job family in the port different than my own? */
tmp = strdup(port); // protect input
if (NULL == (ptr = strchr(tmp, ':'))) {
/* this port didn't come from us! */
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
free(tmp);
return false;
}
*ptr = '\0';
if (ORTE_SUCCESS != orte_util_convert_string_to_process_name(&name, tmp)) {
free(tmp);
orte_show_help("help-orterun.txt", "orterun:malformedport", true);
return false;
}
free(tmp);
if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) == ORTE_JOB_FAMILY(name.jobid)) {
/* same job family, so our infrastructure is adequate */
return true;
}
/* if the job family of the port is different than our own
* and we were launched by mpirun, then we require ompi-server
* support */
if (NULL == orte_data_server_uri) {
/* print a pretty help message */
orte_show_help("help-orterun.txt", "orterun:server-unavailable", true);
return false;
}
return true;
}
|