File: orte_quit.c

package info (click to toggle)
openmpi 2.0.2-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 99,912 kB
  • ctags: 55,589
  • sloc: ansic: 525,999; f90: 18,307; makefile: 12,062; sh: 6,583; java: 6,278; asm: 3,515; cpp: 2,227; perl: 2,136; python: 1,350; lex: 734; fortran: 52; tcl: 12
file content (371 lines) | stat: -rw-r--r-- 16,904 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2008 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
 * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2007-2015 Los Alamos National Security, LLC.  All rights
 *                         reserved.
 * Copyright (c) 2012      Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2014      Intel, Inc. All rights reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "orte_config.h"
#include "orte/constants.h"

#include <string.h>
#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#include <errno.h>
#include <signal.h>
#include <ctype.h>
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif  /* HAVE_SYS_TYPES_H */
#ifdef HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif  /* HAVE_SYS_WAIT_H */
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif  /* HAVE_SYS_TIME_H */

#include "orte/mca/plm/plm.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/routed/routed.h"
#include "orte/mca/state/state.h"

#include "orte/util/session_dir.h"
#include "orte/util/show_help.h"

#include "orte/runtime/runtime.h"
#include "orte/runtime/orte_globals.h"
#include "orte/runtime/orte_quit.h"
#include "orte/runtime/orte_locks.h"
#include "orte/runtime/orte_data_server.h"

/*
 * Globals
 */
/* running count of procs seen in ORTE_PROC_STATE_ABORTED; incremented by
 * dump_aborted_procs() and read by orte_quit() for its summary line */
static int num_aborted = 0;
/* running count of procs seen in ORTE_PROC_STATE_ABORTED_BY_SIG or
 * ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED */
static int num_killed = 0;
/* running count of procs seen in ORTE_PROC_STATE_FAILED_TO_START or
 * ORTE_PROC_STATE_FAILED_TO_LAUNCH */
static int num_failed_start = 0;
/* set to true by orte_quit() just before it reports an abnormal
 * termination; orte_quit() skips the report when this is already true */
static bool errors_reported = false;

/* forward declaration - the definition appears later in this file */
static void dump_aborted_procs(void);

/*
 * Emit the opening "<stderr>" tag of a summary line, but only when
 * XML output has been requested.
 */
static void quit_summary_open(void)
{
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "<stderr>");
    }
}

/*
 * Finish a summary line: close the "<stderr>" element when XML output
 * has been requested, then always terminate the line.
 */
static void quit_summary_close(void)
{
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "&#010;</stderr>");
    }
    fprintf(orte_xml_fp, "\n");
}

/*
 * Report any abnormal termination (HNP only, at most once) and then
 * stop the event loop so that the caller can exit.
 *
 * fd     - not used
 * args   - not used
 * cbdata - NULL, or an orte_state_caddy_t that is released here
 *
 * Only the first invocation that wins orte_quit_lock does any work
 * beyond releasing the caddy; later invocations return immediately.
 */
void orte_quit(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *state_caddy = (orte_state_caddy_t*)cbdata;

    /* we own the caddy - let go of it before anything else */
    if (NULL != state_caddy) {
        OBJ_RELEASE(state_caddy);
    }

    /* one-time lock guards against "bounce": a non-zero return
     * means the lock was already taken, so there is nothing to do
     */
    if (opal_atomic_trylock(&orte_quit_lock)) {
        return;
    }

    /* only the hnp reports, only once, only for a non-zero exit
     * status, and only if we were not asked to be quiet
     */
    if (ORTE_PROC_IS_HNP && !errors_reported &&
        0 != orte_exit_status && !orte_execute_quiet) {
        errors_reported = true;

        /* describe the failure and tally the affected procs */
        dump_aborted_procs();

        /* when more than one proc fell into a category, follow up
         * with the total for that category
         */
        if (1 < num_failed_start) {
            quit_summary_open();
            fprintf(orte_xml_fp, "%d total process%s failed to start",
                    num_failed_start, ((1 < num_failed_start) ? "es" : ""));
            quit_summary_close();
        }
        if (1 < num_aborted) {
            quit_summary_open();
            fprintf(orte_xml_fp, "%d total process%s aborted",
                    num_aborted, ((1 < num_aborted) ? "es" : ""));
            quit_summary_close();
        }
        if (1 < num_killed) {
            quit_summary_open();
            fprintf(orte_xml_fp, "%d total process%s killed (some possibly by %s during cleanup)",
                    num_killed, ((1 < num_killed) ? "es" : ""), orte_basename);
            quit_summary_close();
        }
    }

    /* mark the event base inactive so nobody loops on it again ... */
    orte_event_base_active = false;
    /* ... and kick ourselves out of the loop that is running now */
    opal_event_base_loopbreak(orte_event_base);
}

/*
 * On abnormal termination - report why the first offending job failed.
 *
 * Walks orte_job_data starting at index 1 and stops at the first job
 * whose state is none of UNDEF, INIT, RUNNING, TERMINATED or
 * ABORT_ORDERED and that has an ORTE_JOB_ABORTED_PROC attribute.  For
 * every job examined on the way, the procs that failed to start, aborted
 * or were killed are added to num_failed_start, num_aborted and
 * num_killed.  A help message selected by the job state (and, for some
 * states, by the proc's exit code) is then shown and the function
 * returns; later jobs are not examined.
 *
 * If the proc's app context or node cannot be obtained, the string
 * "unknown" is reported in its place instead of dereferencing NULL.
 */

static void dump_aborted_procs(void)
{
    orte_std_cntr_t i, n;
    orte_proc_t *proc, *pptr;
    orte_app_context_t *approc;
    orte_job_t *job;
    orte_node_t *node;
    const char *node_name, *app_name, *app_cwd;

    /* find the job that caused the problem - be sure to start the loop
     * at 1 as the daemons are in 0 and will clearly be "running", so no
     * point in checking them
     */
    for (n=1; n < orte_job_data->size; n++) {
        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
            /* the array is no longer left-justified, so we have to continue */
            continue;
        }
        if (ORTE_JOB_STATE_UNDEF != job->state &&
            ORTE_JOB_STATE_INIT != job->state &&
            ORTE_JOB_STATE_RUNNING != job->state &&
            ORTE_JOB_STATE_TERMINATED != job->state &&
            ORTE_JOB_STATE_ABORT_ORDERED != job->state) {

            /* cycle through and count the number that were killed or aborted */
            for (i=0; i < job->procs->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(job->procs, i))) {
                    /* empty slot - skip it and keep scanning the rest */
                    continue;
                }
                if (ORTE_PROC_STATE_FAILED_TO_START == pptr->state ||
                    ORTE_PROC_STATE_FAILED_TO_LAUNCH == pptr->state) {
                    ++num_failed_start;
                } else if (ORTE_PROC_STATE_ABORTED == pptr->state) {
                    ++num_aborted;
                } else if (ORTE_PROC_STATE_ABORTED_BY_SIG == pptr->state ||
                           ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED == pptr->state) {
                    ++num_killed;
                }
            }

            /* this is a guilty party - but without a recorded aborted
             * proc there is nothing to describe, so move to the next job
             */
            proc = NULL;
            if (!orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC, (void**)&proc, OPAL_PTR) ||
                NULL == proc) {
                continue;
            }

            /* either lookup can come back NULL; fall back to a
             * placeholder so the report below cannot dereference NULL
             */
            approc = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, proc->app_idx);
            node = proc->node;
            node_name = (NULL != node) ? node->name : "unknown";
            app_name = (NULL != approc) ? approc->app : "unknown";
            app_cwd = (NULL != approc) ? approc->cwd : "unknown";

            if (ORTE_JOB_STATE_FAILED_TO_START == job->state ||
                ORTE_JOB_STATE_FAILED_TO_LAUNCH == job->state) {
                switch (proc->exit_code) {
                case ORTE_ERR_SILENT:
                    /* say nothing - it was already reported */
                    break;
                case ORTE_ERR_SYS_LIMITS_PIPES:
                    orte_show_help("help-orterun.txt", "orterun:sys-limit-pipe", true,
                                   orte_basename, node_name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_PIPE_SETUP_FAILURE:
                    orte_show_help("help-orterun.txt", "orterun:pipe-setup-failure", true,
                                   orte_basename, node_name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_SYS_LIMITS_CHILDREN:
                    orte_show_help("help-orterun.txt", "orterun:sys-limit-children", true,
                                   orte_basename, node_name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_FAILED_GET_TERM_ATTRS:
                    orte_show_help("help-orterun.txt", "orterun:failed-term-attrs", true,
                                   orte_basename, node_name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_WDIR_NOT_FOUND:
                    orte_show_help("help-orterun.txt", "orterun:wdir-not-found", true,
                                   orte_basename, app_cwd,
                                   node_name, (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_EXE_NOT_FOUND:
                    orte_show_help("help-orterun.txt", "orterun:exe-not-found", true,
                                   orte_basename,
                                   (unsigned long)proc->name.vpid,
                                   orte_basename,
                                   orte_basename,
                                   node_name,
                                   app_name);
                    break;
                case ORTE_ERR_EXE_NOT_ACCESSIBLE:
                    orte_show_help("help-orterun.txt", "orterun:exe-not-accessible", true,
                                   orte_basename, app_name, node_name,
                                   (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_MULTIPLE_AFFINITIES:
                    orte_show_help("help-orterun.txt",
                                   "orterun:multiple-paffinity-schemes", true, NULL);
                    break;
                case ORTE_ERR_TOPO_SLOT_LIST_NOT_SUPPORTED:
                    /* note: this case reports the local nodename, not the proc's node */
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported",
                                   true, orte_process_info.nodename, "rankfile containing a slot_list of ",
                                   NULL, app_name);
                    break;
                case ORTE_ERR_INVALID_NODE_RANK:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-node-rank", true);
                    break;
                case ORTE_ERR_INVALID_LOCAL_RANK:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-local-rank", true);
                    break;
                case ORTE_ERR_NOT_ENOUGH_CORES:
                    orte_show_help("help-orterun.txt",
                                   "orterun:not-enough-resources", true,
                                   "sockets", node_name,
                                   "bind-to-core", app_name);
                    break;
                case ORTE_ERR_TOPO_CORE_NOT_SUPPORTED:
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported",
                                   true, node_name, "bind-to-core", "",
                                   app_name);
                    break;
                case ORTE_ERR_INVALID_PHYS_CPU:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-phys-cpu", true);
                    break;
                case ORTE_ERR_NOT_ENOUGH_SOCKETS:
                    orte_show_help("help-orterun.txt",
                                   "orterun:not-enough-resources", true,
                                   "sockets", node_name,
                                   "bind-to-socket", app_name);
                    break;
                case ORTE_ERR_TOPO_SOCKET_NOT_SUPPORTED:
                    orte_show_help("help-orterun.txt",
                                   "orterun:topo-not-supported",
                                   true, node_name, "bind-to-socket", "",
                                   app_name);
                    break;
                case ORTE_ERR_MODULE_NOT_FOUND:
                    orte_show_help("help-orterun.txt",
                                   "orterun:paffinity-missing-module",
                                   true, node_name);
                    break;
                case ORTE_ERR_SLOT_LIST_RANGE:
                    orte_show_help("help-orterun.txt",
                                   "orterun:invalid-slot-list-range",
                                   true, node_name, NULL);
                    break;
                case ORTE_ERR_PIPE_READ_FAILURE:
                    orte_show_help("help-orterun.txt", "orterun:pipe-read-failure", true,
                                   orte_basename, node_name, (unsigned long)proc->name.vpid);
                    break;
                case ORTE_ERR_SOCKET_NOT_AVAILABLE:
                    orte_show_help("help-orterun.txt", "orterun:proc-socket-not-avail", true,
                                   orte_basename, ORTE_ERROR_NAME(proc->exit_code), node_name,
                                   (unsigned long)proc->name.vpid);
                    break;

                default:
                    if (0 != proc->exit_code) {
                        orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start", true,
                                       orte_basename, ORTE_ERROR_NAME(proc->exit_code), node_name,
                                       (unsigned long)proc->name.vpid);
                    } else {
                        orte_show_help("help-orterun.txt", "orterun:proc-failed-to-start-no-status", true,
                                       orte_basename, node_name);
                    }
                    break;
                }
            } else if (ORTE_JOB_STATE_ABORTED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-ordered-abort", true,
                               orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                               node_name, orte_basename);
            } else if (ORTE_JOB_STATE_ABORTED_BY_SIG == job->state) {  /* aborted by signal */
                int signum = WTERMSIG(proc->exit_code);
#ifdef HAVE_STRSIGNAL
                /* look the description up once; it is used for both the
                 * NULL test and the message
                 */
                const char *sigdesc = strsignal(signum);
                if (NULL != sigdesc) {
                    orte_show_help("help-orterun.txt", "orterun:proc-aborted-strsignal", true,
                                   orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                                   node_name, signum,
                                   sigdesc);
                } else {
#endif
                    orte_show_help("help-orterun.txt", "orterun:proc-aborted", true,
                                   orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                                   node_name, signum);
#ifdef HAVE_STRSIGNAL
                }
#endif
            } else if (ORTE_JOB_STATE_ABORTED_WO_SYNC == job->state) { /* proc exited w/o finalize */
                orte_show_help("help-orterun.txt", "orterun:proc-exit-no-sync", true,
                               orte_basename, (unsigned long)proc->name.vpid, (unsigned long)proc->pid,
                               node_name, orte_basename, orte_basename);
            } else if (ORTE_JOB_STATE_COMM_FAILED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-comm-failed", true,
                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                               ORTE_NAME_PRINT(&proc->name), node_name);
            } else if (ORTE_JOB_STATE_SENSOR_BOUND_EXCEEDED == job->state) {
                switch (proc->exit_code) {
                case ORTE_ERR_MEM_LIMIT_EXCEEDED:
                    orte_show_help("help-orterun.txt", "orterun:proc-mem-exceeded", true,
                                   ORTE_NAME_PRINT(&proc->name), node_name);
                    break;
                case ORTE_ERR_PROC_STALLED:
                    orte_show_help("help-orterun.txt", "orterun:proc-stalled", true);
                    break;

                default:
                    orte_show_help("help-orterun.txt", "orterun:proc-sensor-exceeded", true);
                    break;
                }
            } else if (ORTE_JOB_STATE_HEARTBEAT_FAILED == job->state) {
                orte_show_help("help-orterun.txt", "orterun:proc-heartbeat-failed", true,
                               orte_basename, ORTE_NAME_PRINT(&proc->name), node_name);
            } else if (orte_abort_non_zero_exit &&
                       ORTE_JOB_STATE_NON_ZERO_TERM == job->state) {
                orte_show_help("help-orterun.txt", "orterun:non-zero-exit", true,
                               orte_basename, ORTE_NAME_PRINT(&proc->name), proc->exit_code);
            }
            /* only the first offending job is reported */
            return;
        }
    }
}