File: oshmem_shmem_abort.c

package info (click to toggle)
openmpi 5.0.8-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 201,692 kB
  • sloc: ansic: 613,078; makefile: 42,353; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (131 lines) | stat: -rw-r--r-- 3,917 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/*
 * Copyright (c) 2013      Mellanox Technologies, Inc.
 *                         All rights reserved.
 * Copyright (c) 2017      FUJITSU LIMITED.  All rights reserved.
 * Copyright (c) 2019      Research Organization for Information Science
 *                         and Technology (RIST).  All rights reserved.
 * Copyright (c) 2019      Triad National Security, LLC. All rights
 *                         reserved.
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */

#include "oshmem_config.h"

#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif

#include "opal/mca/backtrace/backtrace.h"
#include "opal/util/error.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_params.h"
#include "opal/util/show_help.h"

#include "oshmem/runtime/params.h"
#include "oshmem/runtime/runtime.h"
#include "oshmem/constants.h"
#include "oshmem/proc/proc.h"

static bool have_been_invoked = false;

int oshmem_shmem_abort(int errcode)
{
    const char *host;
    pid_t pid = 0;

    /* Protection for recursive invocation */
    if (have_been_invoked) {
        return OSHMEM_SUCCESS;
    }
    have_been_invoked = true;

    /* If ORTE is initialized, use its nodename.  Otherwise, call
     opal_gethostname. */

    /* If MPI is initialized, we know we have a runtime nodename, so
       use that.  Otherwise, call opal_gethostname. */
    if (ompi_rte_initialized) {
        host = ompi_process_info.nodename;
    } else {
        host = opal_gethostname();
    }
    pid = getpid();

    opal_show_help("help-shmem-api.txt",
                   "shmem-abort",
                   true,
                   OMPI_PROC_MY_NAME->vpid,
                   pid,
                   host,
                   errcode);

    /* Should we print a stack trace?  Not aggregated because they
     might be different on all processes. */
    if (opal_abort_print_stack) {
        char **messages;
        int len, i;

        if (OPAL_SUCCESS == opal_backtrace_buffer(&messages, &len)) {
            for (i = 0; i < len; ++i) {
                fprintf(stderr,
                        "[%s:%05d] [%d] func:%s\n",
                        host,
                        (int) pid,
                        i,
                        messages[i]);
                fflush(stderr);
            }
            free(messages);
        } else {
            /* This will print an message if it's unable to print the
             backtrace, so we don't need an additional "else" clause
             if opal_backtrace_print() is not supported. */
            opal_backtrace_print(stderr, NULL, 1);
        }
    }

    /* Wait for a while before aborting */
    opal_delay_abort();

    if (!oshmem_shmem_initialized) {
        if (!opal_initialized) {
            /* TODO help message from SHMEM not from MPI is needed*/
            opal_show_help("help-shmem-runtime.txt",
                           "oshmem shmem abort:cannot guarantee all killed",
                           true,
                           host,
                           (int) pid);
        } else {
            fprintf(stderr,
                    "[%s:%05d] Local abort completed successfully; not able to aggregate error messages, and not able to guarantee that all other processes were killed!\n",
                    host,
                    (int) pid);
        }
        oshmem_shmem_aborted = true;
        exit(errcode);
    }

    /* abort local procs in the communicator.  If the communicator is
     an intercommunicator AND the abort has explicitly requested
     that we abort the remote procs, then do that as well. */

    oshmem_shmem_aborted = true;
    /* now that we've aborted everyone else, gracefully die. */

    ompi_rte_abort(errcode, NULL );

    return OSHMEM_SUCCESS;
}