File: crs.h

package info (click to toggle)
openmpi 3.1.3-11
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 118,572 kB
  • sloc: ansic: 628,972; f90: 17,993; makefile: 13,761; sh: 7,051; java: 6,360; perl: 3,215; cpp: 2,225; python: 1,350; lex: 988; fortran: 52; tcl: 12
file content (307 lines) | stat: -rw-r--r-- 9,544 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2007      Evergrid, Inc. All rights reserved.
 * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
 * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
 *                         reserved.
 *
 * $COPYRIGHT$
 *
 * Additional copyrights may follow
 *
 * $HEADER$
 */
/**
 * @file
 *
 * Checkpoint and Restart Service (CRS) Interface
 *
 * General Description:
 *
 * The OPAL Checkpoint and Restart Service (CRS) provides an abstract notion
 * of a single process checkpointer so that upper levels can incorporate
 * checkpoint/restart calls generically into their code. This keeps the upper
 * levels from becoming too tied to a specific checkpoint and restart
 * implementation.
 *
 * This interface will change in the future to allow for some additional
 * specialized functionality such as memory inclusion/exclusion, explicit
 * restarting while running, and others.
 *
 * Words to the Wise:
 *
 * The CRS module must adhere to the API exactly in order to be fully
 * supported. How the module goes about conforming to the API is an internal
 * module issue, and in no case should the module impose restrictions upon the
 * upper layers, as this is an API violation.
 *
 */

#ifndef MCA_CRS_H
#define MCA_CRS_H

#include "opal_config.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/class/opal_object.h"

BEGIN_C_DECLS

/**
 * States of the module.
 *
 * Every enumerator carries an explicit value; the values are contiguous from
 * 0, and OPAL_CRS_STATE_MAX is one greater than the largest state value.
 */
typedef enum opal_crs_state_type_t {
    OPAL_CRS_NONE = 0,
    OPAL_CRS_CHECKPOINT = 1,
    OPAL_CRS_RESTART_PRE = 2,
    /* RESTART_POST */
    OPAL_CRS_RESTART = 3,
    OPAL_CRS_CONTINUE = 4,
    OPAL_CRS_TERM = 5,
    OPAL_CRS_RUNNING = 6,
    OPAL_CRS_ERROR = 7,
    OPAL_CRS_STATE_MAX = 8
} opal_crs_state_type_t;

/**
 * Possible checkpoint options.
 */
typedef struct opal_crs_base_ckpt_options_1_0_0_t {
    /** Parent object; must remain the first member */
    opal_object_t super;

    /** Terminate after checkpoint */
    bool term;

    /** Send SIGSTOP after checkpoint */
    bool stop;

    /** INC Prep Only */
    bool inc_prep_only;

    /** INC Recover Only */
    bool inc_recover_only;

#if OPAL_ENABLE_CRDEBUG == 1
    /** Wait for debugger to attach after checkpoint */
    bool attach_debugger;

    /** Do not wait for debugger to reattach after checkpoint */
    bool detach_debugger;
#endif
} opal_crs_base_ckpt_options_1_0_0_t;

/* Unversioned name for the options type */
typedef opal_crs_base_ckpt_options_1_0_0_t opal_crs_base_ckpt_options_t;

OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_ckpt_options_t);

/**
 * Structure for a single process snapshot.
 * Each component is assumed to have extended this definition
 * in the same way they extend the opal_crs_base_component_t below.
 */
typedef struct opal_crs_base_snapshot_1_0_0_t {
    /** Parent list item; this is an object, so it must have a super */
    opal_list_item_t super;

    /** MCA Component name */
    char *component_name;

    /** Metadata filename */
    char *metadata_filename;

    /** Metadata file stream */
    FILE *metadata;

    /** Absolute path to the snapshot directory */
    char *snapshot_directory;

    /** Cold Start:
     * If we are restarting cold, then we need to recreate this structure.
     * opal_restart would set this, and let the component do the heavy
     * lifting of recreating the structure, since it doesn't know exactly
     * how to.
     */
    bool cold_start;
} opal_crs_base_snapshot_1_0_0_t;

/* Unversioned name for the snapshot type */
typedef opal_crs_base_snapshot_1_0_0_t opal_crs_base_snapshot_t;

OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_crs_base_snapshot_t);

/**
 * Module initialization function.
 * Takes no arguments.
 * Returns OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_init_fn_t)(void);

/**
 * Module finalization function.
 * Takes no arguments.
 * Returns OPAL_SUCCESS
 */
typedef int (*opal_crs_base_module_finalize_fn_t)(void);

/**
 * Call the underlying checkpointer.
 * Returns OPAL_SUCCESS upon success, and OPAL_ERROR otherwise.
 *
 * Arguments:
 *   pid      = PID of the process to checkpoint, or 0 if checkpointing self.
 *   snapshot = Snapshot object (opal_crs_base_snapshot_t) for this checkpoint.
 *   options  = Checkpoint options (opal_crs_base_ckpt_options_t).
 *   state    = The state at which the checkpoint is exiting
 *     - OPAL_CRS_CONTINUE
 *       Continuing after a checkpoint has been taken
 *     - OPAL_CRS_RESTART
 *       Restarting from a checkpoint
 *     - OPAL_CRS_ERROR
 *       Checkpoint was not successful.
 *
 * NOTE(review): this comment previously described an 'fname' string owned by
 * the caller; the signature has no such parameter. Presumably 'snapshot' and
 * 'options' likewise remain owned by the caller -- TODO confirm.
 */
typedef int (*opal_crs_base_module_checkpoint_fn_t)(
    pid_t pid,
    opal_crs_base_snapshot_t *snapshot,
    opal_crs_base_ckpt_options_t *options,
    opal_crs_state_type_t *state);

/**
 * Call the underlying restart command for this process.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
 * NOTE(review): OPAL_CRS_ERROR is declared in this header as a module state
 * value rather than a return code; presumably OPAL_ERROR is meant -- TODO
 * confirm.
 *
 * Arguments:
 *  snapshot     = Snapshot object (opal_crs_base_snapshot_t) to restart from
 *  spawn_child  = true if the restarted process should be forked as a new
 *                      process, in which case 'child_pid' will be returned.
 *                 false if the restarted process should overwrite the current
 *                       process space.
 *  child_pid    = PID of the child that was started, if applicable
 */
typedef int (*opal_crs_base_module_restart_fn_t)(
    opal_crs_base_snapshot_t *snapshot,
    bool spawn_child,
    pid_t *child_pid);

/**
 * Disable the checkpointer.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
 * NOTE(review): OPAL_CRS_ERROR is declared in this header as a module state
 * value rather than a return code; presumably OPAL_ERROR is meant -- TODO
 * confirm.
 *
 * This should set a flag/mutex to disallow checkpoints from occurring.
 * If a checkpoint is requested while checkpoints are disabled, it should
 * block until checkpoints are reenabled.
 * A quality module implementation would notify the user that the
 * checkpoint has been delayed until the program is out of this critical
 * section of code.
 */
typedef int (*opal_crs_base_module_disable_checkpoint_fn_t)(void);

/**
 * Enable the checkpointer.
 * Returns OPAL_SUCCESS or OPAL_CRS_ERROR.
 * NOTE(review): OPAL_CRS_ERROR is declared in this header as a module state
 * value rather than a return code; presumably OPAL_ERROR is meant -- TODO
 * confirm.
 *
 * This should set a flag/mutex to allow checkpoints to occur.
 */
typedef int (*opal_crs_base_module_enable_checkpoint_fn_t)(void);

/**
 * Prepare the CRS component for process launch.
 * Some CRS components need to take action before the
 * process is ever launched to do such things as:
 * - seed the process environment
 * - LD_PRELOAD
 * - Analyze the binary before launch
 *
 * @param rank              Rank of the process to be started
 * @param base_snapshot_dir Base snapshot directory (undocumented in the
 *                          original comment; meaning inferred from the
 *                          name -- TODO confirm)
 * @param app               Pointer to the absolute pathname of argv[0]
 * @param cwd               Pointer to the working directory string
 *                          (undocumented in the original comment; meaning
 *                          inferred from the name -- TODO confirm)
 * @param argv              Pointer to a standard argv-style array, including
 *                          a final NULL pointer
 * @param env               Pointer to a standard environ-style array,
 *                          including a final NULL pointer
 *
 * NOTE(review): 'app', 'cwd', 'argv' and 'env' are passed with one extra
 * level of indirection, presumably so the module can replace them -- TODO
 * confirm against the implementations.
 */
typedef int (*opal_crs_base_module_prelaunch_fn_t)(
    int32_t rank,
    char *base_snapshot_dir,
    char **app,
    char **cwd,
    char ***argv,
    char ***env);

/**
 * Register another thread that may call this library.
 * Some CR systems require each thread that will call into their library
 * to register individually before doing so.
 *
 * Takes no arguments.
 * Returns OPAL_SUCCESS or OPAL_ERROR
 */
typedef int (*opal_crs_base_module_reg_thread_fn_t)(void);

/**
 * Structure for CRS components.
 */
typedef struct opal_crs_base_component_2_0_0_t {
    /** MCA base component */
    mca_base_component_t base_version;

    /** MCA base data */
    mca_base_component_data_t base_data;

    /** Verbosity Level */
    int verbose;

    /** Output Handle for opal_output */
    int output_handle;

    /** Default Priority */
    int priority;
} opal_crs_base_component_2_0_0_t;

/* Unversioned name for the component type */
typedef opal_crs_base_component_2_0_0_t opal_crs_base_component_t;

/**
 * Structure for CRS modules: one function pointer per module entry point.
 */
typedef struct opal_crs_base_module_1_0_0_t {
    /** Initialization Function */
    opal_crs_base_module_init_fn_t crs_init;

    /** Finalization Function */
    opal_crs_base_module_finalize_fn_t crs_finalize;

    /** Checkpoint interface */
    opal_crs_base_module_checkpoint_fn_t crs_checkpoint;

    /** Restart Interface */
    opal_crs_base_module_restart_fn_t crs_restart;

    /** Disable checkpoints */
    opal_crs_base_module_disable_checkpoint_fn_t crs_disable_checkpoint;

    /** Enable checkpoints */
    opal_crs_base_module_enable_checkpoint_fn_t crs_enable_checkpoint;

    /** Pre Launch */
    opal_crs_base_module_prelaunch_fn_t crs_prelaunch;

    /** Per thread registration */
    opal_crs_base_module_reg_thread_fn_t crs_reg_thread;
} opal_crs_base_module_1_0_0_t;

/* Unversioned name for the module type */
typedef opal_crs_base_module_1_0_0_t opal_crs_base_module_t;

/** CRS module instance; declared here, defined in another translation unit */
OPAL_DECLSPEC extern opal_crs_base_module_t opal_crs;

/**
 * Macro for use in components that are of type CRS.
 * Expands to OPAL_MCA_BASE_VERSION_2_1_0 with the framework name "crs" and
 * the numbers 2, 0, 0.
 */
#define OPAL_CRS_BASE_VERSION_2_0_0 OPAL_MCA_BASE_VERSION_2_1_0("crs", 2, 0, 0)

END_C_DECLS

#endif /* MCA_CRS_H */