File: snapc.h

package info (click to toggle)
openmpi 1.6.5-9.1%2Bdeb8u1
  • links: PTS, VCS
  • area: main
  • in suites: jessie
  • size: 91,628 kB
  • ctags: 44,305
  • sloc: ansic: 408,966; cpp: 44,454; sh: 27,828; makefile: 10,486; asm: 3,882; python: 1,239; lex: 805; perl: 549; csh: 253; fortran: 232; f90: 126; tcl: 12
file content (338 lines) | stat: -rw-r--r-- 10,860 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/*
 * Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file
 *
 * Snapshot Coordination (SNAPC) Interface
 *
 * Terminology:
 * ------------
 *  Global Snapshot Coordinator:
 *     - HNP(s) coordination function.
 *  Local Snapshot Coordinator
 *     - VHNP(s) [e.g., orted] coordination function
 *  Application Snapshot Coordinator
 *     - Application level coordination function
 *  Local Snapshot
 *     - Snapshot generated by a single process in the parallel job
 *  Local Snapshot Reference
 *     - A generic reference to the physical Local Snapshot 
 *  Global Snapshot
 *     - Snapshot generated for the entire parallel job
 *  Global Snapshot Reference
 *     - A generic reference to the physical Global Snapshot 
 *
 * General Description:
 * ---------------------
 * This framework is tasked with:
 * - Initiating the checkpoint in the system
 * - Physically moving the local snapshot files to a location.
 *   Initially this location is the node on which the Head Node Process (HNP)
 *   is running, but later this will be a replicated checkpoint server or
 *   the like.
 * - Generating a 'global snapshot handle' that the user can use to restart
 *   the parallel job.
 *
 * Each component will have 3 tiers of behavior that must work in concert:
 *  - Global Snapshot Coordinator
 *    This is the HNP's task: mostly distributing the notification of the
 *    checkpoint, and then compiling the physical and virtual nature of the
 *    global snapshot handle.
 *  - Local Snapshot Coordinator
 *    This is the VHNP's (or orted's, if available) task. This will involve
 *    working with the Global Snapshot Coordinator to route the physical
 *    and virtual 'local snapshot's from the application to the desired
 *    location. This process must also notify the Global Snapshot Coordinator
 *    when its set of processes has completed the checkpoint.
 *  - Application Snapshot Coordinator
 *    This is the application level coordinator. This is very light, just
 *    a subscription to be triggered when it needs to checkpoint, and then,
 *    once finished with the checkpoint, notify the Local Snapshot Coordinator
 *    that it is complete.
 *    If there is no orted (so no bootproxy), then the application assumes the
 *    responsibility of the Local Snapshot Coordinator as well.
 *
 */

#ifndef MCA_SNAPC_H
#define MCA_SNAPC_H

#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"

#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"

#include "opal/class/opal_object.h"
#include "opal/util/output.h"

BEGIN_C_DECLS

/**
 * Checkpoint states that a process can be in while checkpointing.
 *
 * The values are consecutive integers starting at zero;
 * ORTE_SNAPC_CKPT_MAX is one greater than the last state value.
 */
#define ORTE_SNAPC_CKPT_STATE_ERROR 0          /* Reached an error */
#define ORTE_SNAPC_CKPT_STATE_NONE 1           /* Quiet state: no checkpoint in progress */
#define ORTE_SNAPC_CKPT_STATE_REQUEST 2        /* An application has requested a checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING 3        /* A checkpoint is pending for this process */
#define ORTE_SNAPC_CKPT_STATE_RUNNING 4        /* The checkpoint is running */
#define ORTE_SNAPC_CKPT_STATE_STOPPED 5        /* All processes have been stopped */
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL 6 /* The checkpoint finished locally */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER 7      /* File transfer in progress */
#define ORTE_SNAPC_CKPT_STATE_FINISHED 8       /* The checkpoint finished */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT 9        /* This job cannot be checkpointed */
#define ORTE_SNAPC_CKPT_MAX 10                 /* One past the last state value */

/**
 * Sufficiently high shift value to avoid colliding the process
 * checkpointing states with the ORTE process states.
 */
#define ORTE_SNAPC_CKPT_SHIFT                131072

/**
 * Uniquely encode a SNAPC checkpoint state by adding the shift value.
 *
 * The argument is parenthesized so that a compound expression (for
 * example a conditional or bitwise expression) is encoded as a whole
 * instead of being split by operator precedence.
 */
#define ORTE_SNAPC_CKPT_NOTIFY(state) (ORTE_SNAPC_CKPT_SHIFT + (state))

/**
 * Decode a value produced by ORTE_SNAPC_CKPT_NOTIFY back into the
 * SNAPC checkpoint state by subtracting the shift value.
 */
#define ORTE_SNAPC_CKPT_STATE(state) ((state) - ORTE_SNAPC_CKPT_SHIFT)

/**
 * Check whether a value is an encoded SNAPC state: non-zero when the
 * value is greater than or equal to the shift value, zero otherwise.
 */
#define CHECK_ORTE_SNAPC_CKPT_STATE(state) ((state) >= ORTE_SNAPC_CKPT_SHIFT)

/**
 * ORTE local snapshot.
 *
 * Comparable to opal_crs_base_snapshot_t, but additionally carries the
 * contact information (process name) of the process it belongs to.
 */
struct orte_snapc_base_local_snapshot_1_0_0_t {
    opal_list_item_t super;            /**< List item parent object */
    orte_process_name_t process_name;  /**< ORTE process name */
    int state;                         /**< State of the checkpoint */
    char *reference_name;              /**< Unique name of the local snapshot */
    char *local_location;              /**< Local location of the snapshot (absolute path) */
    char *remote_location;             /**< Remote location of the snapshot (absolute path) */
    char *opal_crs;                    /**< CRS agent */
};
typedef struct orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_1_0_0_t;
/* Unversioned alias for the current local snapshot definition. */
typedef orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);

/**
 * Global snapshot definition.
 *
 * Each component is assumed to have extended this definition in the
 * same way it extends orte_snapc_base_component_t.
 */
struct orte_snapc_base_global_snapshot_1_0_0_t {
    opal_list_item_t super;       /**< Parent object (list item) */
    /**
     * List of the local snapshots making up this global snapshot
     * (presumably orte_snapc_base_local_snapshot_t items -- confirm)
     */
    opal_list_t local_snapshots;
    char *reference_name;         /**< Unique name of the global snapshot */
    char *local_location;         /**< Location of the global snapshot (absolute path) */
    int seq_num;                  /**< Sequence number */
    char *start_time;             /**< Start timestamp */
    char *end_time;               /**< End timestamp */
};
typedef struct orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_1_0_0_t;
/* Unversioned alias for the current global snapshot definition. */
typedef orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_global_snapshot_t);

/**
 * Quiesce datum: the object type handed to the start/end checkpoint
 * entry points of a SNAPC module.
 */
struct orte_snapc_base_quiesce_1_0_0_t {
    opal_object_t super;                          /**< Parent object */

    int epoch;                                    /**< Current epoch */
    char *crs_name;                               /**< Requested CRS */
    char *handle;                                 /**< Handle for reference */
    orte_snapc_base_global_snapshot_t *snapshot;  /**< Snapshot list */

    char *target_dir;                             /**< Target directory */
    char *cmdline;                                /**< Command line */
    opal_crs_state_type_t cr_state;               /**< State of the operation, if checkpointing */
    bool checkpointing;                           /**< Is this a checkpoint operation? */
    bool restarting;                              /**< Is this a restart operation? */
};
typedef struct orte_snapc_base_quiesce_1_0_0_t orte_snapc_base_quiesce_1_0_0_t;
/* Unversioned alias for the current quiesce definition. */
typedef orte_snapc_base_quiesce_1_0_0_t orte_snapc_base_quiesce_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_quiesce_t);

/**
 * Signature of a SNAPC module's initialization function.
 *
 * Receives the @c seed and @c app flags.
 * Returns ORTE_SUCCESS.
 */
typedef int (*orte_snapc_base_module_init_fn_t)(bool seed, bool app);

/**
 * Signature of a SNAPC module's finalization function.
 *
 * Takes no arguments.
 * Returns ORTE_SUCCESS.
 */
typedef int (*orte_snapc_base_module_finalize_fn_t)(void);

/**
 * Signature of the function that sets up the necessary structures
 * for the job identified by @c jobid.
 *
 * Returns ORTE_SUCCESS.
 */
typedef int (*orte_snapc_base_setup_job_fn_t)(orte_jobid_t jobid);

/**
 * Signature of the function that releases the job identified by
 * @c jobid (the counterpart of the setup_job entry point).
 *
 * Returns ORTE_SUCCESS.
 */
typedef int (*orte_snapc_base_release_job_fn_t)(orte_jobid_t jobid);


/**
 * Signature of the fault tolerance update handler.
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_snapc_base_ft_event_fn_t)(int state);

/**
 * Signature of the function that starts a checkpoint originating from
 * an internal source.
 *
 * Calling this really only makes sense from an application, although in
 * the future the checkpoint operation may be allowed to use it from the
 * local coordinator.
 *
 * @param datum Quiesce object for this checkpoint operation. It carries
 *              the epoch number associated with the operation; earlier
 *              documentation describes that epoch as an output of this
 *              call.
 * Returns ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_start_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum);

/**
 * Signature of the function that signals the end of a checkpoint epoch
 * originating from an internal source.
 *
 * @param datum Quiesce object for this checkpoint operation. It carries
 *              the epoch number associated with the operation; earlier
 *              documentation describes that epoch as an input to this
 *              call.
 * Returns ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_end_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum);

/**
 * SNAPC component structure: the MCA base component information
 * followed by the settings common to SNAPC components.
 */
struct orte_snapc_base_component_2_0_0_t {
    mca_base_component_t base_version;    /**< MCA base component */
    mca_base_component_data_t base_data;  /**< MCA base data */

    int verbose;                          /**< Verbosity level */
    int output_handle;                    /**< Output handle for opal_output */
    int priority;                         /**< Default priority */
};
typedef struct orte_snapc_base_component_2_0_0_t orte_snapc_base_component_2_0_0_t;
/* Unversioned alias for the current component definition. */
typedef orte_snapc_base_component_2_0_0_t orte_snapc_base_component_t;

/**
 * SNAPC module structure: the table of entry points a SNAPC module
 * provides.
 */
struct orte_snapc_base_module_1_0_0_t {
    orte_snapc_base_module_init_fn_t snapc_init;          /**< Initialization function */
    orte_snapc_base_module_finalize_fn_t snapc_finalize;  /**< Finalization function */
    orte_snapc_base_setup_job_fn_t setup_job;             /**< Set up structures for a job */
    orte_snapc_base_release_job_fn_t release_job;         /**< Release a job */
    orte_snapc_base_ft_event_fn_t ft_event;               /**< Handle any FT notifications */
    orte_snapc_base_start_checkpoint_fn_t start_ckpt;     /**< Handle an internal request for a checkpoint */
    orte_snapc_base_end_checkpoint_fn_t end_ckpt;         /**< Signal the end of an internally requested checkpoint epoch */
};
typedef struct orte_snapc_base_module_1_0_0_t orte_snapc_base_module_1_0_0_t;
/* Unversioned alias for the current module definition. */
typedef orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;

/* SNAPC module object (table of SNAPC entry points); only declared here, defined elsewhere. */
ORTE_DECLSPEC extern orte_snapc_base_module_t orte_snapc;
/* SNAPC component object; only declared here. Presumably the component chosen by
 * the framework's selection logic -- confirm against the base code. */
ORTE_DECLSPEC extern orte_snapc_base_component_t orte_snapc_base_selected_component;

/**
 * Version fields for use in components of type SNAPC: expands to the
 * MCA base version followed by the framework name "snapc" and the
 * version numbers 2, 0, 0 as a comma-separated list.
 */
#define ORTE_SNAPC_BASE_VERSION_2_0_0 MCA_BASE_VERSION_2_0_0, "snapc", 2, 0, 0

END_C_DECLS

#endif /* MCA_SNAPC_H */