1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
|
/*
* Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/
/**
* @file
*
* Snapshot Coordination (SNAPC) Interface
*
* Terminology:
* ------------
* Global Snapshot Coordinator:
* - HNP(s) coordination function.
* Local Snapshot Coordinator
* - VHNP(s) [e.g., orted] coordination function
* Application Snapshot Coordinator
* - Application level coordination function
* Local Snapshot
* - Snapshot generated by a single process in the parallel job
* Local Snapshot Reference
* - A generic reference to the physical Local Snapshot
* Global Snapshot
* - Snapshot generated for the entire parallel job
* Global Snapshot Reference
* - A generic reference to the physical Global Snapshot
*
* General Description:
* ---------------------
* This framework is tasked with:
* - Initiating the checkpoint in the system
* - Physically moving the local snapshot files to a location
* Initially, this location is the node on which the Head Node Process (HNP)
* is running, but later this will be a replicated checkpoint server or
* the like.
* - Generating a 'global snapshot handle' that the user can use to restart
* the parallel job.
*
* Each component will have 3 tiers of behavior that must behave in concert:
* - Global Snapshot Coordinator
* These are the HNP's tasks: mostly distributing the notification of the
* checkpoint, and then compiling the physical and virtual nature of the
* global snapshot handle.
* - Local Snapshot Coordinator
* These are the VHNP's (or orted's, if available) tasks. This will involve
* working with the Global Snapshot Coordinator to route the physical
* and virtual 'local snapshot's from the application to the desired
* location. This process must also notify the Global Snapshot Coordinator
* when its set of processes has completed the checkpoint.
* - Application Snapshot Coordinator
* This is the application level coordinator. This is very light, just
* a subscription to be triggered when it needs to checkpoint, and then,
* once finished with the checkpoint, notify the Local Snapshot Coordinator
* that it is complete.
* If there is no orted (so no bootproxy), then the application assumes the
* responsibility of the Local Snapshot Coordinator as well.
*
*/
#ifndef MCA_SNAPC_H
#define MCA_SNAPC_H
#include "orte_config.h"
#include "orte/constants.h"
#include "orte/types.h"
#include "opal/mca/mca.h"
#include "opal/mca/base/base.h"
#include "opal/mca/crs/crs.h"
#include "opal/mca/crs/base/base.h"
#include "opal/class/opal_object.h"
#include "opal/util/output.h"
BEGIN_C_DECLS
/**
 * States that a process can be in while checkpointing.
 *
 * The values are consecutive integers starting at 0, and
 * ORTE_SNAPC_CKPT_MAX is one past the highest state value.
 */
#define ORTE_SNAPC_CKPT_STATE_ERROR           0 /**< Reached an error */
#define ORTE_SNAPC_CKPT_STATE_NONE            1 /**< Quiet state: doing no checkpoint */
#define ORTE_SNAPC_CKPT_STATE_REQUEST         2 /**< One of the applications requested a checkpoint */
#define ORTE_SNAPC_CKPT_STATE_PENDING         3 /**< A checkpoint is pending for this process */
#define ORTE_SNAPC_CKPT_STATE_RUNNING         4 /**< The checkpoint is running */
#define ORTE_SNAPC_CKPT_STATE_STOPPED         5 /**< All processes have been stopped */
#define ORTE_SNAPC_CKPT_STATE_FINISHED_LOCAL  6 /**< The checkpoint finished locally */
#define ORTE_SNAPC_CKPT_STATE_FILE_XFER       7 /**< File transfer in progress */
#define ORTE_SNAPC_CKPT_STATE_FINISHED        8 /**< The checkpoint finished */
#define ORTE_SNAPC_CKPT_STATE_NO_CKPT         9 /**< Unable to checkpoint this job */
#define ORTE_SNAPC_CKPT_MAX                  10 /**< One past the highest state value */
/**
 * Sufficiently high shift value to avoid colliding the process
 * checkpointing states above with the ORTE process states.
 * (131072 == 2^17)
 */
#define ORTE_SNAPC_CKPT_SHIFT 131072
/**
 * Uniquely encode the SNAPC state by adding ORTE_SNAPC_CKPT_SHIFT to it.
 *
 * The argument is parenthesized so that any expression (e.g. a
 * conditional expression) is encoded as a whole.
 */
#define ORTE_SNAPC_CKPT_NOTIFY(state) (ORTE_SNAPC_CKPT_SHIFT + (state))
/**
 * Decode the SNAPC state: inverse of ORTE_SNAPC_CKPT_NOTIFY().
 */
#define ORTE_SNAPC_CKPT_STATE(state) ((state) - ORTE_SNAPC_CKPT_SHIFT)
/**
 * Check whether a state is a SNAPC state or not, i.e. whether the value
 * is at least ORTE_SNAPC_CKPT_SHIFT. Evaluates to 1 or 0.
 */
#define CHECK_ORTE_SNAPC_CKPT_STATE(state) ((state) >= ORTE_SNAPC_CKPT_SHIFT)
/**
 * An ORTE local snapshot.
 *
 * Similar to the opal_crs_base_snapshot_t, except that it also
 * carries process contact information (the ORTE process name).
 * The leading opal_list_item_t member lets instances be placed on
 * an opal_list_t.
 */
typedef struct orte_snapc_base_local_snapshot_1_0_0_t {
    /** List super object */
    opal_list_item_t super;

    /** ORTE Process name */
    orte_process_name_t process_name;

    /** State of the checkpoint */
    int state;

    /** Unique name of the local snapshot */
    char *reference_name;

    /** Local location of the local snapshot (absolute path) */
    char *local_location;

    /** Remote location of the local snapshot (absolute path) */
    char *remote_location;

    /** CRS agent */
    char *opal_crs;
} orte_snapc_base_local_snapshot_1_0_0_t;

/** Unversioned alias for the local snapshot type */
typedef orte_snapc_base_local_snapshot_1_0_0_t orte_snapc_base_local_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_local_snapshot_t);
/**
 * Definition of the global snapshot.
 *
 * Each component is assumed to have extended this definition
 * in the same way they extend the orte_snapc_base_component_t below.
 */
typedef struct orte_snapc_base_global_snapshot_1_0_0_t {
    /** This is an object, so must have super */
    opal_list_item_t super;

    /**
     * List of the local snapshots
     * (presumably orte_snapc_base_local_snapshot_t items -- confirm)
     */
    opal_list_t local_snapshots;

    /** Unique name of the global snapshot */
    char *reference_name;

    /** Location of the global snapshot (absolute path) */
    char *local_location;

    /** Sequence Number */
    int seq_num;

    /** Start Timestamp */
    char *start_time;

    /** End Timestamp */
    char *end_time;
} orte_snapc_base_global_snapshot_1_0_0_t;

/** Unversioned alias for the global snapshot type */
typedef orte_snapc_base_global_snapshot_1_0_0_t orte_snapc_base_global_snapshot_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_global_snapshot_t);
/**
 * Quiesce object.
 *
 * This is the datum handed to the start_ckpt / end_ckpt module
 * functions (see orte_snapc_base_start_checkpoint_fn_t and
 * orte_snapc_base_end_checkpoint_fn_t).
 */
typedef struct orte_snapc_base_quiesce_1_0_0_t {
    /** Parent is an object type */
    opal_object_t super;

    /** Current epoch */
    int epoch;

    /** Requested CRS */
    char *crs_name;

    /** Handle for reference */
    char *handle;

    /** Snapshot list */
    orte_snapc_base_global_snapshot_t *snapshot;

    /** Target Directory */
    char *target_dir;

    /** Command Line */
    char *cmdline;

    /** State of operation if checkpointing */
    opal_crs_state_type_t cr_state;

    /** Checkpointing? */
    bool checkpointing;

    /** Restarting? */
    bool restarting;
} orte_snapc_base_quiesce_1_0_0_t;

/** Unversioned alias for the quiesce type */
typedef orte_snapc_base_quiesce_1_0_0_t orte_snapc_base_quiesce_t;

ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_snapc_base_quiesce_t);
/**
 * Module initialization function.
 *
 * @param seed NOTE(review): presumably true when initializing in the HNP
 *             (global coordinator) -- confirm against the components
 * @param app  NOTE(review): presumably true when initializing inside an
 *             application process -- confirm against the components
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_init_fn_t)(bool seed, bool app);
/**
 * Module finalization function.
 *
 * Takes no arguments.
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_module_finalize_fn_t)(void);
/**
 * Set up the necessary structures for this job.
 *
 * @param jobid Identifier of the job to set up
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_setup_job_fn_t)(orte_jobid_t jobid);
/**
 * Release this job.
 *
 * NOTE(review): presumably the counterpart of
 * orte_snapc_base_setup_job_fn_t, releasing the structures that were
 * set up for the job -- confirm against the components.
 *
 * @param jobid Identifier of the job to release
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_release_job_fn_t)(orte_jobid_t jobid);
/**
 * Handle fault tolerance updates.
 *
 * @param[in] state Fault tolerance state update
 *
 * @retval ORTE_SUCCESS The operation completed successfully
 * @retval ORTE_ERROR   An unspecified error occurred
 */
typedef int (*orte_snapc_base_ft_event_fn_t)(int state);
/**
 * Start a checkpoint originating from an internal source.
 *
 * This really only makes sense to call from an application, but in the
 * future we may allow the checkpoint operation to use this function from
 * the local coordinator.
 *
 * @param datum Quiesce object for this checkpoint operation. It carries
 *              the epoch number to associate with the operation (its
 *              'epoch' member). NOTE(review): earlier documentation
 *              described the epoch as an output of this call -- confirm
 *              against the components.
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_start_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum);
/**
 * Signal end of checkpoint epoch originating from an internal source.
 *
 * @param datum Quiesce object for this checkpoint operation. It carries
 *              the epoch number associated with the operation (its
 *              'epoch' member). NOTE(review): earlier documentation
 *              described the epoch as an input to this call -- confirm
 *              against the components.
 *
 * @return ORTE_SUCCESS
 */
typedef int (*orte_snapc_base_end_checkpoint_fn_t)(orte_snapc_base_quiesce_t *datum);
/**
 * Structure for SNAPC components.
 *
 * Starts with the common MCA component header and data, followed by
 * the SNAPC-specific settings.
 */
typedef struct orte_snapc_base_component_2_0_0_t {
    /** MCA base component */
    mca_base_component_t base_version;

    /** MCA base data */
    mca_base_component_data_t base_data;

    /** Verbosity Level */
    int verbose;

    /** Output Handle for opal_output */
    int output_handle;

    /** Default Priority */
    int priority;
} orte_snapc_base_component_2_0_0_t;

/** Unversioned alias for the SNAPC component type */
typedef orte_snapc_base_component_2_0_0_t orte_snapc_base_component_t;
/**
 * Structure for SNAPC modules: the table of function pointers
 * that make up a module's interface.
 */
typedef struct orte_snapc_base_module_1_0_0_t {
    /** Initialization Function */
    orte_snapc_base_module_init_fn_t snapc_init;

    /** Finalization Function */
    orte_snapc_base_module_finalize_fn_t snapc_finalize;

    /** Setup structures for a job */
    orte_snapc_base_setup_job_fn_t setup_job;

    /** Release job */
    orte_snapc_base_release_job_fn_t release_job;

    /** Handle any FT Notifications */
    orte_snapc_base_ft_event_fn_t ft_event;

    /** Handle internal request for checkpoint */
    orte_snapc_base_start_checkpoint_fn_t start_ckpt;

    /** Handle internal signal of the end of a checkpoint epoch */
    orte_snapc_base_end_checkpoint_fn_t end_ckpt;
} orte_snapc_base_module_1_0_0_t;

/** Unversioned alias for the SNAPC module type */
typedef orte_snapc_base_module_1_0_0_t orte_snapc_base_module_t;
/**
 * The SNAPC module function table. Only declared here; the definition
 * lives elsewhere.
 * NOTE(review): presumably holds the functions of the selected
 * component's module -- confirm against the base selection code.
 */
ORTE_DECLSPEC extern orte_snapc_base_module_t orte_snapc;
/**
 * SNAPC component object. Only declared here; the definition lives
 * elsewhere.
 * NOTE(review): by its name, the component chosen during selection --
 * confirm against the base selection code.
 */
ORTE_DECLSPEC extern orte_snapc_base_component_t orte_snapc_base_selected_component;
/**
 * Version macro for use in components that are of type SNAPC.
 *
 * Expands to MCA_BASE_VERSION_2_0_0 followed by the framework type
 * name "snapc" and the version triple 2, 0, 0.
 */
#define ORTE_SNAPC_BASE_VERSION_2_0_0 \
    MCA_BASE_VERSION_2_0_0,           \
    "snapc", 2, 0, 0
END_C_DECLS
#endif /* MCA_SNAPC_H */
|