File: opal_cr.h

package info (click to toggle)
openmpi 1.6.5-9.1%2Bdeb8u1
  • links: PTS, VCS
  • area: main
  • in suites: jessie
  • size: 91,628 kB
  • ctags: 44,305
  • sloc: ansic: 408,966; cpp: 44,454; sh: 27,828; makefile: 10,486; asm: 3,882; python: 1,239; lex: 805; perl: 549; csh: 253; fortran: 232; f90: 126; tcl: 12
file content (338 lines) | stat: -rw-r--r-- 11,971 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
/*
 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

/**
 * @file
 *
 * Checkpoint functionality for Open MPI
 */

#include "opal_config.h"
#include "opal/mca/crs/crs.h"
#include "opal/event/event.h"
#include "opal/util/output.h"
#include "opal/prefetch.h"

#ifndef OPAL_CR_H
#define OPAL_CR_H


BEGIN_C_DECLS

/*
 * Some defines shared with opal-[checkpoint|restart] commands
 *
 * OPAL_CR_DONE, OPAL_CR_ACK and OPAL_CR_CHECKPOINT are one-byte (char)
 * codes with the values 0, 1 and 2.
 * NOTE(review): presumably these are the command bytes exchanged with the
 * opal-checkpoint tool -- confirm against opal_cr.c.
 */
#define OPAL_CR_DONE       ((char) 0)
#define OPAL_CR_ACK        ((char) 1)
#define OPAL_CR_CHECKPOINT ((char) 2)
/* NOTE(review): these look like the base names of the read/write named
 * pipes created under opal_cr_pipe_dir -- confirm. */
#define OPAL_CR_NAMED_PROG_R  ("opal_cr_prog_read")
#define OPAL_CR_NAMED_PROG_W  ("opal_cr_prog_write")
/* NOTE(review): presumably the base name of the environment file consulted
 * on restart -- confirm. */
#define OPAL_CR_BASE_ENV_NAME ("opal_cr_restart-env")

/*
 * Checkpoint/restart command responses and operation states.
 *
 * One enumeration carries three groups of values:
 *   - OPAL_CHECKPOINT_CMD_*: possible responses to a checkpoint request
 *     from opal-checkpoint;
 *   - OPAL_CR_STATUS_NONE .. OPAL_CR_STATUS_TERM: state of the checkpoint
 *     operation;
 *   - OPAL_CR_STATUS_CONTINUE and OPAL_CR_STATUS_RESTART_*: state of the
 *     continue and restart operations.
 *
 * No enumerator has an explicit value, so they are numbered consecutively
 * from 0 in declaration order; inserting or reordering entries renumbers
 * everything after the change.
 */
enum opal_cr_ckpt_cmd_state_t {
    OPAL_CHECKPOINT_CMD_START,       /* Checkpoint is starting on this request */
    OPAL_CHECKPOINT_CMD_IN_PROGRESS, /* Checkpoint is currently running */
    OPAL_CHECKPOINT_CMD_NULL,        /* Checkpoint cannot be started because it is not supported */
    OPAL_CHECKPOINT_CMD_ERROR,       /* An error occurred such that the checkpoint cannot be completed */
    /* State of the checkpoint operation */
    OPAL_CR_STATUS_NONE,       /* No checkpoint in progress */
    OPAL_CR_STATUS_REQUESTED,  /* Checkpoint has been requested */
    OPAL_CR_STATUS_RUNNING,    /* Checkpoint is currently running */
    OPAL_CR_STATUS_TERM,       /* Checkpoint is running and will terminate process upon completion */
    /* State of the continue operation */
    OPAL_CR_STATUS_CONTINUE,
    /* State of the restart operation */
    OPAL_CR_STATUS_RESTART_PRE,
    OPAL_CR_STATUS_RESTART_POST
};
/* Alias so the type can be named without the `enum` keyword. */
typedef enum opal_cr_ckpt_cmd_state_t opal_cr_ckpt_cmd_state_t;

    /* An output handle to be used by the cr runtime
     * functionality as an argument to opal_output() */
    OPAL_DECLSPEC extern int    opal_cr_output;

    /* Directory containing the named pipes for communication
     * with the opal-checkpoint tool  */
    OPAL_DECLSPEC extern char * opal_cr_pipe_dir;

    /* Signal that opal-checkpoint uses to contact the
     * application process (stored as an int signal number) */
    OPAL_DECLSPEC extern int    opal_cr_entry_point_signal;

    /* True if checkpointing is enabled in this application.
     * When OPAL_ENABLE_FT_CR == 1, OPAL_CR_TEST_CHECKPOINT_READY() tests
     * this flag before calling opal_cr_test_if_checkpoint_ready(). */
    OPAL_DECLSPEC extern bool   opal_cr_is_enabled;

    /* True if the application running is a tool
     * (e.g., opal-checkpoint, orted, ...) */
    OPAL_DECLSPEC extern bool   opal_cr_is_tool;

    /* If a checkpoint has been requested.
     * NOTE(review): declared int rather than bool; presumably holds an
     * OPAL_CR_STATUS_* value -- confirm against opal_cr.c. */
    OPAL_DECLSPEC extern int opal_cr_checkpoint_request;

    /* The current state of a checkpoint operation.
     * NOTE(review): presumably an OPAL_CR_STATUS_* value -- confirm. */
    OPAL_DECLSPEC extern int opal_cr_checkpointing_state;

    /**
     * Enable or disable installation of the checkpoint notification
     * callback for this application.
     *
     * If this is an application that doesn't want to have a
     * notification callback installed, pass false.
     * To see the effect, this must be called before opal_cr_init().
     * Default: Enabled
     *
     * @param en  false to opt out of the notification callback, true to
     *            keep the default (enabled) behavior.
     * @return int status code (NOTE(review): presumably OPAL_SUCCESS on
     *         success -- confirm against the definition).
     */
    OPAL_DECLSPEC int opal_cr_set_enabled(bool en);

    /**
     * Initialize the notification and coordination
     *  elements.
     *
     * opal_cr_set_enabled() must be called before this function for its
     * setting to take effect.
     *
     * @return int status code (NOTE(review): presumably OPAL_SUCCESS on
     *         success -- confirm against the definition).
     */
    OPAL_DECLSPEC int opal_cr_init(void);

    /**
     * Finalize the notification and coordination
     *  elements.
     *
     * Counterpart of opal_cr_init().
     *
     * @return int status code (NOTE(review): presumably OPAL_SUCCESS on
     *         success -- confirm against the definition).
     */
    OPAL_DECLSPEC int opal_cr_finalize(void);
    
    /*************************************************
     * Check to see if a checkpoint has been requested
     *
     * When the checkpoint thread is disabled:
     *   This will be checked whenever the MPI Library
     *   is entered by the application. It will stop
     *   the application for the duration of the entire
     *   checkpoint.
     * When the checkpoint thread is enabled:
     *   The request is handled in the thread parallel
     *   with the execution of the program regardless
     *   of where the program is in execution.
     *   The problem with this method is that it
     *   requires the support of progress threads
     *   which is currently not working properly :/
     *
     * The OPAL_CR_TEST_CHECKPOINT_READY() and
     * OPAL_CR_TEST_CHECKPOINT_READY_STALL() macros call
     * this function only when opal_cr_is_enabled is set.
     *
     *************************************************/
    OPAL_DECLSPEC void opal_cr_test_if_checkpoint_ready(void);

    /* If the checkpoint operation should be stalled to
     * wait for another service to complete before
     * continuing with the checkpoint.
     * While opal_cr_stall_check is true,
     * OPAL_CR_TEST_CHECKPOINT_READY_STALL() skips the checkpoint test. */
    OPAL_DECLSPEC extern bool opal_cr_stall_check;
    /* NOTE(review): presumably true while a checkpoint is actually being
     * held in the stalled state -- confirm against opal_cr.c. */
    OPAL_DECLSPEC extern bool opal_cr_currently_stalled;

#if OPAL_ENABLE_FT_THREAD == 1
    /*
     * Library hooks declared only when the checkpoint thread is compiled
     * in (OPAL_ENABLE_FT_THREAD == 1).  When OPAL_ENABLE_FT_CR == 1 as
     * well, each one is what the matching OPAL_CR_INIT_LIBRARY(),
     * OPAL_CR_FINALIZE_LIBRARY(), OPAL_CR_ABORT_LIBRARY(),
     * OPAL_CR_ENTER_LIBRARY(), OPAL_CR_EXIT_LIBRARY() or
     * OPAL_CR_NOOP_PROGRESS() macro expands to.
     */
    OPAL_DECLSPEC void opal_cr_thread_init_library(void);
    OPAL_DECLSPEC void opal_cr_thread_finalize_library(void);
    OPAL_DECLSPEC void opal_cr_thread_abort_library(void);
    OPAL_DECLSPEC void opal_cr_thread_enter_library(void);
    OPAL_DECLSPEC void opal_cr_thread_exit_library(void);
    OPAL_DECLSPEC void opal_cr_thread_noop_progress(void);
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

    /*
     * If not using FT then make the #defines noops
     *
     * Each macro expands to a lone ';', so a call written as
     * `OPAL_CR_ENTER_LIBRARY();` produces two empty statements.
     * NOTE(review): this makes a call unsafe as the unbraced body of an
     * `if` that has an `else`; keep call sites braced.
     * NOTE(review): if OPAL_ENABLE_FT == 0 while OPAL_ENABLE_FT_CR == 1,
     * both this branch and the OPAL_ENABLE_FT_CR == 1 branch are active
     * and every macro is defined twice -- presumably configure rules that
     * combination out; confirm.
     */
#if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0
#define OPAL_CR_TEST_CHECKPOINT_READY() ;
#define OPAL_CR_TEST_CHECKPOINT_READY_STALL() ;
#define OPAL_CR_INIT_LIBRARY() ;
#define OPAL_CR_FINALIZE_LIBRARY() ;
#define OPAL_CR_ABORT_LIBRARY() ;
#define OPAL_CR_ENTER_LIBRARY() ;
#define OPAL_CR_EXIT_LIBRARY() ;
#define OPAL_CR_NOOP_PROGRESS() ;
#endif /* #if OPAL_ENABLE_FT == 0 || OPAL_ENABLE_FT_CR == 0 */

    /*
     * If using FT
     *
     * NOTE(review): the multi-statement macros below expand to a bare
     * brace block rather than do { ... } while (0), so `MACRO();` is a
     * block followed by an empty statement; brace any if/else that
     * wraps a call.
     */
#if OPAL_ENABLE_FT_CR == 1
/* Call opal_cr_test_if_checkpoint_ready() when checkpointing is enabled. */
#define OPAL_CR_TEST_CHECKPOINT_READY()      \
  {                                          \
    if(OPAL_UNLIKELY(opal_cr_is_enabled) ) { \
      opal_cr_test_if_checkpoint_ready();    \
    }                                        \
  }

/* Same test, but skipped while opal_cr_stall_check is set. */
#define OPAL_CR_TEST_CHECKPOINT_READY_STALL()        \
  {                                                  \
    if(OPAL_UNLIKELY(opal_cr_is_enabled && !opal_cr_stall_check)) { \
      opal_cr_test_if_checkpoint_ready();            \
    }                                                \
  }

/* If *not* using FT thread: every library hook is simply the
 * checkpoint-ready test.  Each alias already ends in ';', so a call
 * written with its own ';' yields an extra empty statement. */
#if OPAL_ENABLE_FT_THREAD == 0
#define OPAL_CR_INIT_LIBRARY()     OPAL_CR_TEST_CHECKPOINT_READY();
#define OPAL_CR_FINALIZE_LIBRARY() OPAL_CR_TEST_CHECKPOINT_READY();
#define OPAL_CR_ABORT_LIBRARY()    OPAL_CR_TEST_CHECKPOINT_READY();
#define OPAL_CR_ENTER_LIBRARY()    OPAL_CR_TEST_CHECKPOINT_READY();
#define OPAL_CR_EXIT_LIBRARY()     OPAL_CR_TEST_CHECKPOINT_READY();
#define OPAL_CR_NOOP_PROGRESS()    OPAL_CR_TEST_CHECKPOINT_READY();
#endif /* OPAL_ENABLE_FT_THREAD == 0 */

/* If using FT thread: every library hook forwards unconditionally to
 * the opal_cr_thread_*() function of the same name. */
#if OPAL_ENABLE_FT_THREAD == 1
#define OPAL_CR_INIT_LIBRARY()    \
 {                                \
   opal_cr_thread_init_library(); \
 }
#define OPAL_CR_FINALIZE_LIBRARY()    \
 {                                    \
   opal_cr_thread_finalize_library(); \
 }
#define OPAL_CR_ABORT_LIBRARY()    \
 {                                 \
   opal_cr_thread_abort_library(); \
 }
#define OPAL_CR_ENTER_LIBRARY()    \
 {                                 \
   opal_cr_thread_enter_library(); \
 }
#define OPAL_CR_EXIT_LIBRARY()    \
 {                                \
   opal_cr_thread_exit_library(); \
 }
#define OPAL_CR_NOOP_PROGRESS()    \
 {                                 \
   opal_cr_thread_noop_progress(); \
 }
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

#endif /* OPAL_ENABLE_FT_CR == 1 */

    /*******************************
     * Notification Routines
     *******************************/
    /**
     * Signature of a function that responds to the asynchronous
     * checkpoint request.  This is useful when figuring out who should
     * respond when stalling.
     *
     * The callback receives one opal_cr_ckpt_cmd_state_t value and
     * returns an int.
     * NOTE(review): the meaning of the return value is not visible in
     * this header -- confirm against opal_cr.c.
     */
    typedef int (*opal_cr_notify_callback_fn_t) (opal_cr_ckpt_cmd_state_t);

    /**
     * Register a notification callback.
     *
     * @param new_func   Callback to register.
     * @param prev_func  NOTE(review): presumably an out-parameter that
     *                   receives the previously registered callback so
     *                   the caller can chain to it -- confirm.
     * @return int status code.
     */
    OPAL_DECLSPEC int opal_cr_reg_notify_callback
    (opal_cr_notify_callback_fn_t new_func,
     opal_cr_notify_callback_fn_t *prev_func);

    /**
     * Function to go through the INC
     * - Call Registered INC_Coord(CHECKPOINT)
     * - Call the CRS.checkpoint()
     * - Call Registered INC_Coord(state)
     *
     * @param pid       Process id handed to the checkpoint
     *                  (NOTE(review): presumably the process being
     *                  checkpointed -- confirm).
     * @param snapshot  CRS snapshot object for this checkpoint.
     * @param options   CRS checkpoint options.
     * @param state     NOTE(review): looks like an in/out state value,
     *                  the `state` given to the final INC_Coord call --
     *                  confirm against opal_cr.c.
     * @return int status code.
     */
    OPAL_DECLSPEC int opal_cr_inc_core(pid_t pid, 
                                       opal_crs_base_snapshot_t *snapshot,
                                       opal_crs_base_ckpt_options_t *options,
                                       int *state);
    
    /*
     * NOTE(review): the three functions below appear to be the individual
     * phases of opal_cr_inc_core() (prepare, checkpoint, recover), exposed
     * so a caller can run them separately -- confirm against opal_cr.c.
     * All three return an int status code.
     */
    OPAL_DECLSPEC int opal_cr_inc_core_prep(void);
    /* Takes the same arguments as opal_cr_inc_core(). */
    OPAL_DECLSPEC int opal_cr_inc_core_ckpt(pid_t pid,
                                            opal_crs_base_snapshot_t *snapshot,
                                            opal_crs_base_ckpt_options_t *options,
                                            int *state);
    /* `state` is passed by value here, unlike the int* used above. */
    OPAL_DECLSPEC int opal_cr_inc_core_recover(int state);

    /*******************************
     * Coordination Routines
     *******************************/
    /**
     * Coordination callback routine signature
     *
     * Same shape as opal_cr_coord(): one int in, one int out.
     *
     * @param state  State value handed to the coordination routine
     *               (NOTE(review): presumably one of the checkpoint /
     *               continue / restart state codes -- confirm against
     *               the callers).
     * @return int status code.
     */
    typedef int (*opal_cr_coord_callback_fn_t) (int state);

    /**
     * Register a checkpoint coordination routine
     * for a higher level.
     *
     * @param new_func   Coordination routine to register.
     * @param prev_func  NOTE(review): presumably an out-parameter that
     *                   receives the previously registered routine so the
     *                   new one can call down into it -- confirm.
     * @return int status code.
     */
     OPAL_DECLSPEC int opal_cr_reg_coord_callback
     (opal_cr_coord_callback_fn_t  new_func,
      opal_cr_coord_callback_fn_t *prev_func);

    /**
     * OPAL Checkpoint Coordination Routine
     *
     * Has the opal_cr_coord_callback_fn_t signature (int in, int out).
     *
     * @param state  State value for this coordination step
     *               (NOTE(review): presumably a checkpoint / continue /
     *               restart state code -- confirm).
     * @return int status code.
     */
    OPAL_DECLSPEC int opal_cr_coord(int state);

    /**
     * Checkpoint life-cycle timing
     *
     * These routines are normally reached through the OPAL_CR_SET_TIMER(),
     * OPAL_CR_DISPLAY_ALL_TIMERS() and OPAL_CR_CLEAR_TIMERS() macros,
     * which call them only when opal_cr_timing_enabled is set.
     *
     * NOTE(review): `idx` is presumably one of the OPAL_CR_TIMER_*
     * indices -- confirm against opal_cr.c.
     */
    OPAL_DECLSPEC void opal_cr_set_time(int idx);
    OPAL_DECLSPEC void opal_cr_display_all_timers(void);
    OPAL_DECLSPEC void opal_cr_clear_timers(void);

    /* Gate tested by the OPAL_CR_*TIMER* macros. */
    OPAL_DECLSPEC extern bool opal_cr_timing_enabled;
    /* NOTE(review): presumably enables barriers around the timed
     * sections -- confirm. */
    OPAL_DECLSPEC extern bool opal_cr_timing_barrier_enabled;
    /* NOTE(review): presumably this process's rank and the rank whose
     * timings are reported -- confirm. */
    OPAL_DECLSPEC extern int  opal_cr_timing_my_rank;
    OPAL_DECLSPEC extern int  opal_cr_timing_target_rank;


/*
 * Timer indices.
 *
 * The values run consecutively from 0 to 20; OPAL_CR_TIMER_MAX (21) is one
 * past the last index, i.e. the number of timers.  Keep the list dense and
 * update OPAL_CR_TIMER_MAX when adding an entry.
 * NOTE(review): presumably these are the `idx` values accepted by
 * OPAL_CR_SET_TIMER() / opal_cr_set_time(), and the *BR* entries mark
 * barrier points -- confirm against opal_cr.c.
 */
#define OPAL_CR_TIMER_ENTRY0    0
#define OPAL_CR_TIMER_ENTRY1    1
#define OPAL_CR_TIMER_ENTRY2    2
#define OPAL_CR_TIMER_CRCPBR0   3
#define OPAL_CR_TIMER_CRCP0     4
#define OPAL_CR_TIMER_CRCPBR1   5
#define OPAL_CR_TIMER_P2P0      6
#define OPAL_CR_TIMER_P2P1      7
#define OPAL_CR_TIMER_P2PBR0    8
#define OPAL_CR_TIMER_CORE0     9
#define OPAL_CR_TIMER_CORE1    10
#define OPAL_CR_TIMER_COREBR0  11
#define OPAL_CR_TIMER_P2P2     12
#define OPAL_CR_TIMER_P2PBR1   13
#define OPAL_CR_TIMER_P2P3     14
#define OPAL_CR_TIMER_P2PBR2   15
#define OPAL_CR_TIMER_CRCP1    16
#define OPAL_CR_TIMER_COREBR1  17
#define OPAL_CR_TIMER_CORE2    18
#define OPAL_CR_TIMER_ENTRY3   19
#define OPAL_CR_TIMER_ENTRY4   20
#define OPAL_CR_TIMER_MAX      21


/*
 * Timing convenience macros.
 *
 * Each macro forwards to the matching opal_cr_*() timing routine only
 * when opal_cr_timing_enabled is set; otherwise it does nothing.
 *
 * opal_cr_timing_enabled is a bool, so it is tested directly (as the
 * OPAL_CR_TEST_CHECKPOINT_READY macros do with opal_cr_is_enabled)
 * instead of being compared against 0.
 *
 * The expansion is deliberately kept as a bare brace block so existing
 * call sites keep compiling unchanged.
 */

/* Reset all timers (no-op when timing is disabled). */
#define OPAL_CR_CLEAR_TIMERS()                          \
    {                                                   \
        if (OPAL_UNLIKELY(opal_cr_timing_enabled)) {    \
            opal_cr_clear_timers();                     \
        }                                               \
    }

/* Record the timer selected by idx (no-op when timing is disabled).
 * idx is evaluated at most once. */
#define OPAL_CR_SET_TIMER(idx)                          \
    {                                                   \
        if (OPAL_UNLIKELY(opal_cr_timing_enabled)) {    \
            opal_cr_set_time((idx));                    \
        }                                               \
    }

/* Print all timers (no-op when timing is disabled). */
#define OPAL_CR_DISPLAY_ALL_TIMERS()                    \
    {                                                   \
        if (OPAL_UNLIKELY(opal_cr_timing_enabled)) {    \
            opal_cr_display_all_timers();               \
        }                                               \
    }

END_C_DECLS

#endif /* OPAL_CR_H */