File: ad_gpfs_hints.c

package info (click to toggle)
openmpi 4.1.4-3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 127,592 kB
  • sloc: ansic: 690,998; makefile: 43,047; f90: 19,220; sh: 7,182; java: 6,360; perl: 3,590; cpp: 2,227; python: 1,350; lex: 989; fortran: 61; tcl: 12
file content (288 lines) | stat: -rw-r--r-- 10,577 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* ---------------------------------------------------------------- */
/* (C)Copyright IBM Corp.  2007, 2008                               */
/* ---------------------------------------------------------------- */
/**
 * \file ad_gpfs_hints.c
 * \brief GPFS hint processing - for now, only used for BlueGene and PE platforms
 */

/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *   Copyright (C) 1997 University of Chicago.
 *   See COPYRIGHT notice in top-level directory.
 */

#include "adio.h"
#include "adio_extern.h"
#include "hint_fns.h"

#include "ad_gpfs.h"

#define   ADIOI_GPFS_CB_BUFFER_SIZE_DFLT      	"16777216"
#define	  ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT	"4194304"
#define   ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT	"4194304"

#ifdef BGQPLATFORM
#define   ADIOI_BG_NAGG_IN_PSET_HINT_NAME	"bg_nodes_pset"
#endif

/** \page mpiio_vars MPIIO Configuration
 *
 * GPFS MPIIO configuration and performance tuning. Used by ad_gpfs ADIO.
 *
 * Used for BlueGene and PE platforms, which each have their own aggregator selection
 * algorithms that ignore user provided cb_config_list.
 *
 * \section hint_sec Hints
 * - bg_nodes_pset - BlueGene only - specify how many aggregators to use per pset.
 *   This hint will override the cb_nodes hint based on BlueGene psets.
 *   - N - Use N nodes per pset as aggregators.
 *   - Default is based on partition configuration and cb_nodes.
 *
 *   The following default key/value pairs may differ from other platform defaults.
 *
 *     - key = cb_buffer_size     value = 16777216
 *     - key = romio_cb_read      value = enable
 *     - key = romio_cb_write     value = enable
 *     - key = ind_rd_buffer_size value = 4194304
 *     - key = ind_wr_buffer_size value = 4194304
 */

#ifdef BGQPLATFORM
/* Compute the aggregator-related parameters that are required in 2-phase collective IO of ADIO. */
extern int
ADIOI_BG_gen_agg_ranklist(ADIO_File fd, int n_proxy_per_pset);
#elif PEPLATFORM
extern int
ADIOI_PE_gen_agg_ranklist(ADIO_File fd);
#endif

void ADIOI_GPFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code)
{
/* if fd->info is null, create a new info object.
   Initialize fd->info to default values.
   Initialize fd->hints to default values.
   Examine the info object passed by the user. If it contains values that
   ROMIO understands, override the default. */

    MPI_Info info;
    char *value;
    int flag, intval, nprocs=0, nprocs_is_valid = 0;
    static char myname[] = "ADIOI_GPFS_SETINFO";

    int did_anything = 0;

    if (fd->info == MPI_INFO_NULL) MPI_Info_create(&(fd->info));
    info = fd->info;

    /* Note that fd->hints is allocated at file open time; thus it is
     * not necessary to allocate it, or check for allocation, here.
     */

    value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
    ADIOI_Assert ((value != NULL));

    /* initialize info and hints to default values if they haven't been
     * previously initialized
     */
    if (!fd->hints->initialized) {

	ad_gpfs_get_env_vars();
	did_anything = 1;

	/* buffer size for collective I/O */
	ADIOI_Info_set(info, "cb_buffer_size", ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);
	fd->hints->cb_buffer_size = atoi(ADIOI_GPFS_CB_BUFFER_SIZE_DFLT);

	/* default is to let romio automatically decide when to use
	 * collective buffering
	 */
	ADIOI_Info_set(info, "romio_cb_read", "enable");
	fd->hints->cb_read = ADIOI_HINT_ENABLE;
	ADIOI_Info_set(info, "romio_cb_write", "enable");
	fd->hints->cb_write = ADIOI_HINT_ENABLE;

   	if ( fd->hints->cb_config_list != NULL ) ADIOI_Free (fd->hints->cb_config_list);
	fd->hints->cb_config_list = NULL;

	/* number of processes that perform I/O in collective I/O */
	MPI_Comm_size(fd->comm, &nprocs);
	nprocs_is_valid = 1;
	ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", nprocs);
	ADIOI_Info_set(info, "cb_nodes", value);
	fd->hints->cb_nodes = -1;

	/* hint indicating that no indep. I/O will be performed on this file */
	ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	fd->hints->no_indep_rw = 0;

	/* gpfs is not implementing file realms (ADIOI_IOStridedColl),
	   initialize to disabled it. 	   */
	/* hint instructing the use of persistent file realms */
	ADIOI_Info_set(info, "romio_cb_pfr", "disable");
	fd->hints->cb_pfr = ADIOI_HINT_DISABLE;

	/* hint guiding the assignment of persistent file realms */
	ADIOI_Info_set(info, "romio_cb_fr_types", "aar");
	fd->hints->cb_fr_type = ADIOI_FR_AAR;

	/* hint to align file realms with a certain byte value */
	ADIOI_Info_set(info, "romio_cb_fr_alignment", "1");
	fd->hints->cb_fr_alignment = 1;

	/* hint to set a threshold percentage for a datatype's size/extent at
	 * which data sieving should be done in collective I/O */
	ADIOI_Info_set(info, "romio_cb_ds_threshold", "0");
	fd->hints->cb_ds_threshold = 0;

	/* hint to switch between point-to-point or all-to-all for two-phase */
	ADIOI_Info_set(info, "romio_cb_alltoall", "automatic");
	fd->hints->cb_alltoall = ADIOI_HINT_AUTO;

	 /* deferred_open derived from no_indep_rw and cb_{read,write} */
	fd->hints->deferred_open = 0;

	/* buffer size for data sieving in independent reads */
	ADIOI_Info_set(info, "ind_rd_buffer_size", ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);
	fd->hints->ind_rd_buffer_size = atoi(ADIOI_GPFS_IND_RD_BUFFER_SIZE_DFLT);

	/* buffer size for data sieving in independent writes */
	ADIOI_Info_set(info, "ind_wr_buffer_size", ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);
	fd->hints->ind_wr_buffer_size = atoi(ADIOI_GPFS_IND_WR_BUFFER_SIZE_DFLT);


    ADIOI_Info_set(info, "romio_ds_read", "automatic");
    fd->hints->ds_read = ADIOI_HINT_AUTO;
    ADIOI_Info_set(info, "romio_ds_write", "automatic");
    fd->hints->ds_write = ADIOI_HINT_AUTO;

    /* still to do: tune this a bit for a variety of file systems. there's
	 * no good default value so just leave it unset */
    fd->hints->min_fdomain_size = 0;
    fd->hints->striping_unit = 0;

    fd->hints->initialized = 1;
    }

    /* add in user's info if supplied */
    if (users_info != MPI_INFO_NULL) {
	ADIOI_Info_check_and_install_int(fd, users_info, "cb_buffer_size",
		&(fd->hints->cb_buffer_size), myname, error_code);
	/* new hints for enabling/disabling coll. buffering on
	 * reads/writes
	 */
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_read",
		&(fd->hints->cb_read), myname, error_code);
	if (fd->hints->cb_read == ADIOI_HINT_DISABLE) {
	    /* romio_cb_read overrides no_indep_rw */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
	}
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_cb_write",
		&(fd->hints->cb_write), myname, error_code);
	if (fd->hints->cb_write == ADIOI_HINT_DISABLE) {
	    /* romio_cb_write overrides no_indep_rw */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = ADIOI_HINT_DISABLE;
	}
	/* Has the user indicated all I/O will be done collectively? */
	ADIOI_Info_check_and_install_true(fd, users_info, "romio_no_indep_rw",
		&(fd->hints->no_indep_rw), myname, error_code);
	if (fd->hints->no_indep_rw == 1) {
	    /* if 'no_indep_rw' set, also hint that we will do
	     * collective buffering: if we aren't doing independent io,
	     * then we have to do collective  */
	    ADIOI_Info_set(info, "romio_cb_write", "enable");
	    ADIOI_Info_set(info, "romio_cb_read", "enable");
	    fd->hints->cb_read = 1;
	    fd->hints->cb_write = 1;
	}

	/* new hints for enabling/disabling data sieving on
	 * reads/writes
	 */
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_read",
		&(fd->hints->ds_read), myname, error_code);
	ADIOI_Info_check_and_install_enabled(fd, users_info, "romio_ds_write",
		&(fd->hints->ds_write), myname, error_code);

	ADIOI_Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size",
		&(fd->hints->ind_wr_buffer_size), myname, error_code);
	ADIOI_Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size",
		&(fd->hints->ind_rd_buffer_size), myname, error_code);

	memset( value, 0, MPI_MAX_INFO_VAL+1 );
	ADIOI_Info_get(users_info, "romio_min_fdomain_size", MPI_MAX_INFO_VAL,
			value, &flag);
	if ( flag && ((intval = atoi(value)) > 0) ) {
		ADIOI_Info_set(info, "romio_min_fdomain_size", value);
		fd->hints->min_fdomain_size = intval;
	}
  /* Now we use striping unit in common code so we should
     process hints for it. */
	ADIOI_Info_check_and_install_int(fd, users_info, "striping_unit",
		&(fd->hints->striping_unit), myname, error_code);

#ifdef BGQPLATFORM
	memset( value, 0, MPI_MAX_INFO_VAL+1 );
        ADIOI_Info_get(users_info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, MPI_MAX_INFO_VAL,
		     value, &flag);
	if (flag && ((intval = atoi(value)) > 0)) {

	    did_anything = 1;
	    ADIOI_Info_set(info, ADIOI_BG_NAGG_IN_PSET_HINT_NAME, value);
	    fd->hints->cb_nodes = intval;
	}
#endif
    }

    /* special CB aggregator assignment */
    if (did_anything) {
#ifdef BGQPLATFORM
	ADIOI_BG_gen_agg_ranklist(fd, fd->hints->cb_nodes);
#elif PEPLATFORM
	ADIOI_PE_gen_agg_ranklist(fd);
#endif
    }

    /* deferred_open won't be set by callers, but if the user doesn't
     * explicitly disable collecitve buffering (two-phase) and does hint that
     * io w/o independent io is going on, we'll set this internal hint as a
     * convenience */
    if ( ( (fd->hints->cb_read != ADIOI_HINT_DISABLE) \
			    && (fd->hints->cb_write != ADIOI_HINT_DISABLE)\
			    && fd->hints->no_indep_rw ) ) {
	    fd->hints->deferred_open = 1;
    } else {
	    /* setting romio_no_indep_rw enable and romio_cb_{read,write}
	     * disable at the same time doesn't make sense. honor
	     * romio_cb_{read,write} and force the no_indep_rw hint to
	     * 'disable' */
	    ADIOI_Info_set(info, "romio_no_indep_rw", "false");
	    fd->hints->no_indep_rw = 0;
	    fd->hints->deferred_open = 0;
    }

    /* BobC commented this out, but since hint processing runs on both bg and
     * bglockless, we need to keep DS writes enabled on gpfs and disabled on
     * PVFS */
    if (ADIO_Feature(fd, ADIO_DATA_SIEVING_WRITES) == 0) {
    /* disable data sieving for fs that do not
       support file locking */
       	ADIOI_Info_get(info, "ind_wr_buffer_size", MPI_MAX_INFO_VAL,
		     value, &flag);
	if (flag) {
	    /* get rid of this value if it is set */
	    ADIOI_Info_delete(info, "ind_wr_buffer_size");
	}
	/* note: leave ind_wr_buffer_size alone; used for other cases
	 * as well. -- Rob Ross, 04/22/2003
	 */
	ADIOI_Info_set(info, "romio_ds_write", "disable");
	fd->hints->ds_write = ADIOI_HINT_DISABLE;
    }

    ADIOI_Free(value);

    *error_code = MPI_SUCCESS;
}