File: mpir_cst_filesys.c

/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpioimpl.h"

#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif

#ifdef HAVE_DIRENT_H
#include <dirent.h>
#endif

#ifdef MPICH

static int comm_split_filesystem_exhaustive(MPI_Comm comm, int key,
                                            const char *dirname, MPI_Comm * newcomm)
{
    /* If you run this at scale against GPFS, be prepared to spend 30 minutes
     * creating 10,000 files -- and the numbers only get worse from there.
     *
     * - create random directory
     * - create files in that directory
     * - based on the visible files, construct a new group, then a new
     *   communicator
     * - there are no directory operation routines in MPI so we'll do it via
     *   POSIX.  (a standalone sketch of this scheme follows this function) */
    int rank, nprocs, ret;
    int *ranks;
    MPI_Group comm_group, newgroup;
    int j = 0, mpi_errno = MPI_SUCCESS;
    char *filename = NULL, *testdirname = NULL;
    DIR *dir;
    struct dirent *entry;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    /* rank zero constructs the candidate directory name (just the
     * name).  Everyone will create the directory though -- this will be
     * a headache for the file system at scale.  don't do this on a
     * large parallel file system! */

    testdirname = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    filename = MPL_malloc(PATH_MAX, MPL_MEM_IO);
    ranks = MPL_malloc(nprocs * sizeof(int), MPL_MEM_IO);

    if (rank == 0)
        MPL_create_pathname(testdirname, dirname, ".commonfstest.0", 1);

    MPI_Bcast(testdirname, PATH_MAX, MPI_BYTE, 0, comm);
    /* ignore EEXIST: quite likely another process will have made this
     * directory, but since the whole point is to figure out who we share this
     * directory with, brute force it is! */
    ret = mkdir(testdirname, S_IRWXU);
    if (ret == -1 && errno != EEXIST)
        goto fn_fail;
    MPL_snprintf(filename, PATH_MAX, "%s/%d", testdirname, rank);
    ret = open(filename, O_CREAT, S_IRUSR | S_IWUSR);
    if (ret == -1)
        goto fn_fail;
    close(ret);

    MPI_Barrier(comm);
    /* each process has created a file in an M-way shared directory (where M
     * is in the range [1, nprocs]).  Let's see who else can see this directory */
    if ((dir = opendir(testdirname)) == NULL)
        goto fn_fail;
    while ((entry = readdir(dir)) != NULL) {
        if (strcmp(entry->d_name, ".") == 0)
            continue;
        if (strcmp(entry->d_name, "..") == 0)
            continue;
        ranks[j++] = atoi(entry->d_name);
    }
    closedir(dir);

    /* readdir order is unspecified and may differ between processes, but
     * MPI_Comm_create requires an identically ordered group on every
     * caller, so sort the discovered ranks first */
    for (int i = 1; i < j; i++) {
        int tmp = ranks[i], k = i - 1;
        while (k >= 0 && ranks[k] > tmp) {
            ranks[k + 1] = ranks[k];
            k--;
        }
        ranks[k + 1] = tmp;
    }

    MPI_Comm_group(comm, &comm_group);
    MPI_Group_incl(comm_group, j, ranks, &newgroup);
    MPI_Comm_create(comm, newgroup, newcomm);
    MPI_Group_free(&newgroup);
    MPI_Group_free(&comm_group);

    /* ok to ignore errors during cleanup */
    unlink(filename);
    rmdir(testdirname);

  fn_exit:
    MPL_free(ranks);
    MPL_free(filename);
    MPL_free(testdirname);
    return mpi_errno;
  fn_fail:
    mpi_errno = MPI_ERR_OTHER;
    goto fn_exit;
}
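
/* For illustration, a minimal standalone program using the same discovery
 * scheme as comm_split_filesystem_exhaustive() above: every rank drops a
 * marker file named after its rank into a probe directory, then scans that
 * directory to learn how many ranks share it.  The probe directory path is
 * a made-up example, and the block is guarded by '#if 0' so it is never
 * compiled into the library. */
#if 0
#include <mpi.h>
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
    int rank, nprocs, fd, n = 0;
    char path[4096];
    const char *probe_dir = "/tmp/fs-probe";    /* hypothetical location */
    DIR *dir;
    struct dirent *entry;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* every rank makes the directory; EEXIST just means someone beat us */
    if (mkdir(probe_dir, S_IRWXU) == -1 && errno != EEXIST)
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* drop a marker file named after this rank */
    snprintf(path, sizeof(path), "%s/%d", probe_dir, rank);
    fd = open(path, O_CREAT, S_IRUSR | S_IWUSR);
    if (fd != -1)
        close(fd);

    MPI_Barrier(MPI_COMM_WORLD);

    /* every marker we can see names a rank sharing this directory */
    if ((dir = opendir(probe_dir)) != NULL) {
        while ((entry = readdir(dir)) != NULL) {
            if (entry->d_name[0] != '.')
                n++;
        }
        closedir(dir);
    }
    printf("rank %d of %d shares %s with %d rank(s)\n", rank, nprocs, probe_dir, n);

    unlink(path);
    MPI_Finalize();
    return 0;
}
#endif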

static int comm_split_filesystem_heuristic(MPI_Comm comm, int key,
                                           const char *dirname, MPI_Comm * newcomm)
{
    int i, mpi_errno = MPI_SUCCESS;
    int rank, nprocs;
    int id;
    int32_t *all_ids;
    char *filename = NULL;
    int challenge_rank, globally_visible = 0;
    MPI_Request check_req;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPIR_Get_node_id(comm, rank, &id);

    /* We could detect the common file systems by parsing 'df'-style
     * output, but that's fidgety, fragile, and error prone.  Instead,
     * determine who shares a file system through testing.
     *
     * As an optimization, we should try to avoid creating a lot of
     * files: we want something that could work at hundreds of thousands
     * of nodes, and creating a hundred thousand files in a directory is
     * a recipe for sadness.
     *
     * In CH3 and in wider practice "shared memory" is the same as "on
     * the same node", so let's start there.
     *
     * - Create file on one processor
     * - pick a processor outside the "on this node" group
     * - if that processor can see the file, then assume the file is
     *   visible to all groups.
     *
     * note that this scheme works really well for traditional linux clusters:
     * think nodes with a local scratch drive.  this scheme works less well for
     * a deeper hierarchy.  what if the directory in question was hosted by an
     * i/o forwarding agent?
     */

    /* learn a bit about what groups were created: as a scalable
     * optimization we want to check a file's presence from a group
     * other than the one that created it */
    all_ids = MPL_malloc(nprocs * sizeof(*all_ids), MPL_MEM_IO);

    /* gather from a fixed-width copy so the send type matches MPI_INT32_T
     * even on platforms where int is not 32 bits wide */
    int32_t id32 = (int32_t) id;
    mpi_errno = MPI_Gather(&id32, 1, MPI_INT32_T, all_ids, 1, MPI_INT32_T, 0, comm);

    if (rank == 0) {
        for (i = 0; i < nprocs; i++) {
            if (all_ids[i] != id)
                break;
        }
        if (i >= nprocs)
            /* everyone is in the same group; pick a process that's not rank 0
             * just in case the file system is really weird */
            challenge_rank = nprocs - 1;
        else
            challenge_rank = i;
    }
    mpi_errno = MPI_Bcast(&challenge_rank, 1, MPI_INT, 0, comm);

    /* now that we've informally lumped everyone into groups based on node
     * (like shared memory does) it's time to poke the file system and see
     * which group can see what files */

    /* here come a bunch of assumptions:
     * - file system layouts are homogeneous: if one system has /scratch,
     *   all have /scratch
     * - a globally visible parallel file system will have the same name
     *   everywhere: e.g. /gpfs/users/something
     * - a file created on one node will be deterministically visible on
     *   another.  NFS has problems with this
     * - if a process from one group creates a file, and a process from
     *   another group finds that file, then a process from all groups
     *   can find that file
     */

    /* is the file globally visible to all?  create on rank 0, test on a
     * different off-group rank.
     * Use a single short message to order the check after the create: the
     * handshake looks a little odd in case we are creating and checking on
     * the same rank.  (a standalone sketch of this handshake follows this
     * function) */

    filename = MPL_calloc(PATH_MAX, sizeof(char), MPL_MEM_IO);

    if (rank == 0)
        MPL_create_pathname(filename, dirname, ".commonfstest.0", 0);

    MPI_Bcast(filename, PATH_MAX, MPI_BYTE, 0, comm);

    if (rank == challenge_rank) {
        MPI_Irecv(NULL, 0, MPI_BYTE, 0, 0, comm, &check_req);
    }

    if (rank == 0) {
        MPI_File fh;
        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename,
                                  MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_WRONLY,
                                  MPI_INFO_NULL, &fh);
        if (mpi_errno == MPI_SUCCESS)
            MPI_File_close(&fh);
        /* the check for the file has to happen after the file is created.
         * only one process needs to know, though, not a full barrier.  send
         * even if the create failed, or challenge_rank would block forever
         * in MPI_Wait and the collectives below would deadlock; any create
         * error is still returned from this rank */
        MPI_Send(NULL, 0, MPI_BYTE, challenge_rank, 0, comm);
    }

    if (rank == challenge_rank) {
        MPI_File fh;

        MPI_Wait(&check_req, MPI_STATUS_IGNORE);

        /* too bad there's no ADIO equivalent of access: we'll have to
         * open/close the file instead */

        mpi_errno = MPI_File_open(MPI_COMM_SELF, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
        if (mpi_errno == MPI_SUCCESS) {
            globally_visible = 1;
            MPI_File_close(&fh);
        } else {
            /* do not report error up to caller.  we are merely testing the
             * presence of the file */
            mpi_errno = MPI_SUCCESS;
            globally_visible = 0;
        }
    }
    MPI_Bcast(&globally_visible, 1, MPI_INT, challenge_rank, comm);

    /*   with the above assumptions, we have two cases for a file
     *   created on one process:
     *   -- either a process not in the group can access it (globally
     *      accessible parallel file system)
     *   -- or a process not in the group cannot access it (node-local
     *      storage of some sort) */

    if (globally_visible) {
        MPI_Comm_dup(comm, newcomm);
    } else {
        MPI_Comm_split(comm, id, key, newcomm);
    }
    if (rank == 0)
        MPI_File_delete(filename, MPI_INFO_NULL);

  fn_exit:
    MPL_free(all_ids);
    MPL_free(filename);
    return mpi_errno;
}
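
/* For illustration, a standalone sketch of the create-then-probe handshake
 * used in comm_split_filesystem_heuristic() above: the creating rank makes
 * the file and then sends an empty message, while the probing rank pre-posts
 * the matching receive and waits on it before touching the file, so the
 * existence check cannot race with the create.  probe_visibility() is a
 * hypothetical helper, not part of this file's API, and the block is guarded
 * by '#if 0' so it is never compiled into the library. */
#if 0
#include <mpi.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

/* returns 1 on every rank if 'prober' could see the file 'creator' made */
static int probe_visibility(MPI_Comm comm, int creator, int prober, const char *path)
{
    int rank, visible = 0;
    MPI_Request req;

    MPI_Comm_rank(comm, &rank);
    /* pre-post the receive so the creator==prober case still works */
    if (rank == prober)
        MPI_Irecv(NULL, 0, MPI_BYTE, creator, 0, comm, &req);
    if (rank == creator) {
        int fd = open(path, O_CREAT | O_EXCL | O_WRONLY, S_IRUSR | S_IWUSR);
        if (fd != -1)
            close(fd);
        /* the empty send orders the probe strictly after the create */
        MPI_Send(NULL, 0, MPI_BYTE, prober, 0, comm);
    }
    if (rank == prober) {
        MPI_Wait(&req, MPI_STATUS_IGNORE);
        visible = (access(path, F_OK) == 0);
    }
    /* everyone learns the answer */
    MPI_Bcast(&visible, 1, MPI_INT, prober, comm);
    return visible;
}
#endif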

/* not to be called directly (note the MPIR_ prefix), but instead from an
 * MPI-level MPI_Comm_split_type implementation (e.g.
 * MPIR_Comm_split_type_impl). */

/* split communicator based on access to directory 'dirname'. */
int MPIR_Comm_split_filesystem(MPI_Comm comm, int key, const char *dirname, MPI_Comm * newcomm)
{
    int mpi_errno = MPI_SUCCESS;
    if (getenv("MPIX_SPLIT_DISABLE_HEURISTIC") != NULL) {
        mpi_errno = comm_split_filesystem_exhaustive(comm, key, dirname, newcomm);
    } else {
        mpi_errno = comm_split_filesystem_heuristic(comm, key, dirname, newcomm);
    }
    return mpi_errno;
}
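
/* For reference, a sketch of how user code typically reaches this routine in
 * MPICH-derived implementations: MPI_Comm_split_type() with the non-standard
 * MPIX_COMM_TYPE_NEIGHBORHOOD split type and an info hint naming the
 * directory to test.  The split type and the "nbhd_common_dirname" hint are
 * MPICH extensions and may differ or be absent in other implementations; the
 * block is guarded by '#if 0' so it is never compiled into the library. */
#if 0
#include <mpi.h>

static int split_by_filesystem(const char *dirname, MPI_Comm * newcomm)
{
    MPI_Info info;
    int mpi_errno;

    MPI_Info_create(&info);
    MPI_Info_set(info, "nbhd_common_dirname", dirname);
    /* ranks that can all see 'dirname' end up together in *newcomm */
    mpi_errno = MPI_Comm_split_type(MPI_COMM_WORLD, MPIX_COMM_TYPE_NEIGHBORHOOD,
                                    0, info, newcomm);
    MPI_Info_free(&info);
    return mpi_errno;
}
#endif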

#endif /* MPICH */