File: ad_pvfs2_open.c

package info (click to toggle)
openmpi 5.0.8-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 201,692 kB
  • sloc: ansic: 613,078; makefile: 42,353; sh: 11,194; javascript: 9,244; f90: 7,052; java: 6,404; perl: 5,179; python: 1,859; lex: 740; fortran: 61; cpp: 20; tcl: 12
file content (232 lines) | stat: -rw-r--r-- 8,694 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"

/* Outcome of the open performed by a single process, bundled into one
 * struct so that it can be broadcast to the other processes in one go. */
typedef struct open_status_s {
    int error;                  /* 0 when the open worked, otherwise an error value */
    PVFS_object_ref object_ref; /* reference to the looked-up or created file */
} open_status;

    /* steps for getting a handle:  (it gets a little convoluted, but at least
     * it's deterministic)
     * . lookup the file.
     * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
     * . if lookup fails, the file might not exist.
     *          in that case, create the file if we were passed MPI_MODE_CREATE
     * . if the create fails, that means someone else created the file between
     *    our call to lookup and our call to create (like if N processors all
     *    open the same file with MPI_COMM_SELF).  Then we can just look up the
     *    file (which now exists).
     *
     * the good news is that only one processor does this and broadcasts the
     * handle to everyone else in the communicator
     */
/*
 * fake_an_open: look up pvfs_name on file system fs_id, creating it first
 * when the lookup reports ENOENT and access_mode contains ADIO_CREATE.
 *
 *   fs_id        - PVFS2 file system the path lives on
 *   pvfs_name    - path handed to PVFS_sys_lookup / PVFS_sys_getparent
 *   access_mode  - ADIO access flags; only ADIO_CREATE and ADIO_EXCL are
 *                  inspected here
 *   nr_datafiles - if > 0, requested dfile_count for a newly created file
 *   strip_size   - if > 0, "strip_size" parameter of the "simple_stripe"
 *                  distribution used for a newly created file
 *   pvfs2_fs     - supplies the credentials used for every PVFS call
 *   o_status     - out: `error` receives the return code of the last PVFS
 *                  call made (or -PVFS_EEXIST when ADIO_EXCL was given and
 *                  the file already exists); `object_ref` receives the
 *                  reference of the looked-up or created file
 *
 * Nothing is returned; callers inspect o_status->error.
 */
static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
                         int nr_datafiles, PVFS_size strip_size,
                         ADIOI_PVFS2_fs * pvfs2_fs, open_status * o_status)
{
    int ret;
    PVFS_sysresp_lookup resp_lookup;
    PVFS_sysresp_getparent resp_getparent;
    PVFS_sysresp_create resp_create;
    PVFS_sys_attr attribs;
    PVFS_sys_dist *dist = NULL;

    ADIOI_PVFS2_makeattribs(&attribs);
    if (nr_datafiles > 0) {
        attribs.dfile_count = nr_datafiles;
        attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
    }

    memset(&resp_lookup, 0, sizeof(resp_lookup));
    memset(&resp_getparent, 0, sizeof(resp_getparent));
    memset(&resp_create, 0, sizeof(resp_create));

    ret = PVFS_sys_lookup(fs_id, pvfs_name,
                          &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
    if (ret == (-PVFS_ENOENT)) {
        /* the file does not exist: we may only go on if asked to create it */
        if (!(access_mode & ADIO_CREATE)) {
            FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
            o_status->error = ret;
            return;
        }

        ret = PVFS_sys_getparent(fs_id, pvfs_name, &(pvfs2_fs->credentials), &resp_getparent);
        if (ret < 0) {
            FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
            o_status->error = ret;
            return;
        }

        /* Set the distribution strip size if specified */
        if (0 < strip_size) {
            /* Note that the distribution is hardcoded here */
            dist = PVFS_sys_dist_lookup("simple_stripe");
            ret = PVFS_sys_dist_setparam(dist, "strip_size", &strip_size);
            if (ret < 0) {
                /* Only warn: creation is still attempted below, and the
                 * status reported to the caller is that of the create. */
                FPRINTF(stderr, "pvfs_sys_dist_setparam returns with %d\n", ret);
            }
        }

        /* Perform file creation */
#ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
        ret = PVFS_sys_create(resp_getparent.basename,
                              resp_getparent.parent_ref, attribs,
                              &(pvfs2_fs->credentials), dist, &resp_create);
#else
        ret = PVFS_sys_create(resp_getparent.basename,
                              resp_getparent.parent_ref, attribs,
                              &(pvfs2_fs->credentials), dist, NULL, &resp_create);
#endif

        /* if many creates are happening in this directory, the earlier
         * sys_lookup may have returned ENOENT, but the sys_create could
         * return EEXIST.  That means the file has been created anyway, so
         * less work for us and we can just open it up and return the
         * handle */
        if (ret == (-PVFS_EEXIST)) {
            ret = PVFS_sys_lookup(fs_id, pvfs_name,
                                  &(pvfs2_fs->credentials), &resp_lookup,
                                  PVFS2_LOOKUP_LINK_FOLLOW);
            o_status->error = ret;
            if (ret >= 0) {
                o_status->object_ref = resp_lookup.ref;
            }
            return;
        }
        o_status->object_ref = resp_create.ref;
    } else if (ret >= 0 && (access_mode & ADIO_EXCL)) {
        /* lookup should not succeed if opened with EXCL.  Only a lookup that
         * actually succeeded counts as "file exists"; any other lookup
         * failure falls through so that its own error code is reported
         * instead of being turned into EEXIST. */
        o_status->error = -PVFS_EEXIST;
        return;
    } else {
        o_status->object_ref = resp_lookup.ref;
    }
    o_status->error = ret;
    return;

}


/* ADIOI_PVFS2_Open:
 *  one process opens (or creates) the file, then broadcasts the result to the
 *  remaining processors.
 *
 *  ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
 * that, MPI_MODE_EXCL) was set.  Because PVFS2 handles file lookup and
 * creation more scalably than other file systems, ADIO_Open now skips any
 * special handling when CREATE is set.
 *
 *  fd         - file being opened; reads fd->comm, fd->filename,
 *               fd->access_mode and fd->hints (ranklist[0], striping_factor,
 *               striping_unit).  On success fd->fs_ptr points at a newly
 *               allocated ADIOI_PVFS2_fs holding the credentials and the
 *               object reference; if the open itself failed, fd->fs_ptr is
 *               set to NULL.
 *  error_code - out: MPI_SUCCESS, or an error code describing the failure.
 */
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
{
    int rank, ret;
    PVFS_fs_id cur_fs;
    static char myname[] = "ADIOI_PVFS2_OPEN";
    char pvfs_path[PVFS_NAME_MAX] = { 0 };

    ADIOI_PVFS2_fs *pvfs2_fs;

    /* since one process is doing the open, that means one process is also
     * doing the error checking.  define a struct for both the object reference
     * and the error code to broadcast to all the processors */

    open_status o_status = { 0, {0, 0} };
    MPI_Datatype open_status_type;
    MPI_Datatype types[2] = { MPI_INT, MPI_BYTE };
    int lens[2] = { 1, sizeof(PVFS_object_ref) };
    MPI_Aint offsets[2];

    pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));

    /* --BEGIN ERROR HANDLING-- */
    if (pvfs2_fs == NULL) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_UNKNOWN, "Error allocating memory", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    MPI_Comm_rank(fd->comm, &rank);

    ADIOI_PVFS2_Init(error_code);
    if (*error_code != MPI_SUCCESS) {
        /* ADIOI_PVFS2_INIT handles creating error codes on its own; all
         * that is left to do here is to release the structure allocated
         * above, which nothing else refers to yet */
        ADIOI_Free(pvfs2_fs);
        return;
    }

    /* currently everyone gets their own credentials */
    ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));

    /* one process resolves name and will later bcast to others */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
        /* given the filename, figure out which pvfs filesystem it is on */
        ret = PVFS_util_resolve(fd->filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
        if (ret < 0) {
            PVFS_perror("PVFS_util_resolve", ret);
            /* TODO: pick a good error for this */
            o_status.error = -1;
        } else {
            fake_an_open(cur_fs, pvfs_path,
                         fd->access_mode, fd->hints->striping_factor,
                         fd->hints->striping_unit, pvfs2_fs, &o_status);
        }

        /* store credentials and object reference in fd */
        pvfs2_fs->object_ref = o_status.object_ref;
        fd->fs_ptr = pvfs2_fs;
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif

    /* broadcast status and (possibly valid) object reference.  The datatype
     * is built from the absolute addresses of the two members, which is why
     * the broadcast buffer below is MPI_BOTTOM */
    MPI_Get_address(&o_status.error, &offsets[0]);
    MPI_Get_address(&o_status.object_ref, &offsets[1]);

    MPI_Type_create_struct(2, lens, offsets, types, &open_status_type);
    MPI_Type_commit(&open_status_type);

    /* Assertion: if we hit this Bcast, then all processes collectively
     *            called this open.
     *
     * That's because deferred open never happens with PVFS2.
     */
    MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&open_status_type);

    /* --BEGIN ERROR HANDLING-- */
    if (o_status.error != 0) {
        ADIOI_Free(pvfs2_fs);
        fd->fs_ptr = NULL;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(o_status.error),
                                           "Unknown error", 0);
        /* TODO: FIX STRING */
        return;
    }
    /* --END ERROR HANDLING-- */

    /* every process (including the one that did the open) records the
     * broadcast object reference */
    pvfs2_fs->object_ref = o_status.object_ref;
    fd->fs_ptr = pvfs2_fs;

    *error_code = MPI_SUCCESS;
    return;
}