1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
|
/*
* Copyright (C) by Argonne National Laboratory
* See COPYRIGHT in top-level directory
*/
#include "ad_pvfs2.h"
#include "ad_pvfs2_common.h"
/* open_status bundles the outcome of an open attempt (status code plus object
 * reference) so both can be bcast around as a single unit */
typedef struct open_status_s {
    int error;                  /* 0 when the open succeeded, nonzero otherwise */
    PVFS_object_ref object_ref; /* reference obtained from lookup or create */
} open_status;
/* fake_an_open: look up (and, if requested, create) the file named by
 * pvfs_name on file system fs_id, reporting the outcome through o_status.
 *
 * steps for getting a handle: (it gets a little convoluted, but at least
 * it's deterministic)
 * . lookup the file.
 * . if lookup succeeds, but we were passed MPI_MODE_EXCL, that's an error
 * . if lookup fails with ENOENT, the file does not exist.
 *   in that case, create the file if we were passed MPI_MODE_CREATE
 * . if the create fails with EEXIST, that means someone else created the
 *   file between our call to lookup and our call to create (like if N
 *   processors all open the same file with MPI_COMM_SELF).  Then we can just
 *   look up the file (which now exists).
 * . if lookup fails for any other reason, that failure is reported as-is.
 *
 * the good news is that only one processor does this and broadcasts the
 * handle to everyone else in the communicator
 *
 * Parameters:
 *   fs_id        - file system the name was resolved to
 *   pvfs_name    - path of the file within that file system
 *   access_mode  - ADIO access mode flags (ADIO_CREATE and ADIO_EXCL are
 *                  examined here)
 *   nr_datafiles - requested datafile count; only applied when > 0
 *   strip_size   - requested strip size; only applied when > 0
 *   pvfs2_fs     - supplies the credentials used for every PVFS call
 *   o_status     - output: error receives 0 or the failing call's return
 *                  value; object_ref is filled in from the lookup or the
 *                  create response
 */
static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode,
                         int nr_datafiles, PVFS_size strip_size,
                         ADIOI_PVFS2_fs * pvfs2_fs, open_status * o_status)
{
    int ret;
    PVFS_sysresp_lookup resp_lookup;
    PVFS_sysresp_getparent resp_getparent;
    PVFS_sysresp_create resp_create;
    PVFS_sys_attr attribs;
    PVFS_sys_dist *dist;

    ADIOI_PVFS2_makeattribs(&attribs);
    if (nr_datafiles > 0) {
        attribs.dfile_count = nr_datafiles;
        attribs.mask |= PVFS_ATTR_SYS_DFILE_COUNT;
    }

    /* a NULL distribution is handed to create unless a strip size is given */
    dist = NULL;

    memset(&resp_lookup, 0, sizeof(resp_lookup));
    memset(&resp_getparent, 0, sizeof(resp_getparent));
    memset(&resp_create, 0, sizeof(resp_create));

    ret = PVFS_sys_lookup(fs_id, pvfs_name,
                          &(pvfs2_fs->credentials), &resp_lookup, PVFS2_LOOKUP_LINK_FOLLOW);
    if (ret == (-PVFS_ENOENT)) {
        if (access_mode & ADIO_CREATE) {
            ret = PVFS_sys_getparent(fs_id, pvfs_name, &(pvfs2_fs->credentials), &resp_getparent);
            if (ret < 0) {
                FPRINTF(stderr, "pvfs_sys_getparent returns with %d\n", ret);
                o_status->error = ret;
                return;
            }

            /* Set the distribution strip size if specified */
            if (0 < strip_size) {
                /* Note that the distribution is hardcoded here */
                /* NOTE(review): dist is not released anywhere in this
                 * function -- confirm whether the PVFS2 API expects the
                 * caller to free it after the create */
                dist = PVFS_sys_dist_lookup("simple_stripe");
                ret = PVFS_sys_dist_setparam(dist, "strip_size", &strip_size);
                if (ret < 0) {
                    FPRINTF(stderr, "pvfs_sys_dist_setparam returns with %d\n", ret);
                    /* NOTE(review): execution continues to the create below,
                     * whose result overwrites o_status->error, so this
                     * failure is effectively only logged */
                    o_status->error = ret;
                }
            }

            /* Perform file creation */
#ifdef HAVE_PVFS2_CREATE_WITHOUT_LAYOUT
            ret = PVFS_sys_create(resp_getparent.basename,
                                  resp_getparent.parent_ref, attribs,
                                  &(pvfs2_fs->credentials), dist, &resp_create);
#else
            ret = PVFS_sys_create(resp_getparent.basename,
                                  resp_getparent.parent_ref, attribs,
                                  &(pvfs2_fs->credentials), dist, NULL, &resp_create);
#endif

            /* if many creates are happening in this directory, the earlier
             * sys_lookup may have returned ENOENT, but the sys_create could
             * return EEXIST.  That means the file has been created anyway, so
             * less work for us and we can just open it up and return the
             * handle */
            if (ret == (-PVFS_EEXIST)) {
                ret = PVFS_sys_lookup(fs_id, pvfs_name,
                                      &(pvfs2_fs->credentials), &resp_lookup,
                                      PVFS2_LOOKUP_LINK_FOLLOW);
                if (ret < 0) {
                    o_status->error = ret;
                    return;
                }
                o_status->error = ret;
                o_status->object_ref = resp_lookup.ref;
                return;
            }
            o_status->object_ref = resp_create.ref;
        } else {
            FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");
            o_status->error = ret;
            return;
        }
    } else if (ret < 0) {
        /* the lookup failed for a reason other than "no such file": report
         * that error itself instead of letting the EXCL check below disguise
         * it as EEXIST */
        o_status->error = ret;
        return;
    } else if (access_mode & ADIO_EXCL) {
        /* lookup should not succeed if opened with EXCL */
        o_status->error = -PVFS_EEXIST;
        return;
    } else {
        o_status->object_ref = resp_lookup.ref;
    }

    o_status->error = ret;
    return;
}
/* ADIOI_PVFS2_Open:
 *  one process opens (or creates) the file, then broadcasts the result to the
 *  remaining processors.
 *
 * ADIO_Open used to perform an optimization when MPI_MODE_CREATE (and before
 * that, MPI_MODE_EXCL) was set.  Because PVFS2 handles file lookup and
 * creation more scalably than other file systems, ADIO_Open now skips any
 * special handling when CREATE is set.
 *
 * Parameters:
 *   fd         - file being opened; on success fd->fs_ptr points at a newly
 *                allocated ADIOI_PVFS2_fs holding the credentials and object
 *                reference, on a failed open it is reset to NULL
 *   error_code - output: MPI_SUCCESS, or an error code created here (or by
 *                ADIOI_PVFS2_Init)
 */
void ADIOI_PVFS2_Open(ADIO_File fd, int *error_code)
{
    int rank, ret;
    PVFS_fs_id cur_fs;
    static char myname[] = "ADIOI_PVFS2_OPEN";
    char pvfs_path[PVFS_NAME_MAX] = { 0 };

    ADIOI_PVFS2_fs *pvfs2_fs;

    /* since one process is doing the open, that means one process is also
     * doing the error checking.  define a struct for both the object reference
     * and the error code to broadcast to all the processors */

    open_status o_status = { 0, {0, 0} };
    MPI_Datatype open_status_type;
    MPI_Datatype types[2] = { MPI_INT, MPI_BYTE };
    int lens[2] = { 1, sizeof(PVFS_object_ref) };
    MPI_Aint offsets[2];

    pvfs2_fs = (ADIOI_PVFS2_fs *) ADIOI_Malloc(sizeof(ADIOI_PVFS2_fs));

    /* --BEGIN ERROR HANDLING-- */
    if (pvfs2_fs == NULL) {
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           MPI_ERR_UNKNOWN, "Error allocating memory", 0);
        return;
    }
    /* --END ERROR HANDLING-- */

    MPI_Comm_rank(fd->comm, &rank);

    ADIOI_PVFS2_Init(error_code);
    if (*error_code != MPI_SUCCESS) {
        /* ADIOI_PVFS2_INIT handles creating error codes on its own */
        /* pvfs2_fs has not been handed to fd yet, so release it here or it
         * would be leaked by this early return */
        ADIOI_Free(pvfs2_fs);
        return;
    }

    /* currently everyone gets their own credentials */
    ADIOI_PVFS2_makecredentials(&(pvfs2_fs->credentials));

    /* one process resolves name and will later bcast to others */
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_a, 0, NULL);
#endif
    if (rank == fd->hints->ranklist[0] && fd->fs_ptr == NULL) {
        /* given the filename, figure out which pvfs filesystem it is on */
        ret = PVFS_util_resolve(fd->filename, &cur_fs, pvfs_path, PVFS_NAME_MAX);
        if (ret < 0) {
            PVFS_perror("PVFS_util_resolve", ret);
            /* TODO: pick a good error for this */
            o_status.error = -1;
        } else {
            fake_an_open(cur_fs, pvfs_path,
                         fd->access_mode, fd->hints->striping_factor,
                         fd->hints->striping_unit, pvfs2_fs, &o_status);
        }

        /* store credentials and object reference in fd */
        pvfs2_fs->object_ref = o_status.object_ref;
        fd->fs_ptr = pvfs2_fs;
    }
#ifdef ADIOI_MPE_LOGGING
    MPE_Log_event(ADIOI_MPE_open_b, 0, NULL);
#endif

    /* broadcast status and (possibly valid) object reference.  The datatype
     * is built from the absolute addresses of the two o_status members, which
     * is why the Bcast buffer below is MPI_BOTTOM */
    MPI_Get_address(&o_status.error, &offsets[0]);
    MPI_Get_address(&o_status.object_ref, &offsets[1]);

    MPI_Type_create_struct(2, lens, offsets, types, &open_status_type);
    MPI_Type_commit(&open_status_type);

    /* Assertion: if we hit this Bcast, then all processes collectively
     *            called this open.
     *
     * That's because deferred open never happens with PVFS2.
     */
    MPI_Bcast(MPI_BOTTOM, 1, open_status_type, fd->hints->ranklist[0], fd->comm);
    MPI_Type_free(&open_status_type);

    /* --BEGIN ERROR HANDLING-- */
    if (o_status.error != 0) {
        ADIOI_Free(pvfs2_fs);
        fd->fs_ptr = NULL;
        *error_code = MPIO_Err_create_code(MPI_SUCCESS,
                                           MPIR_ERR_RECOVERABLE,
                                           myname, __LINE__,
                                           ADIOI_PVFS2_error_convert(o_status.error),
                                           "Unknown error", 0);
        /* TODO: FIX STRING */
        return;
    }
    /* --END ERROR HANDLING-- */

    /* every process (not just the one that did the open) records the
     * broadcast object reference */
    pvfs2_fs->object_ref = o_status.object_ref;
    fd->fs_ptr = pvfs2_fs;

    *error_code = MPI_SUCCESS;
    return;
}
|