1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1997-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
static const char revid[] = "$Id: os_rw.c,v 1.1.1.1 2003/11/20 22:13:39 toshok Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <string.h>
#include <unistd.h>
#endif
#include "db_int.h"
#ifdef HAVE_FILESYSTEM_NOTZERO
static int __os_zerofill __P((DB_ENV *, DB_FH *));
#endif
static int __os_physwrite __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
/*
 * __os_io --
 *	Do an I/O.
 *
 *	Transfer db_iop->bytes bytes between db_iop->buf and the file behind
 *	db_iop->fhp, at the position described by db_iop->pagesize and
 *	db_iop->pgno.  "op" selects the direction (DB_IO_READ or DB_IO_WRITE).
 *
 *	Two paths exist:
 *	  fast: when __os_is_winnt() is true, a single ReadFile/WriteFile call
 *		carries the file offset in an OVERLAPPED structure, so no
 *		seek is issued and the handle mutex is not taken.
 *	  slow: lock db_iop->mutexp, __os_seek, then __os_read/__os_write.
 *		Used when a j_read/j_write replacement is installed, when
 *		__os_fs_notzero() is true for writes, when the Win32 call
 *		fails, or when the fast path transferred a short count.
 *
 *	Returns 0 on success.  On the fast path *niop is set to the number of
 *	bytes transferred; on the slow path it is set by __os_read/__os_write.
 *	A non-zero return is whatever __os_seek, __os_read or __os_write
 *	returned.
 *
 * PUBLIC: int __os_io __P((DB_ENV *, DB_IO *, int, size_t *));
 */
int
__os_io(dbenv, db_iop, op, niop)
	DB_ENV *dbenv;
	DB_IO *db_iop;
	int op;
	size_t *niop;
{
	int ret;

	if (__os_is_winnt()) {
		ULONG64 off = (ULONG64)db_iop->pagesize * db_iop->pgno;
		OVERLAPPED over;
		DWORD nbytes;

		/*
		 * Every member of an OVERLAPPED that we do not use must be
		 * zero before the structure is handed to ReadFile/WriteFile;
		 * clear it first rather than leaving Internal/InternalHigh as
		 * stack garbage.
		 */
		memset(&over, 0, sizeof(over));
		over.Offset = (DWORD)(off & 0xffffffff);
		over.OffsetHigh = (DWORD)(off >> 32);
		over.hEvent = 0; /* we don't want asynchronous notifications */

		/* Never compare an indeterminate count below. */
		nbytes = 0;

		switch (op) {
		case DB_IO_READ:
			if (DB_GLOBAL(j_read) != NULL)
				goto slow;
			if (!ReadFile(db_iop->fhp->handle,
			    db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over))
				goto slow;
			break;
		case DB_IO_WRITE:
			if (DB_GLOBAL(j_write) != NULL)
				goto slow;
#ifdef HAVE_FILESYSTEM_NOTZERO
			if (__os_fs_notzero())
				goto slow;
#endif
			if (!WriteFile(db_iop->fhp->handle,
			    db_iop->buf, (DWORD)db_iop->bytes, &nbytes, &over))
				goto slow;
			break;
		default:
			/* Unknown operation: nothing to do on the fast path. */
			goto slow;
		}

		/* A complete transfer is done; a short one is redone below. */
		if (nbytes == db_iop->bytes) {
			*niop = (size_t)nbytes;
			return (0);
		}
	}

slow:	MUTEX_THREAD_LOCK(dbenv, db_iop->mutexp);

	/* The seek and the transfer must not be separated: hold the mutex. */
	if ((ret = __os_seek(dbenv, db_iop->fhp,
	    db_iop->pagesize, db_iop->pgno, 0, 0, DB_OS_SEEK_SET)) != 0)
		goto err;

	switch (op) {
	case DB_IO_READ:
		ret = __os_read(dbenv,
		    db_iop->fhp, db_iop->buf, db_iop->bytes, niop);
		break;
	case DB_IO_WRITE:
		ret = __os_write(dbenv,
		    db_iop->fhp, db_iop->buf, db_iop->bytes, niop);
		break;
	}

err:	MUTEX_THREAD_UNLOCK(dbenv, db_iop->mutexp);

	return (ret);
}
/*
* __os_read --
* Read from a file handle.
*
* PUBLIC: int __os_read __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
*/
int
__os_read(dbenv, fhp, addr, len, nrp)
DB_ENV *dbenv;
DB_FH *fhp;
void *addr;
size_t len;
size_t *nrp;
{
size_t offset;
DWORD nr;
int ret;
BOOL success;
u_int8_t *taddr;
for (taddr = addr,
offset = 0; offset < len; taddr += nr, offset += nr) {
retry: if (DB_GLOBAL(j_read) != NULL) {
nr = (DWORD)DB_GLOBAL(j_read)(fhp->fd,
taddr, len - offset);
success = (nr >= 0);
} else {
success = ReadFile(fhp->handle,
taddr, (DWORD)(len - offset), &nr, NULL);
if (!success)
__os_set_errno(__os_win32_errno());
}
if (!success) {
if ((ret = __os_get_errno()) == EINTR)
goto retry;
__db_err(dbenv, "read: 0x%lx, %lu: %s",
P_TO_ULONG(taddr),
(u_long)len - offset, strerror(ret));
return (ret);
}
if (nr == 0)
break;
}
*nrp = taddr - (u_int8_t *)addr;
return (0);
}
/*
 * __os_write --
 *	Write to a file handle.
 *
 *	Front end for __os_physwrite.  When built with
 *	HAVE_FILESYSTEM_NOTZERO and __os_fs_notzero() reports true,
 *	__os_zerofill is run on the handle first and any error it returns is
 *	passed straight back to the caller without attempting the write.
 *
 *	Returns whatever __os_physwrite returns otherwise.
 *
 * PUBLIC: int __os_write __P((DB_ENV *, DB_FH *, void *, size_t, size_t *));
 */
int
__os_write(dbenv, fhp, addr, len, nwp)
	DB_ENV *dbenv;
	DB_FH *fhp;
	void *addr;
	size_t len;
	size_t *nwp;
{
#ifdef HAVE_FILESYSTEM_NOTZERO
	int ret;

	/* Zero-fill as necessary. */
	if (__os_fs_notzero()) {
		ret = __os_zerofill(dbenv, fhp);
		if (ret != 0)
			return (ret);
	}
#endif

	return (__os_physwrite(dbenv, fhp, addr, len, nwp));
}
/*
* __os_physwrite --
* Physical write to a file handle.
*/
static int
__os_physwrite(dbenv, fhp, addr, len, nwp)
DB_ENV *dbenv;
DB_FH *fhp;
void *addr;
size_t len;
size_t *nwp;
{
size_t offset;
DWORD nw;
int ret;
BOOL success;
u_int8_t *taddr;
for (taddr = addr,
offset = 0; offset < len; taddr += nw, offset += nw) {
retry: if (DB_GLOBAL(j_write) != NULL) {
nw = (DWORD)DB_GLOBAL(j_write)(fhp->fd,
taddr, len - offset);
success = (nw >= 0);
} else {
success = WriteFile(fhp->handle,
taddr, (DWORD)(len - offset), &nw, NULL);
if (!success)
__os_set_errno(__os_win32_errno());
}
if (!success) {
if ((ret = __os_get_errno()) == EINTR)
goto retry;
__db_err(dbenv, "write: 0x%x, %lu: %s", taddr,
(u_long)len-offset, strerror(ret));
return (ret);
}
}
*nwp = len;
return (0);
}
#ifdef HAVE_FILESYSTEM_NOTZERO
/*
* __os_zerofill --
* Zero out bytes in the file.
*
* Pages allocated by writing pages past end-of-file are not zeroed,
* on some systems. Recovery could theoretically be fooled by a page
* showing up that contained garbage. In order to avoid this, we
* have to write the pages out to disk, and flush them. The reason
* for the flush is because if we don't sync, the allocation of another
* page subsequent to this one might reach the disk first, and if we
* crashed at the right moment, leave us with this page as the one
* allocated by writing a page past it in the file.
*/
static int
__os_zerofill(dbenv, fhp)
DB_ENV *dbenv;
DB_FH *fhp;
{
unsigned __int64 stat_offset, write_offset;
size_t blen, nw;
u_int32_t bytes, mbytes;
int group_sync, need_free, ret;
u_int8_t buf[8 * 1024], *bp;
/* Calculate the byte offset of the next write. */
write_offset = (unsigned __int64)fhp->pgno * fhp->pgsize + fhp->offset;
/* Stat the file. */
if ((ret = __os_ioinfo(dbenv, NULL, fhp, &mbytes, &bytes, NULL)) != 0)
return (ret);
stat_offset = (unsigned __int64)mbytes * MEGABYTE + bytes;
/* Check if the file is large enough. */
if (stat_offset >= write_offset)
return (0);
/* Get a large buffer if we're writing lots of data. */
#undef ZF_LARGE_WRITE
#define ZF_LARGE_WRITE (64 * 1024)
if (write_offset - stat_offset > ZF_LARGE_WRITE) {
if ((ret = __os_calloc(dbenv, 1, ZF_LARGE_WRITE, &bp)) != 0)
return (ret);
blen = ZF_LARGE_WRITE;
need_free = 1;
} else {
bp = buf;
blen = sizeof(buf);
need_free = 0;
memset(buf, 0, sizeof(buf));
}
/* Seek to the current end of the file. */
if ((ret = __os_seek(
dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET)) != 0)
goto err;
/*
* Hash is the only access method that allocates groups of pages. Hash
* uses the existence of the last page in a group to signify the entire
* group is OK; so, write all the pages but the last one in the group,
* flush them to disk, then write the last one to disk and flush it.
*/
for (group_sync = 0; stat_offset < write_offset; group_sync = 1) {
if (write_offset - stat_offset <= blen) {
blen = (size_t)(write_offset - stat_offset);
if (group_sync && (ret = __os_fsync(dbenv, fhp)) != 0)
goto err;
}
if ((ret = __os_physwrite(dbenv, fhp, bp, blen, &nw)) != 0)
goto err;
stat_offset += blen;
}
if ((ret = __os_fsync(dbenv, fhp)) != 0)
goto err;
/* Seek back to where we started. */
mbytes = (u_int32_t)(write_offset / MEGABYTE);
bytes = (u_int32_t)(write_offset % MEGABYTE);
ret = __os_seek(dbenv, fhp, MEGABYTE, mbytes, bytes, 0, DB_OS_SEEK_SET);
err: if (need_free)
__os_free(dbenv, bp);
return (ret);
}
#endif
|