/***************************************************************
* *
* Copyright (c) 2001-2023 Fidelity National Information *
* Services, Inc. and/or its subsidiaries. All rights reserved. *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_unistd.h" /* DB_FSYNC macro needs this */
#include "gtm_string.h"
#include "gtmio.h" /* this has to come in before gdsfhead.h, for all "open" to be defined
to "open64", including the open in header files */
#include "aswp.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gt_timer.h"
#include "jnl.h"
#include "lockconst.h"
#include "interlock.h"
#include "iosp.h"
#include "gdsbgtr.h"
#include "is_file_identical.h"
#include "dpgbldir.h"
#include "gtm_rel_quant.h"
#include "repl_sp.h" /* for F_CLOSE used by the JNL_FD_CLOSE macro */
#include "memcoherency.h"
#include "gtm_dbjnl_dupfd_check.h"
#include "anticipatory_freeze.h"
GBLREF volatile int4 db_fsync_in_prog;
GBLREF volatile int4 jnl_qio_in_prog;
GBLREF uint4 process_id;
GBLREF jnlpool_addrs_ptr_t jnlpool;
error_def(ERR_DBFSYNCERR);
error_def(ERR_ENOSPCQIODEFER);
error_def(ERR_JNLACCESS);
error_def(ERR_JNLCNTRL);
error_def(ERR_JNLRDERR);
error_def(ERR_JNLWRTDEFER);
error_def(ERR_JNLWRTNOWWRTR);
error_def(ERR_PREMATEOF);
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write);
/* If the second argument ("aligned_write") is TRUE, the jnl write is done only up to the previous aligned boundary;
 * otherwise, the write is done up to the freeaddr.
 */
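/* Illustration with hypothetical numbers: if jb->free = 0x2345 and jnl_fs_block_size = 0x1000, a call with
 * aligned_write = TRUE writes only up to in-buffer offset ROUND_DOWN2(0x2345, 0x1000) = 0x2000, whereas a
 * call with aligned_write = FALSE writes through 0x2345.
 */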
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write)
{
boolean_t was_wrapped;
int tsz, close_res;
jnl_buffer_ptr_t jb;
int4 free_ptr;
sgmnt_addrs *csa;
node_local_ptr_t cnl;
sm_uc_ptr_t base;
unix_db_info *udi;
unsigned int status;
int save_errno;
uint4 aligned_dskaddr, dskaddr;
uint4 jnl_wrt_start_mask;
int4 aligned_dsk, dsk;
int aligned_tsz;
sm_uc_ptr_t aligned_base;
uint4 jnl_fs_block_size, new_dsk, new_dskaddr;
gd_region *reg;
intrpt_state_t prev_intrpt_state;
DCL_THREADGBL_ACCESS;
SETUP_THREADGBL_ACCESS;
assert(NULL != jpc);
reg = jpc->region;
udi = FILE_INFO(reg);
csa = &udi->s_addrs;
jb = jpc->jnl_buff;
if (jb->io_in_prog_latch.u.parts.latch_pid == process_id) /* We already have the lock? */
return ERR_JNLWRTNOWWRTR; /* timer driven io in progress */
DEFER_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
if (!GET_SWAPLOCK(&jb->io_in_prog_latch))
{
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
return ERR_JNLWRTDEFER;
}
if (IS_REPL_INST_FROZEN(TREF(defer_instance_freeze)))
{
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
return ERR_JNLWRTDEFER;
}
# ifdef DEBUG
/* When jnl_sub_qio_start() is called as part of WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP white-box test case,
* aligned_write should always be FALSE. But depending upon the filesystem block size, it is possible that
* the function could also be called with aligned_write being TRUE. This could lead to sending SIGTSTP
* twice. Hence ensure that SIGTSTP is sent only for the unaligned write.
*/
if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP == gtm_white_box_test_case_number)
&& !aligned_write)
kill(process_id, SIGTSTP);
# endif
if (jb->dsk != (jb->dskaddr % jb->size))
{
assert(gtm_white_box_test_case_enabled && (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number));
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
return ERR_JNLCNTRL;
}
if (!JNL_FILE_SWITCHED(jpc))
jpc->fd_mismatch = FALSE;
else
{ /* journal file has been switched; release io_in_prog lock and return */
jpc->fd_mismatch = TRUE;
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
return SS_NORMAL;
}
/* Currently we overload io_in_prog_latch to perform the db fsync too. Anyone trying to do a
* jnl_qio_start will first check if a db_fsync is needed and if so sync that before doing any jnl qio.
* Note that since an epoch record is written when need_db_fsync is set to TRUE, we are guaranteed that
* (dskaddr < freeaddr) which is necessary for the jnl_wait --> jnl_write_attempt mechanism (triggered
* by wcs_flu) to actually initiate a call to jnl_qio_start().
*/
if (jb->need_db_fsync)
{
DB_FSYNC(reg, udi, csa, db_fsync_in_prog, save_errno);
GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_DBFSYNCERR, save_errno, EIO);
if (0 != save_errno)
{
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
			/* DBFSYNCERR can potentially cause syslog flooding. Remove the following line if that becomes an issue. */
send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
assert(FALSE); /* should not come here as the rts_error above should not return */
return ERR_DBFSYNCERR; /* ensure we do not fall through to the code below as we no longer have the lock */
}
jb->need_db_fsync = FALSE;
}
free_ptr = jb->free;
	/* The following barrier ensures that, for the value of "free" we extract (which may be
	 * slightly stale, but that is not a correctness issue), we do not write out a stale version of
	 * the journal buffer contents. While it is possible that we see journal buffer contents that are more
	 * up to date than "free", this would only mean writing out a less than optimal number of bytes; again,
	 * not a correctness issue. A secondary effect is that it also enforces that a correspondingly non-stale
	 * value of freeaddr is read, which the asserts below rely upon.
	 */
SHM_READ_MEMORY_BARRIER;
dsk = jb->dsk;
dskaddr = jb->dskaddr;
was_wrapped = (free_ptr < dsk);
jnl_fs_block_size = jb->fs_block_size;
base = &jb->buff[dsk + jb->buff_off];
aligned_base = (sm_uc_ptr_t)ROUND_DOWN2((uintszofptr_t)base, jnl_fs_block_size);
assert(aligned_base >= &jb->buff[jb->buff_off]);
aligned_dskaddr = ROUND_DOWN2(dskaddr, jnl_fs_block_size);
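	/* Illustration with hypothetical numbers: if dskaddr = 0x6234 and jnl_fs_block_size = 0x1000 (4KiB), then
	 * aligned_dskaddr = 0x6000 and "aligned_base" backs up 0x234 bytes from "base"; the re-read below (if needed)
	 * refreshes exactly those 0x234 bytes so that a subsequent filesystem-block-aligned write is possible.
	 */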
if (jb->re_read_dskaddr)
{ /* Need to re-read the filesystem-block-size-aligned partial block before jb->dskaddr */
assert(jb->re_read_dskaddr == dskaddr);
tsz = dskaddr - aligned_dskaddr;
if (tsz)
{
			/* Assert that both ends of the source buffer for the read fall within journal buffer limits */
assert(aligned_base >= &jb->buff[jb->buff_off]);
assert(aligned_base + tsz <= &jb->buff[jb->buff_off + jb->size]);
LSEEKREAD(jpc->channel, aligned_dskaddr, aligned_base, tsz, jpc->status);
if (SS_NORMAL != jpc->status)
{
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
jpc->status2 = SS_NORMAL;
jnl_send_oper(jpc, ERR_JNLRDERR);
rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csa->hdr), jpc->status);
assert(FALSE); /* should not come here as the rts_error above should not return */
return ERR_JNLRDERR; /* ensure we do not fall through to code below as we no longer have the lock */
}
}
jb->re_read_dskaddr = 0;
}
if (aligned_write)
free_ptr = ROUND_DOWN2(free_ptr, jnl_fs_block_size);
assert(!(jb->size % jnl_fs_block_size));
tsz = (free_ptr < dsk ? jb->size : free_ptr) - dsk;
if ((aligned_write && !was_wrapped && (free_ptr <= dsk)) || (NOJNL == jpc->channel))
tsz = 0;
assert(0 <= tsz);
	assert(dskaddr + tsz <= jb->freeaddr);
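	/* Illustration with hypothetical numbers: with jb->size = 0x100000, dsk = 0xFF000 and a wrapped
	 * free_ptr = 0x800, tsz = 0x100000 - 0xFF000 = 0x1000: only the bytes from dsk to the end of the buffer
	 * are written in this call; the 0x800 wrapped bytes at the start of the buffer are left for a later call.
	 */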
status = SS_NORMAL;
if (tsz)
	{ /* Ensure that dsk and free are never equal and that we have left space for a filesystem-aligned offset BEFORE dsk */
assert(SS_NORMAL == status);
DEBUG_ONLY(jnl_wrt_start_mask = ~(jb->fs_block_size - 1);)
assert((free_ptr > dsk) || (free_ptr < (dsk & jnl_wrt_start_mask)) || (dsk != (dsk & jnl_wrt_start_mask)));
jb->wrtsize = tsz;
jb->qiocnt++;
assert(NOJNL != jpc->channel);
/* If sync_io is turned on, we would have turned on the O_DIRECT flag on some platforms. That will
* require us to do aligned writes. Both the source buffer and the size of the write need to be aligned
* for this to work on some platforms. The alignment needs to be on a filesystem-block-size granularity.
* If sync_io is not turned on, doing aligned writes saves us from the OS doing a read of the block
* under the covers in case we write only a part of the filesystem block.
* Therefore we do aligned writes no matter what. This means we could be writing some garbage padding
		 * data out after the last valid journal record just to satisfy the alignment requirements. But that is
* considered okay because as part of writing the EOF record out (for a clean termination), jnl_write
* would have 0-padded the journal buffer for us. So a cleanly shutdown journal file will have 0-padding
* following the EOF record but an actively used journal file might have garbage padding following the
* last valid record. This is considered okay as journal recovery has logic to scan past the garbage and
* locate the last valid record in case of a crash before writing the EOF.
*/
DEBUG_ONLY(aligned_dsk = ROUND_DOWN2(dsk, jnl_fs_block_size));
aligned_tsz = ROUND_UP2((tsz + (dskaddr - aligned_dskaddr)), jnl_fs_block_size);
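		/* Illustration with hypothetical numbers: with dskaddr = 0x6234, aligned_dskaddr = 0x6000, tsz = 0x500
		 * and jnl_fs_block_size = 0x1000, aligned_tsz = ROUND_UP2(0x500 + 0x234, 0x1000) = 0x1000, so the write
		 * covers file offsets 0x6000 through 0x6FFF: the 0x234 re-read bytes before dskaddr, the 0x500 new bytes,
		 * and up to 0x8CC bytes of trailing padding past the last valid record.
		 */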
/* Assert that aligned_dsk never backs up to a point BEFORE where the free pointer is */
assert((aligned_dsk > free_ptr) || (dsk <= free_ptr));
/* Assert that aligned_dskaddr never backs up to a point inside journal file header territory.
* This is because those fields are always updated inside crit and therefore we should
* never touch those while we hold only the jnl qio lock.
*/
assert(JNL_HDR_LEN <= aligned_dskaddr);
		/* Assert that both ends of the source buffer for the write fall within journal buffer limits */
assert(aligned_base >= &jb->buff[jb->buff_off]);
assert(aligned_base + aligned_tsz <= &jb->buff[jb->buff_off + jb->size]);
JNL_LSEEKWRITE(csa, csa->hdr->jnl_file_name, jpc->channel,
(off_t)aligned_dskaddr, aligned_base, (size_t)aligned_tsz, jpc->status);
status = jpc->status;
if (SS_NORMAL == status)
{ /* update jnl_buff pointers to reflect the successful write to the journal file */
assert(dsk <= jb->size);
assert(jb->io_in_prog_latch.u.parts.latch_pid == process_id);
new_dskaddr = dskaddr + tsz;
new_dsk = dsk + tsz;
if (new_dsk >= jb->size)
{
assert(new_dsk == jb->size);
new_dsk = 0;
}
assert(new_dsk == new_dskaddr % jb->size);
assert(jb->freeaddr >= new_dskaddr);
/* Note: "wcs_flu" does a "performCASLatchCheck" of jb->io_in_prog_latch and relies
* on "jb->dskaddr" being updated BEFORE "jb->dsk".
*/
jb->dskaddr = new_dskaddr;
jb->dsk = new_dsk;
cnl = csa->nl;
INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_bytes, aligned_tsz);
INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_writes, 1);
} else
{
assert((ENOSPC == status) || (ERR_ENOSPCQIODEFER == status));
jb->errcnt++;
if (ENOSPC == status)
jb->enospc_errcnt++;
else
jb->enospc_errcnt = 0;
if (ERR_ENOSPCQIODEFER != status)
{
jnl_send_oper(jpc, ERR_JNLACCESS);
jpc->status = status; /* set jpc->status back to original error as jnl_send_oper resets
* jpc->status to SS_NORMAL. We need it in callers of this function
* (e.g. jnl_write_attempt). */
}
# ifdef GTM_FD_TRACE
if ((EBADF == status) || (ESPIPE == status))
{ /* likely case of D9I11-002714. check if fd is valid */
gtm_dbjnl_dupfd_check();
/* If fd of this journal points to some other database or journal file opened by this process
* the above call would have reset jpc->channel. If it did not get reset, then check
* if the fd in itself is valid and points back to the journal file. If not reset it to NOJNL.
*/
if (NOJNL != jpc->channel)
gtm_check_fd_is_valid(reg, FALSE, jpc->channel);
				/* If jpc->channel still did not get reset to NOJNL, it means the file descriptor is valid
				 * but it is not clear why we are getting EBADF/ESPIPE errors. No further recovery is
				 * attempted at this point.
				 */
}
# endif
if (ERR_ENOSPCQIODEFER == status)
status = ERR_JNLWRTDEFER;
else
status = ERR_JNLACCESS;
}
}
RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
if ((jnl_closed == csa->hdr->jnl_state) && (NOJNL != jpc->channel))
{
JNL_FD_CLOSE(jpc->channel, close_res); /* sets jpc->channel to NOJNL */
jpc->pini_addr = 0;
}
ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
return status;
}
/* This is a wrapper for jnl_sub_qio_start that tries to divide the writes into optimal chunks.
 * It calls jnl_sub_qio_start() with appropriate arguments in two stages: the first writes up to an
 * optimal "jnl_fs_block_size" boundary and the second writes the suboptimal tail end. The latter
 * call is made only if no other process has finished the jnl write up to the required point
 * while this process yields.
 */
uint4 jnl_qio_start(jnl_private_control *jpc)
{
unsigned int yield_cnt, status;
uint4 lcl_dskaddr, old_freeaddr, spin_sleep_mask, target_freeaddr;
jnl_buffer_ptr_t jb;
sgmnt_addrs *csa;
unix_db_info *udi;
uint4 jnl_fs_block_size;
int index1;
assert(NULL != jpc);
udi = FILE_INFO(jpc->region);
csa = &udi->s_addrs;
jb = jpc->jnl_buff;
	/* This block of code (up to the yield loop) processes the buffer up to a "jnl_fs_block_size" alignment boundary
	 * and the next block of code (after the yield loop) processes the tail end of the data (if necessary).
	 */
lcl_dskaddr = jb->dskaddr;
	/* Check if there are any pending jnl phase2 commits that can be cleaned up. That will bring jb->freeaddr more up to date. */
index1 = jb->phase2_commit_index1;
ASSERT_JNL_PHASE2_COMMIT_INDEX_IS_VALID(index1, JNL_PHASE2_COMMIT_ARRAY_SIZE);
if ((index1 != jb->phase2_commit_index2) && jb->phase2_commit_array[index1].write_complete
&& (LOCK_AVAILABLE == jb->phase2_commit_latch.u.parts.latch_pid))
jnl_phase2_cleanup(csa, jb);
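	/* A minimal sketch of the assumed ring-buffer convention: phase2_commit_index1 is the oldest in-flight
	 * entry and phase2_commit_index2 is where the next entry will be added, so index1 == index2 means nothing
	 * is pending; cleanup is attempted only when the oldest entry's write is complete and no one holds the latch.
	 */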
/* Now that any possible phase2 commit cleanup is done, go ahead with qio (if needed) using updated jb->freeaddr */
target_freeaddr = jb->freeaddr;
if (lcl_dskaddr >= target_freeaddr)
return SS_NORMAL;
/* ROUND_DOWN2 macro is used under the assumption that "jnl_fs_block_size" would be a power of 2 */
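	/* For a power-of-2 alignment, ROUND_DOWN2(x, align) is presumably a bitmask operation, x & ~(align - 1),
	 * e.g. ROUND_DOWN2(0x6234, 0x1000) = 0x6000; a non-power-of-2 block size would make such a mask invalid.
	 */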
jnl_fs_block_size = jb->fs_block_size;
if (ROUND_DOWN2(lcl_dskaddr, jnl_fs_block_size) != ROUND_DOWN2(target_freeaddr, jnl_fs_block_size))
{ /* data crosses/touches an alignment boundary */
if (SS_NORMAL != (status = jnl_sub_qio_start(jpc, TRUE)))
return status;
} /* else, data does not cross/touch an alignment boundary, yield and see if someone else
* does the dirty job more efficiently
*/
spin_sleep_mask = csa->hdr->mutex_spin_parms.mutex_spin_sleep_mask;
for (yield_cnt = 0; !csa->now_crit && (yield_cnt < csa->hdr->yield_lmt); yield_cnt++)
{ /* If not in crit, wait until someone has finished your job or no one else is active on the jnl file */
old_freeaddr = jb->freeaddr;
GTM_REL_QUANT(spin_sleep_mask);
		/* The purpose of this memory barrier is to get a current view of asynchronously changed fields,
		 * like whether the jnl file was switched, the write position in the journal file and the
		 * write address in the journal buffer, for all the remaining statements in this loop, because
		 * the GTM_REL_QUANT invocation above allows any and all of them to change and we do not hold any
		 * locks while in this loop. This is not a correctness issue: we would either eventually
		 * see the updates or we would be writing what has already been written. It is a performance
		 * issue, keeping us more current with state changes done by other processes on other processors.
		 */
SHM_READ_MEMORY_BARRIER;
if (JNL_FILE_SWITCHED(jpc))
return SS_NORMAL;
/* assert(old_freeaddr <= jb->freeaddr) ** Potential race condition with jnl file switch could
* make this assert fail so it is removed
*/
if (old_freeaddr == jb->freeaddr || target_freeaddr <= jb->dskaddr)
break;
}
status = SS_NORMAL;
if (target_freeaddr > jb->dskaddr)
status = jnl_sub_qio_start(jpc, FALSE);
return status;
}