File: jnl_output_sp.c

package info (click to toggle)
fis-gtm 7.1-006-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 32,908 kB
  • sloc: ansic: 344,906; asm: 5,184; csh: 4,859; sh: 2,000; awk: 294; makefile: 73; sed: 13
file content (372 lines) | stat: -rw-r--r-- 16,574 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
/***************************************************************
 *								*
 * Copyright (c) 2001-2023 Fidelity National Information	*
 * Services, Inc. and/or its subsidiaries. All rights reserved.	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_unistd.h"	/* DB_FSYNC macro needs this */
#include "gtm_string.h"
#include "gtmio.h"	/* this has to come in before gdsfhead.h, for all "open" to be defined
				to "open64", including the open in header files */
#include "aswp.h"
#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gt_timer.h"
#include "jnl.h"
#include "lockconst.h"
#include "interlock.h"
#include "iosp.h"
#include "gdsbgtr.h"
#include "is_file_identical.h"
#include "dpgbldir.h"
#include "gtm_rel_quant.h"
#include "repl_sp.h"	/* for F_CLOSE used by the JNL_FD_CLOSE macro */
#include "memcoherency.h"
#include "gtm_dbjnl_dupfd_check.h"
#include "anticipatory_freeze.h"

/* Global variables defined elsewhere that this module references */
GBLREF	volatile int4		db_fsync_in_prog;	/* handed to the DB_FSYNC macro in jnl_sub_qio_start() */
GBLREF	volatile int4		jnl_qio_in_prog;	/* not referenced directly in the code below; presumably needed
							 * by one of the macros used here -- TODO confirm */
GBLREF	uint4			process_id;		/* compared against the owner pid of jb->io_in_prog_latch */
GBLREF	jnlpool_addrs_ptr_t	jnlpool;		/* not referenced directly in the code below; presumably needed
							 * by one of the macros used here -- TODO confirm */

/* Message codes made visible to this module. Not every one of them is referenced directly in the code below
 * (e.g. ERR_PREMATEOF); those are presumably required by macros invoked here -- TODO confirm.
 */
error_def(ERR_DBFSYNCERR);
error_def(ERR_ENOSPCQIODEFER);
error_def(ERR_JNLACCESS);
error_def(ERR_JNLCNTRL);
error_def(ERR_JNLRDERR);
error_def(ERR_JNLWRTDEFER);
error_def(ERR_JNLWRTNOWWRTR);
error_def(ERR_PREMATEOF);

/* Prototype of the worker routine defined right below; it is invoked by jnl_qio_start() further down */
uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write);

/* Write not-yet-written journal buffer contents to the journal file while holding jb->io_in_prog_latch.
 *
 * Parameters:
 *	jpc		- private journal control structure; supplies the region, the journal buffer (jpc->jnl_buff)
 *			  and the journal file descriptor (jpc->channel).
 *	aligned_write	- If TRUE, the jnl write is done only upto the previous filesystem-block-size aligned boundary
 *			  (jb->free rounded down), else the write is done upto jb->free.
 *
 * Return value:
 *	SS_NORMAL		- the write succeeded, or there was nothing to write, or the journal file was found to
 *				  have been switched (in which case jpc->fd_mismatch is set to TRUE and no IO is done).
 *	ERR_JNLWRTNOWWRTR	- this process already owns the io_in_prog latch.
 *	ERR_JNLWRTDEFER		- the latch could not be obtained, or IS_REPL_INST_FROZEN was true, or the write
 *				  returned ERR_ENOSPCQIODEFER.
 *	ERR_JNLCNTRL		- jb->dsk does not match jb->dskaddr modulo jb->size.
 *	ERR_JNLACCESS		- the write failed with any status other than ERR_ENOSPCQIODEFER.
 * A failed database fsync or a failed re-read of the journal file is reported through rts_error_csa
 * (ERR_DBFSYNCERR / ERR_JNLRDERR respectively) after the latch has been released.
 *
 * Side effects visible below: performs a pending database fsync (jb->need_db_fsync), advances jb->dskaddr/jb->dsk on a
 * successful write, maintains jb->errcnt/jb->enospc_errcnt on a failed write, and closes jpc->channel if the
 * file header says journaling is closed.
 */

uint4 jnl_sub_qio_start(jnl_private_control *jpc, boolean_t aligned_write)
{
	boolean_t		was_wrapped;
	int			tsz, close_res;
	jnl_buffer_ptr_t	jb;
	int4			free_ptr;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	sm_uc_ptr_t		base;
	unix_db_info		*udi;
	unsigned int		status;
	int			save_errno;
	uint4			aligned_dskaddr, dskaddr;
	uint4			jnl_wrt_start_mask;
	int4			aligned_dsk, dsk;
	int			aligned_tsz;
	sm_uc_ptr_t		aligned_base;
	uint4			jnl_fs_block_size, new_dsk, new_dskaddr;
	gd_region		*reg;
	intrpt_state_t		prev_intrpt_state;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(NULL != jpc);
	reg = jpc->region;
	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	jb = jpc->jnl_buff;
	if (jb->io_in_prog_latch.u.parts.latch_pid == process_id)	/* We already have the lock? */
		return ERR_JNLWRTNOWWRTR;			/* timer driven io in progress */
	/* Interrupts stay deferred from here until every exit path below re-enables them */
	DEFER_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
	if (!GET_SWAPLOCK(&jb->io_in_prog_latch))
	{	/* Could not get the io latch; let the caller retry later */
		ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
		return ERR_JNLWRTDEFER;
	}
	if (IS_REPL_INST_FROZEN(TREF(defer_instance_freeze)))
	{	/* Freeze check is true; give the latch back and defer the write without doing any IO */
		RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
		return ERR_JNLWRTDEFER;
	}
#	ifdef DEBUG
	/* When jnl_sub_qio_start() is called as part of WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP white-box test case,
	 * aligned_write should always be FALSE. But depending upon the filesystem block size, it is possible that
	 * the function could also be called with aligned_write being TRUE. This could lead to sending SIGTSTP
	 * twice. Hence ensure that SIGTSTP is sent only for the unaligned write.
	 */
	if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_JNL_OUTPUT_SP == gtm_white_box_test_case_number)
				&& !aligned_write)
		kill(process_id, SIGTSTP);
#	endif
	/* jb->dsk (offset in the buffer) is expected to equal jb->dskaddr (offset in the file) modulo the buffer size.
	 * The assert shows a mismatch is anticipated only under the WBTEST_JNL_FILE_LOST_DSKADDR white-box test.
	 */
	if (jb->dsk != (jb->dskaddr % jb->size))
	{
		assert(gtm_white_box_test_case_enabled && (WBTEST_JNL_FILE_LOST_DSKADDR == gtm_white_box_test_case_number));
		RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
		return ERR_JNLCNTRL;
	}
	if (!JNL_FILE_SWITCHED(jpc))
		jpc->fd_mismatch = FALSE;
	else
	{	/* journal file has been switched; release io_in_prog lock and return */
		jpc->fd_mismatch = TRUE;
		RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
		ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
		return SS_NORMAL;
	}
	/* Currently we overload io_in_prog_latch to perform the db fsync too. Anyone trying to do a
	 *   jnl_qio_start will first check if a db_fsync is needed and if so sync that before doing any jnl qio.
	 * Note that since an epoch record is written when need_db_fsync is set to TRUE, we are guaranteed that
	 *   (dskaddr < freeaddr) which is necessary for the jnl_wait --> jnl_write_attempt mechanism (triggered
	 *   by wcs_flu) to actually initiate a call to jnl_qio_start().
	 */
	if (jb->need_db_fsync)
	{
		DB_FSYNC(reg, udi, csa, db_fsync_in_prog, save_errno);
		GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_DBFSYNCERR, save_errno, EIO);
		if (0 != save_errno)
		{
			RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
			ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
			/* DBFSYNCERR can potentially cause syslog flooding. Remove the following line if it becomes an issue. */
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFSYNCERR, 2, DB_LEN_STR(reg), save_errno);
			assert(FALSE);	/* should not come here as the rts_error above should not return */
			return ERR_DBFSYNCERR;	/* ensure we do not fall through to the code below as we no longer have the lock */
		}
		jb->need_db_fsync = FALSE;
	}
	free_ptr = jb->free;
        /* The following barrier is to make sure that for the value of "free" that we extract (which may be
         * slightly stale but that is not a correctness issue) we make sure we don't write out a stale version of
         * the journal buffer contents. While it is possible that we see journal buffer contents that are more
         * uptodate than "free", this would only mean writing out a less than optimal number of bytes but again,
         * not a correctness issue. Secondary effect is that it also enforces a corresponding non-stale value of
         * freeaddr is read and this is relied upon by asserts below.
	 */
	SHM_READ_MEMORY_BARRIER;
	dsk = jb->dsk;
	dskaddr = jb->dskaddr;
	/* free behind dsk means the unwritten data runs past the end of the buffer and continues at its start */
	was_wrapped = (free_ptr < dsk);
	jnl_fs_block_size = jb->fs_block_size;
	/* "base" is where the unwritten data starts in the buffer. "aligned_base" and "aligned_dskaddr" are the
	 * buffer address and the file offset rounded down to a filesystem-block-size boundary.
	 */
	base = &jb->buff[dsk + jb->buff_off];
	aligned_base = (sm_uc_ptr_t)ROUND_DOWN2((uintszofptr_t)base, jnl_fs_block_size);
	assert(aligned_base >= &jb->buff[jb->buff_off]);
	aligned_dskaddr = ROUND_DOWN2(dskaddr, jnl_fs_block_size);
	if (jb->re_read_dskaddr)
	{	/* Need to re-read the filesystem-block-size-aligned partial block before jb->dskaddr */
		assert(jb->re_read_dskaddr == dskaddr);
		tsz = dskaddr - aligned_dskaddr;	/* size of the partial block preceding dskaddr */
		if (tsz)
		{
			/* Assert that both ends of the source buffer for the read falls within journal buffer limits */
			assert(aligned_base >= &jb->buff[jb->buff_off]);
			assert(aligned_base + tsz <= &jb->buff[jb->buff_off + jb->size]);
			LSEEKREAD(jpc->channel, aligned_dskaddr, aligned_base, tsz, jpc->status);
			if (SS_NORMAL != jpc->status)
			{
				RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
				ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
				jpc->status2 = SS_NORMAL;
				jnl_send_oper(jpc, ERR_JNLRDERR);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csa->hdr), jpc->status);
				assert(FALSE);	/* should not come here as the rts_error above should not return */
				return ERR_JNLRDERR; /* ensure we do not fall through to code below as we no longer have the lock */
			}
		}
		jb->re_read_dskaddr = 0;
	}
	if (aligned_write)
		free_ptr = ROUND_DOWN2(free_ptr, jnl_fs_block_size);	/* stop at the last full block boundary */
	assert(!(jb->size % jnl_fs_block_size));
	/* If free is behind dsk, this call writes only upto the end of the buffer; otherwise it writes upto free */
	tsz = (free_ptr < dsk ? jb->size : free_ptr) - dsk;
	/* Nothing to write if (a) an aligned write was requested, the data was not wrapped and rounding free down left
	 * it at or before dsk, or (b) there is no open journal file descriptor.
	 */
	if ((aligned_write && !was_wrapped && (free_ptr <= dsk)) || (NOJNL == jpc->channel))
		tsz = 0;
	assert(0 <= tsz);
	assert(dskaddr + tsz <= jb->freeaddr);
	status = SS_NORMAL;
	if (tsz)
	{	/* ensure that dsk and free are never equal and we have left space for filesystem-aligned offset BEFORE dsk */
		assert(SS_NORMAL == status);
		DEBUG_ONLY(jnl_wrt_start_mask = ~(jb->fs_block_size - 1);)
		assert((free_ptr > dsk) || (free_ptr < (dsk & jnl_wrt_start_mask)) || (dsk != (dsk & jnl_wrt_start_mask)));
		jb->wrtsize = tsz;
		jb->qiocnt++;
		assert(NOJNL != jpc->channel);
		/* If sync_io is turned on, we would have turned on the O_DIRECT flag on some platforms. That will
		 * require us to do aligned writes. Both the source buffer and the size of the write need to be aligned
		 * for this to work on some platforms. The alignment needs to be on a filesystem-block-size granularity.
		 * If sync_io is not turned on, doing aligned writes saves us from the OS doing a read of the block
		 * under the covers in case we write only a part of the filesystem block.
		 * Therefore we do aligned writes no matter what. This means we could be writing some garbage padding
		 * data out after the last valid journal record just to fit in the alignment requirements. But that is
		 * considered okay because as part of writing the EOF record out (for a clean termination), jnl_write
		 * would have 0-padded the journal buffer for us. So a cleanly shutdown journal file will have 0-padding
		 * following the EOF record but an actively used journal file might have garbage padding following the
		 * last valid record. This is considered okay as journal recovery has logic to scan past the garbage and
		 * locate the last valid record in case of a crash before writing the EOF.
		 */
		DEBUG_ONLY(aligned_dsk = ROUND_DOWN2(dsk, jnl_fs_block_size));
		/* The write starts at the aligned boundary before dskaddr and its length is rounded up to a block multiple */
		aligned_tsz = ROUND_UP2((tsz + (dskaddr - aligned_dskaddr)), jnl_fs_block_size);
		/* Assert that aligned_dsk never backs up to a point BEFORE where the free pointer is */
		assert((aligned_dsk > free_ptr) || (dsk <= free_ptr));
		/* Assert that aligned_dskaddr never backs up to a point inside journal file header territory.
		 * This is because those fields are always updated inside crit and therefore we should
		 * never touch those while we hold only the jnl qio lock.
		 */
		assert(JNL_HDR_LEN <= aligned_dskaddr);
		/* Assert that both ends of the source buffer for the write falls within journal buffer limits */
		assert(aligned_base >= &jb->buff[jb->buff_off]);
		assert(aligned_base + aligned_tsz <= &jb->buff[jb->buff_off + jb->size]);
		JNL_LSEEKWRITE(csa, csa->hdr->jnl_file_name, jpc->channel,
			(off_t)aligned_dskaddr, aligned_base, (size_t)aligned_tsz, jpc->status);
		status = jpc->status;
		if (SS_NORMAL == status)
		{	/* update jnl_buff pointers to reflect the successful write to the journal file */
			assert(dsk <= jb->size);
			assert(jb->io_in_prog_latch.u.parts.latch_pid == process_id);
			/* Only tsz (not aligned_tsz) bytes are accounted as written; the alignment padding is not */
			new_dskaddr = dskaddr + tsz;
			new_dsk = dsk + tsz;
			if (new_dsk >= jb->size)
			{	/* reached the end of the buffer; the next write starts from its beginning */
				assert(new_dsk == jb->size);
				new_dsk = 0;
			}
			assert(new_dsk == new_dskaddr % jb->size);
			assert(jb->freeaddr >= new_dskaddr);
			/* Note: "wcs_flu" does a "performCASLatchCheck" of jb->io_in_prog_latch and relies
			 * on "jb->dskaddr" being updated BEFORE "jb->dsk".
			 */
			jb->dskaddr = new_dskaddr;
			jb->dsk = new_dsk;
			cnl = csa->nl;
			/* Statistics count the bytes actually handed to the write (aligned_tsz), and one write */
			INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_bytes, aligned_tsz);
			INCR_GVSTATS_COUNTER(csa, cnl, n_jfile_writes, 1);
		} else
		{	/* The write failed; jb->dskaddr and jb->dsk are left untouched */
			assert((ENOSPC == status) || (ERR_ENOSPCQIODEFER == status));
			jb->errcnt++;
			/* enospc_errcnt counts consecutive ENOSPC failures; any other failure status resets it */
			if (ENOSPC == status)
				jb->enospc_errcnt++;
			else
				jb->enospc_errcnt = 0;

			if (ERR_ENOSPCQIODEFER != status)
			{
				jnl_send_oper(jpc, ERR_JNLACCESS);
				jpc->status = status;	/* set jpc->status back to original error as jnl_send_oper resets
							 * jpc->status to SS_NORMAL. We need it in callers of this function
							 * (e.g. jnl_write_attempt). */
			}
#			ifdef GTM_FD_TRACE
			if ((EBADF == status) || (ESPIPE == status))
			{	/* likely case of D9I11-002714. check if fd is valid */
				gtm_dbjnl_dupfd_check();
				/* If fd of this journal points to some other database or journal file opened by this process
				 * the above call would have reset jpc->channel. If it did not get reset, then check
				 * if the fd in itself is valid and points back to the journal file. If not reset it to NOJNL.
				 */
				if (NOJNL != jpc->channel)
					gtm_check_fd_is_valid(reg, FALSE, jpc->channel);
				/* If jpc->channel still did not get reset to NOJNL, it means the file descriptor is valid but
				 * not sure why we are getting EBADF/ESPIPE errors. No further recovery attempted at this point.
				 */
			}
#			endif
			/* Translate the raw write status into the code returned to the caller */
			if (ERR_ENOSPCQIODEFER == status)
				status = ERR_JNLWRTDEFER;
			else
				status = ERR_JNLACCESS;
		}
	}
	RELEASE_SWAPLOCK(&jb->io_in_prog_latch);
	/* If the file header says journaling is closed but we still have the journal file open, close it now */
	if ((jnl_closed == csa->hdr->jnl_state) && (NOJNL != jpc->channel))
	{
		JNL_FD_CLOSE(jpc->channel, close_res);	/* sets jpc->channel to NOJNL */
		jpc->pini_addr = 0;
	}
	ENABLE_INTERRUPTS(INTRPT_IN_JNL_QIO, prev_intrpt_state);
	return status;
}

/* Driver around jnl_sub_qio_start() that splits a journal write into an efficient part and a leftover part.
 *
 * Stage 1: if the unwritten data reaches a "jnl_fs_block_size" boundary, write upto that boundary
 *	    (jnl_sub_qio_start with aligned_write = TRUE). A non-normal status from this stage is returned as is.
 * Stage 2: unless this process holds crit, yield the processor (at most csa->hdr->yield_lmt times) to give other
 *	    processes a chance to write the tail. Only if jb->dskaddr has still not reached the target after that is
 *	    the unaligned tail written here (jnl_sub_qio_start with aligned_write = FALSE).
 *
 * Returns SS_NORMAL if nothing needed to be written, if the journal file got switched while yielding, or if
 * someone else already wrote upto the target; otherwise the status of the last jnl_sub_qio_start() call made.
 */
uint4 jnl_qio_start(jnl_private_control *jpc)
{
	unix_db_info		*udi;
	sgmnt_addrs		*csa;
	jnl_buffer_ptr_t	jb;
	int			index1;
	uint4			start_dskaddr, goal_freeaddr, prev_freeaddr;
	uint4			fs_blksz, sleep_mask;
	unsigned int		yields, status;

	assert(NULL != jpc);
	udi = FILE_INFO(jpc->region);
	csa = &udi->s_addrs;
	jb = jpc->jnl_buff;
	/* Note down how far the journal file has been written BEFORE attempting any phase2 commit cleanup */
	start_dskaddr = jb->dskaddr;
	/* Cleaning up completed jnl phase2 commits (if any are pending and the phase2 commit latch is free)
	 * brings jb->freeaddr more uptodate, so do that before deciding how much there is to write.
	 */
	index1 = jb->phase2_commit_index1;
	ASSERT_JNL_PHASE2_COMMIT_INDEX_IS_VALID(index1, JNL_PHASE2_COMMIT_ARRAY_SIZE);
	if ((index1 != jb->phase2_commit_index2) && jb->phase2_commit_array[index1].write_complete
				&& (LOCK_AVAILABLE == jb->phase2_commit_latch.u.parts.latch_pid))
		jnl_phase2_cleanup(csa, jb);
	/* The (possibly updated) jb->freeaddr is the point upto which this invocation wants the file written */
	goal_freeaddr = jb->freeaddr;
	if (start_dskaddr >= goal_freeaddr)
		return SS_NORMAL;	/* everything is already on disk */
	/* ---- Stage 1 : write upto the filesystem block boundary ----
	 * ROUND_DOWN2 relies on the filesystem block size being a power of 2.
	 */
	fs_blksz = jb->fs_block_size;
	if (ROUND_DOWN2(start_dskaddr, fs_blksz) != ROUND_DOWN2(goal_freeaddr, fs_blksz))
	{	/* the pending data crosses/touches an alignment boundary */
		status = jnl_sub_qio_start(jpc, TRUE);
		if (SS_NORMAL != status)
			return status;
	}
	/* ---- Stage 2 : give others a chance to do the tail, then do it ourselves if still needed ----
	 * This is reached also when the data did not cross/touch an alignment boundary at all.
	 */
	sleep_mask = csa->hdr->mutex_spin_parms.mutex_spin_sleep_mask;
	yields = 0;
	while (!csa->now_crit && (yields < csa->hdr->yield_lmt))
	{	/* Not holding crit : wait until somebody did our job or nobody else is active on the jnl file */
		prev_freeaddr = jb->freeaddr;
		GTM_REL_QUANT(sleep_mask);
		/* Giving up the processor above lets other processes change the journal file (switch), the write
		 * position in the file and the write address in the buffer, and no lock is held in this loop.
		 * The barrier gets us a current view of those asynchronously changed fields for the checks that follow.
		 * It is a performance aid rather than a correctness requirement : a stale view means we either see
		 * the updates a little later or end up writing what has been written already.
		 */
		SHM_READ_MEMORY_BARRIER;
		if (JNL_FILE_SWITCHED(jpc))
			return SS_NORMAL;
		/* No assert that jb->freeaddr never moves backwards here : a race with a journal file switch
		 * could legitimately make such an assert fail.
		 */
		if ((prev_freeaddr == jb->freeaddr) || (goal_freeaddr <= jb->dskaddr))
			break;	/* nobody added anything while we yielded, or our target is on disk already */
		yields++;
	}
	if (goal_freeaddr <= jb->dskaddr)
		return SS_NORMAL;	/* someone else wrote upto (or beyond) our target */
	return jnl_sub_qio_start(jpc, FALSE);
}