File: wcs_clean_dbsync.c

package info (click to toggle)
fis-gtm 6.2-000-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 30,784 kB
  • ctags: 42,554
  • sloc: ansic: 358,483; asm: 4,847; csh: 4,574; sh: 2,261; awk: 200; makefile: 86; sed: 13
file content (248 lines) | stat: -rw-r--r-- 12,986 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/****************************************************************
 *								*
 *	Copyright 2001, 2013 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include "gtm_fcntl.h"	/* needed for silly aix's expansion of open to open64 */
#include "gtm_unistd.h"

#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "gdscc.h"
#include "jnl.h"
#include "buddy_list.h"		/* for tp.h */
#include "hashtab_int4.h"	/* needed for tp.h */
#include "tp.h"			/* for tp_region definition */
#include "gt_timer.h"		/* for TID definition */
#include "timers.h"		/* for TIM_DEFER_DBSYNC #define */
#include "gdsbgtr.h"		/* for the BG_TRACE_PRO macros */
#include "gtmio.h"		/* for the GET_LSEEK_FLAG macro */
#include "wcs_clean_dbsync.h"
#include "wcs_flu.h"
#include "lockconst.h"

/* GTM_MALLOC_NO_RENT_ONLY(X) expands to X only when GTM_MALLOC_RENT is NOT defined.  It is used
 * below to compile in the "0 == gtmMallocDepth" reentrancy check only in non-rent malloc builds
 * (presumably gtmMallocDepth is not a meaningful recursion indicator under GTM_MALLOC_RENT --
 * TODO confirm against the gtm_malloc implementation).
 */
#ifdef GTM_MALLOC_RENT
#	define	GTM_MALLOC_NO_RENT_ONLY(X)
#else
#	define	GTM_MALLOC_NO_RENT_ONLY(X)	X
#endif

/* Externs referenced by wcs_clean_dbsync.  The volatile counters/flags (crit_count,
 * in_mutex_deadlock_check, db_fsync_in_prog, jnl_qio_in_prog, fast_lock_count, gtmMallocDepth)
 * are reentrancy guards consulted by the deferral condition in the function body before any
 * flushing is attempted from timer-handler context.
 */
NOPIO_ONLY(GBLREF boolean_t	*lseekIoInProgress_flags;)	/* needed for the LSEEK* macros in gtmio.h */
GBLREF	gd_region		*gv_cur_region;
GBLREF	sgmnt_addrs		*cs_addrs;
GBLREF	sgmnt_data_ptr_t	cs_data;
GBLREF	volatile int4		crit_count;
GBLREF	volatile boolean_t	in_mutex_deadlock_check;
GBLREF	volatile int4		db_fsync_in_prog, jnl_qio_in_prog;
GBLREF	volatile int4 		fast_lock_count;
GBLREF	volatile int4		gtmMallocDepth;		/* Recursion indicator */
GBLREF	boolean_t	 	mupip_jnl_recover;
#ifdef DEBUG
GBLREF	unsigned int		t_tries;
GBLREF	volatile boolean_t	timer_in_handler;
#endif

/* Sync the filehdr (and epoch in the journal file if before imaging). The goal is to sync the database,
 * but if we find us in a situation where we need to block on someone else, then we defer this to the next round.
 *
 * This is the pop handler for the "dbsync" timer started when the database went idle (see the
 * START_DBSYNC_TIMER use at the bottom, and the references to wcs_wtstart in the comments below).
 * It runs in timer (SIGALRM) handler context -- see the assert on timer_in_handler -- so it must
 * never block indefinitely; crit is only ever attempted via grab_crit_immediate, and any condition
 * that could block or recurse causes the sync to be deferred by restarting the timer.
 *
 * Arguments (standard gt_timer handler signature):
 *	tid    - id of the timer that popped (not used in the body)
 *	hd_len - length of the timer handler data (not used in the body)
 *	csaptr - pointer to the sgmnt_addrs of the region whose dbsync timer popped
 */
void	wcs_clean_dbsync(TID tid, int4 hd_len, sgmnt_addrs **csaptr)
{
	boolean_t		dbsync_defer_timer;	/* TRUE => restart the dbsync timer instead of flushing now */
	gd_region               *reg, *save_region;
	jnl_private_control	*jpc;
	node_local_ptr_t	cnl;
	sgmnt_addrs		*csa, *check_csaddrs, *save_csaddrs;	/* check_csaddrs = csa of interrupted mainline region */
	sgmnt_data_ptr_t	csd, save_csdata;
	NOPIO_ONLY(boolean_t	lseekIoInProgress_flag;)
	DEBUG_ONLY(boolean_t	save_ok_to_call_wcs_recover;)
	boolean_t		is_mm;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = *csaptr;
	assert(timer_in_handler);
	assert(csa->dbsync_timer);	/* to ensure no duplicate dbsync timers */
	CANCEL_DBSYNC_TIMER(csa);	/* reset csa->dbsync_timer now that the dbsync timer has popped */
	assert(!csa->dbsync_timer);
	reg = csa->region;
	/* Don't know how this can happen, but if region is closed, just return in PRO. */
	if (!reg->open)
	{
		assert(FALSE);
		return;
	}
	is_mm = (dba_mm == reg->dyn.addr->acc_meth);
	save_region = gv_cur_region; /* Save for later restore. See notes about restore */
	save_csaddrs = cs_addrs;
	save_csdata = cs_data;
	/* Save to see if we are in crit anywhere */
	check_csaddrs = ((NULL == save_region || FALSE == save_region->open) ?  NULL : (&FILE_INFO(save_region)->s_addrs));
	/* Note the non-usage of TP_CHANGE_REG_IF_NEEDED macros since this routine can be timer driven. */
	TP_CHANGE_REG(reg);
	csd = csa->hdr;
	cnl = csa->nl;
	jpc = csa->jnl;
	BG_TRACE_PRO_ANY(csa, n_dbsync_timers);
	assert(csa == cs_addrs);
	assert(!JNL_ALLOWED(csd) || NULL != jpc);
	/* Note that even if the active queue was emptied when this routine was called, due to
	 * concurrent update activity, cnl->wcs_active_lvl can be non-zero when we reach here. We
	 * defer syncing in this case to the next time the active queue becomes empty ( or when we
	 * reach the next scheduled epoch_time -- in case of before-imaging) whichever is earlier.
	 *
	 * Note that if we are already in wcs_wtstart for this region, then invoking wcs_flu() won't
	 * recurse on wcs_wtstart. In any case the interrupted wcs_wtstart invocation will take care
	 * of the dbsync_timer once it is done. Therefore in this case too no need to do the dbsync.
	 */
	dbsync_defer_timer = FALSE;
	if (!cnl->wcs_active_lvl && !csa->in_wtstart)
	{	/* Similar to wcs_stale, defer expensive IO flushing if any of the following is true.
		 *   1) We are in the midst of lseek/read/write IO. This could reset an lseek.
		 *   2) We are aquiring/releasing crit in any region (Strictly speaking it is enough
		 *		to check this in the current region, but doesn't harm us much).
		 *	Note that the function "mutex_deadlock_check" resets crit_count to 0 temporarily even though we
		 *	might actually be in the midst of acquiring crit. Therefore we should not interrupt mainline code
		 *	if we are in the "mutex_deadlock_check" as otherwise it presents reentrancy issues.
		 *   3) We have crit in the current region OR are in the middle of commit for this region (even though
		 *	we dont hold crit) OR are in wcs_wtstart (potentially holding write interlock and keeping another
		 *	process in crit waiting) OR we need to wait to obtain crit. At least one reason why we should not wait
		 *	to obtain crit is because the timeout mechanism for the critical section is currently (as of 2004 May)
		 *	driven by heartbeat on Tru64, AIX, Solaris and HPUX. The periodic heartbeat handler cannot pop as
		 *	it is a SIGALRM handler and cannot nest while we are already in a SIGALRM handler for the wcs_clean_dbsync.
		 *   	Were this to happen, we could end up waiting for crit, not being able to interrupt the wait
		 *   	with a timeout resulting in a hang until crit became available.
		 *   4) We are in a "fast lock".
		 *   5) We are in gtm_malloc. Don't want to recurse on malloc.
		 * Other deadlock causing conditions that need to be taken care of
		 *   1) We already have either the fsync_in_prog or the io_in_prog lock.
		 *   2) We are currently doing a db_fsync on some region.
		 */
		dbsync_defer_timer = TRUE;	/* assume deferral; reset below only if the flush actually happens */
		GET_LSEEK_FLAG(FILE_INFO(reg)->fd, lseekIoInProgress_flag);
		DEBUG_ONLY(
			/* We invoke grab_crit_immediate below which can potentially do cache-recoveries if cnl->wc_blocked is set.
			 * But wcs_recover has an assert that we never invoke it in the final retry. This is to avoid
			 * restarts in the final retry. But wcs_clean_dbsync invokes grab_crit_immediate only if we dont already
			 * hold crit and that means we have already finished commit on this particular region (e.g. if
			 * commit is complete on all regions and crit is released on all of them but before we reset t_tries
			 * to 0 in t_end/tp_tend) so it is okay to invoke wcs_recover in that case. Signal that to wcs_recover
			 * by setting ok_to_call_wcs_recover to TRUE. Need to save and restore the global as it could be
			 * TRUE or FALSE depending on where wcs_clean_dbsync interrupted mainline code.
			 */
			assert(CDB_STAGNATE >= t_tries || WBTEST_ENABLED(WBTEST_ANTIFREEZE_GVDATAFAIL));
			if (CDB_STAGNATE <= t_tries)
			{
				save_ok_to_call_wcs_recover = TREF(ok_to_call_wcs_recover);
				TREF(ok_to_call_wcs_recover) = TRUE;
			}
		)
		/* The single compound condition below encodes every deferral rule listed above; the
		 * grab_crit_immediate call is deliberately LAST so crit is only attempted once all the
		 * cheaper reentrancy checks have passed.
		 */
		if (!mupip_jnl_recover NOPIO_ONLY(&& (FALSE == lseekIoInProgress_flag))
			GTM_MALLOC_NO_RENT_ONLY(&& 0 == gtmMallocDepth)
			&& (0 == crit_count) && !in_mutex_deadlock_check
			&& (0 == fast_lock_count)
			&& (!jnl_qio_in_prog)      && (!db_fsync_in_prog)
			&& (!jpc || !jpc->jnl_buff || (LOCK_AVAILABLE == jpc->jnl_buff->fsync_in_prog_latch.u.parts.latch_pid))
			&& ((NULL == check_csaddrs) || !T_IN_CRIT_OR_COMMIT_OR_WRITE(check_csaddrs))
			&& !T_IN_CRIT_OR_COMMIT_OR_WRITE(csa)
			&& (FALSE != grab_crit_immediate(reg)))
		{	/* Note that grab_crit_immediate invokes wcs_recover in case cnl->wc_blocked is non-zero.  This means we
			 * could be doing cache recovery even though we are in interrupt code.  If this is found undesirable, the
			 * logic in grab_crit_immediate that invokes wcs_recover has to be re-examined.
			 */
			/* Note that if we are here, we have obtained crit using grab_crit_immediate. */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
			/* Do not invoke wcs_flu if the database has a newer journal file than what this process had open
			 * when the dbsync timer was started in wcs_wtstart. This is because mainline (non-interrupt) code
			 * in jnl_write_attempt/jnl_output_sp assumes that interrupt code will not update jpc structures to
			 * point to latest journal file (i.e. will not do a jnl_ensure_open) but wcs_flu might invoke just
			 * that. It is ok not to do a wcs_flu since whichever process did the journal switch would have
			 * written the EPOCH record in the older generation journal file. Therefore there is no need to
			 * start a new dbsync timer in this case.
			 *
			 * If journaling and writing EPOCHs, do a wcs_flu only if there has been at least one transaction
			 * since the last time someone wrote an EPOCH.
			 *
			 * If NOT journaling or if NOT writing EPOCHs, do a wcs_flu only if there has been at least one
			 * transaction since the last time someone did a wcs_flu.
			 *
			 * This way wcs_flu is not redundantly invoked and it ensures that the least number of epochs
			 * (only the necessary ones) are written OR the least number of db file header flushes are done.
			 *
			 * If MM and not writing EPOCHs, we need to flush the fileheader out as that is not mmap'ed.
			 */
			/* Write idle/free epoch only if db curr_tn did not change since when the last dirty cache record was
			 * written in wcs_wtstart to when the dbsync timer (5 seconds) popped. If the curr_tn changed it means
			 * some other update happened in between and things are no longer idle so the previous idle dbsync
			 * timer can be stopped. A new timer will be written when the later updates finish and leave the db
			 * idle again. Note that there are some race conditions where we might not be accurate in writing idle
			 * EPOCH only when necessary (since we dont hold crit at the time we record csa->dbsync_timer_tn). But
			 * any error will always be on the side of caution so we might end up writing more idle EPOCHs than
			 * necessary. Also, even if we dont write an idle EPOCH (for example because we found an update
			 * happened later but that update turned out to be a duplicate SET which will not start an idle
			 * EPOCH timer), journal recovery already knows to handle the case where an idle EPOCH did not get
			 * written. So things will still work but it might just take a little longer than usual.
			 */
			if (csa->dbsync_timer_tn == csa->ti->curr_tn)
			{	/* Note that it is possible in rare cases that an online rollback took csa->ti->curr_tn back
				 * and the exact # of updates happened concurrently to take csa->ti->curr_tn back to where it
				 * was to match csa->dbsync_timer_tn. In this case, we will be writing an epoch unnecessarily
				 * but this is a very rare situation that is considered okay to write the epoch in that case
				 * as it keeps the if check simple for the most frequent path.
				 */
				/* Ternary below: journaling with EPOCHs -> flush only if our journal file was not
				 * switched out from under us AND at least one tn committed since the last EPOCH;
				 * otherwise -> flush only if at least one tn committed since the last wcs_flu.
				 * (Rationale in the large comment block above.)
				 */
				if ((NULL != jpc) && JNL_HAS_EPOCH(jpc->jnl_buff)
					? (((NOJNL == jpc->channel) || !JNL_FILE_SWITCHED(jpc))
							&& (jpc->jnl_buff->epoch_tn < csa->ti->curr_tn))
					: (cnl->last_wcsflu_tn < csa->ti->curr_tn))
				{
					wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH | WCSFLU_CLEAN_DBSYNC
							| WCSFLU_SPEEDUP_NOBEFORE);
					BG_TRACE_PRO_ANY(csa, n_dbsync_writes);
					/* If MM, file could have been remapped by wcs_flu above.
					 * If so, cs_data needs to be reset.
					 */
					if (is_mm && (save_csaddrs == cs_addrs) && (save_csdata != cs_data))
						save_csdata = cs_addrs->hdr;
				}
			}
			dbsync_defer_timer = FALSE;	/* flush path completed (or was judged unnecessary) -- no re-arm */
			assert(!csa->hold_onto_crit); /* this ensures we can safely do unconditional rel_crit */
			rel_crit(reg);
		}
		DEBUG_ONLY(
			if (CDB_STAGNATE <= t_tries)
				TREF(ok_to_call_wcs_recover) = save_ok_to_call_wcs_recover;
		)
	}
	if (dbsync_defer_timer)
	{
		assert(SIZEOF(INTPTR_T) == SIZEOF(csa));
		/* Adding a new dbsync timer should typically be done in a deferred zone to avoid duplicate timer additions for the
		 * same TID. But, in this case, we are guaranteed that timers won't pop as we are already in a timer handler. As
		 * for the external interrupts, they should be okay to interrupt at this point since, unlike timer interrupts,
		 * control won't return to mainline code. So, in either case, we can safely add the new timer.
		 */
		if (!csa->dbsync_timer)
			START_DBSYNC_TIMER(csa, TIM_DEFER_DBSYNC);
	}
	/* To restore to former glory, don't use TP_CHANGE_REG, 'coz we might mistakenly set cs_addrs and cs_data to NULL
	 * if the region we are restoring to has been closed. Don't use tp_change_reg 'coz we might be ripping out the structures
	 * needed in tp_change_reg in gv_rundown. */
	gv_cur_region = save_region;
	cs_addrs = save_csaddrs;
	cs_data = save_csdata;
	return;
}