File: wcs_clean_dbsync_ast.c

package info (click to toggle)
fis-gtm 6.2-000-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 30,784 kB
  • ctags: 42,554
  • sloc: ansic: 358,483; asm: 4,847; csh: 4,574; sh: 2,261; awk: 200; makefile: 86; sed: 13
file content (306 lines) | stat: -rw-r--r-- 15,226 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/****************************************************************
 *								*
 *	Copyright 2001, 2013 Fidelity Information Services, Inc	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <stddef.h>		/* for offsetof macro */
#include "gdsroot.h"
#include "gtm_facility.h"
#include "gdskill.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"		/* for the FILE_INFO macros */
#include "jnl.h"
#include "iosp.h"
#include "efn.h"		/* for efn_immed_wait and efn_ignore */
#include "gdsbgtr.h"		/* for the BG_TRACE_PRO_ANY macros */
#include "timers.h"		/* for TIM_AST_WAIT */
#include "wcs_phase2_commit_wait.h"

/* Upper bound on passes through the wait-for-a-free-AST-queue-entry loop in wcs_clean_dbsync_ast.
 * The original note states each pass is about 5 msec with a 30 second ceiling overall.
 * NOTE(review): 600 passes of 5 msec is 3 seconds, not 30 -- confirm against TIM_AST_WAIT in timers.h.
 */
#define MAX_DBSYNC_LOOPS	600

/* Upper bound on how many times wcs_clean_dbsync_ast requeues itself before giving up on the sync:
 * 10 deferrals of 5 sec (TIM_DEFER_DBSYNC) each, i.e. about 50 seconds in total.
 */
#define MAX_DBSYNC_DEFERS	10

/* GTM_MALLOC_NO_RENT_ONLY(X) expands to X only in builds where GTM_MALLOC_RENT is NOT defined;
 * otherwise it expands to nothing.
 */
#if !defined(GTM_MALLOC_RENT)
#	define GTM_MALLOC_NO_RENT_ONLY(X)	X
#else
#	define GTM_MALLOC_NO_RENT_ONLY(X)
#endif

/* Delta time passed to sys$setimr when wcs_clean_dbsync_ast requeues itself for a later attempt */
GBLDEF	int4			defer_dbsync[2] = { TIM_DEFER_DBSYNC, -1 };	/* picked from timers.h */

/* Globals defined elsewhere. The notes below describe only how wcs_clean_dbsync_ast in this file uses them. */
/* Used to locate the sgmnt_addrs of the current region (if it is open) so its crit/commit/write state can be checked */
GBLREF	gd_region		*gv_cur_region;
/* If non-zero, the sync attempt is deferred */
GBLREF	int4			wtfini_in_prog;
/* Decremented before queueing a timer AST and incremented when an invocation of the AST routine finishes */
GBLREF	short			astq_dyn_avail;
/* Must be 0 for the sync to proceed (only checked when GTM_MALLOC_RENT is not defined) */
GBLREF	volatile int4		gtmMallocDepth;		/* Recursion indicator */
/* Stored into jb->blocked while checking whether a journal io is in progress */
GBLREF	uint4			process_id;
/* If TRUE, the sync attempt is deferred */
GBLREF	boolean_t		mupip_jnl_recover;
/* Journal globals: dont_reset_gbl_jrec_time is read and jgbl is passed to ADJUST_GBL_JREC_TIME */
GBLREF 	jnl_gbls_t		jgbl;
/* If non-zero, the sync attempt is deferred */
GBLREF	volatile int4		fast_lock_count;
/* If non-zero, the sync attempt is deferred */
GBLREF	volatile int4		crit_count;
/* If TRUE, the sync attempt is deferred */
GBLREF	volatile boolean_t	in_mutex_deadlock_check;

/* Message codes used in the send_msg call issued when jnl_flush does not return SS_NORMAL */
error_def(ERR_JNLFLUSH);
error_def(ERR_TEXT);

/* Sync the filehdr (and epoch in the journal file if before imaging). The goal is to sync the database,
 * but if we find ourselves in a situation where we need to block on someone else, then we defer this to
 * the next round. This is the equivalent of the Unix wcs_clean_dbsync() routine.
 *
 * Parameter:
 *	csa - sgmnt_addrs of the region to sync. This routine is queued as a timer AST (see the sys$setimr
 *	      call near the end, which requeues it with the same csa as the AST parameter).
 *
 * Returns nothing. Outcomes:
 *	- nothing to do (timer flag already clear, MM access method or region closed): clear
 *	  csa->dbsync_timer and return.
 *	- sync done (fileheader_sync, plus an epoch record and jnl_flush if before-image journaling) or
 *	  found to be unnecessary: clear csa->dbsync_timer.
 *	- sync cannot be done safely right now: requeue this routine through sys$setimr, at most
 *	  MAX_DBSYNC_DEFERS times, after which the sync is skipped.
 *
 * AST queue accounting: every exit path does exactly one net astq_dyn_avail++ for the entry this
 * invocation is finishing with; a successful requeue additionally does one astq_dyn_avail-- for the
 * newly queued timer.
 */
void	wcs_clean_dbsync_ast(sgmnt_addrs *csa)
{
	static readonly int4	pause[2] = { TIM_AST_WAIT, -1 };	/* picked from wcs_timer_start */
	boolean_t		bimg_jnl, dbsync_defer_timer;		/* bimg_jnl --> before-imaging or not */
	cache_que_head		*crqwip;
	int			counter, status;
	gd_region		*reg;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jb;
	node_local_ptr_t	cnl;
	sgmnt_addrs		*save_csa;
	sgmnt_data_ptr_t	csd;
	void			fileheader_sync();
	uint4			jnl_status;

	assert(lib$ast_in_prog());	/* If dclast fails and setast is used, this assert trips, but in that
					 * case, we anyway want to know why we needed setast. */
	/* Although csa->dbsync_timer is almost always TRUE if here, there is a small possibility it is FALSE. This is
	 * possible if we are currently in gds_rundown for this region where the flag is reset to FALSE irrespective
	 * of whether we have a pending timer or a sys$qio-termination-signalling-ast. In the case dbsync_timer is
	 * FALSE, return. There is a very remote possibility that we miss syncing the db if the qio of the last
	 * dirty buffer finishes after we die and we are not the last writer. In this case the sync won't be done
	 * since all the maintenance is process-private. But that possibility is too remote and we will live with
	 * it for now since otherwise we need to implement grander mechanisms involving shared memory and the like.
	 */
	reg = csa->region;
	assert(FALSE == csa->dbsync_timer || reg->open);
	/* Don't know how this can happen, but if region is closed, just return in PRO */
	/* In MM, not yet sure whether it will work */
	if (FALSE == csa->dbsync_timer || dba_mm == reg->dyn.addr->acc_meth || !reg->open)
	{
		csa->dbsync_timer = FALSE;
		astq_dyn_avail++;
		return;
	}
	/* Save to see if we are in crit anywhere */
	save_csa = ((NULL == gv_cur_region || FALSE == gv_cur_region->open) ?  NULL : (&FILE_INFO(gv_cur_region)->s_addrs));
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	cnl = csa->nl;
	jpc = csa->jnl;
	BG_TRACE_PRO_ANY(csa, n_dbsync_timers);
	assert(!JNL_ALLOWED(csd) || NULL != jpc);

	/* Note that even if the active queue was emptied when this routine was called, due to
	 * concurrent update activity, cnl->wcs_active_lvl can be non-zero when we reach here. We
	 * defer syncing the database in this case to the next time the active queue becomes empty or
	 * when we reach the next scheduled epoch_time (only if before-imaging) whichever is earlier.
	 */
	dbsync_defer_timer = FALSE;
	if (!cnl->wcs_active_lvl)
	{	/* Currently VMS timer writes don't have the optimizations for deferring expensive IO at
		 * critical times that exist in Unix. Need to get them (those that apply) to VMS too. They are
		 *   1) We are in the midst of lseek/read/write IO. This could reset an lseek. (Doesn't apply to VMS).
		 *   2) We are acquiring/releasing crit in any region (Strictly speaking it is enough
		 *		to check this in the current region, but doesn't harm us much).
		 *	Note that the function "mutex_deadlock_check" resets crit_count to 0 temporarily even though we
		 *	might actually be in the midst of acquiring crit. Therefore we should not interrupt mainline code
		 *	if we are in the "mutex_deadlock_check" as otherwise it presents reentrancy issues.
		 *   3) We have crit in the current region OR are in the middle of commit for this region (even though
		 *	we don't hold crit) OR are in wcs_wtstart (potentially holding write interlock and keeping another
		 *	process in crit waiting) OR we need to wait to obtain crit.
		 *   4) We are in a "fast lock".
		 * Out of the above, items (2) & (3) are currently being taken care of below since they can cause
		 *	deadlocks (if not taken care of) while the others are just performance enhancements. Note
		 *	that the last part of (3) is taken care of by doing a grab_crit_immediate() rather than a grab_crit().
		 * Also to be taken care of are the following situations.
		 *   1) We are currently in wcs_wtfini be it the same or a different region.
		 *	To avoid reentrancy issues (if same region) and deadlock issues (if different region).
		 *   2) We are currently in malloc(). Although nested malloc() now works and we won't be needing it
		 *	as much, want to be paranoid here since there are quite a few functions called from here.
		 * Other reentrancy issues to be taken care of are
		 *   1) Avoid doing recursive wcs_recovers.
		 */
		dbsync_defer_timer = TRUE;	/* assume we have to defer until one of the paths below proves otherwise */
		crqwip = &csa->acc_meth.bg.cache_state->cacheq_wip;
		if (!mupip_jnl_recover && 0 == crit_count && !in_mutex_deadlock_check && !wtfini_in_prog && !fast_lock_count
			GTM_MALLOC_NO_RENT_ONLY(&& 0 == gtmMallocDepth)
			&& ((NULL == save_csa) || !T_IN_CRIT_OR_COMMIT_OR_WRITE(save_csa))
			&& !T_IN_CRIT_OR_COMMIT_OR_WRITE(csa)
			&& (TRUE == grab_crit_immediate(reg)))
		{	/* Note that if we are here, we have obtained crit using grab_crit_immediate. Also grab_crit_immediate
			 * doesn't call wcs_recover if wc_blocked is TRUE in order to prevent possible deadlocks.
			 * Note that mutex_lockwim() cannot be used since crit_count is not maintained there.
			 */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
			/* If wcs_wtfini() returns FALSE, it means the cache is suspect. But we are in interrupt code
			 * and therefore want to play it safe. Hence we will not set wc_blocked. We will defer writing
			 * epoch and wait for a future call to mainline code to detect this and initiate cache recovery.
			 */
			/* Wait for ALL active phase2 commits to complete first. If they don't complete in time then defer
			 * writing the epoch. Also don't wait if cnl->wc_blocked is already set to TRUE. In that case
			 * defer writing the EPOCH unconditionally. */
			if (!cnl->wc_blocked && (!cnl->wcs_phase2_commit_pidcnt || wcs_phase2_commit_wait(csa, NULL))
				&& wcs_wtfini(reg)) /* wcs_wtfini handles calls from ASTs appropriately */
			{
				/* Default to "not before-imaging" so that bimg_jnl is well defined on every path,
				 * including journaling enabled WITHOUT before images (previously left uninitialized).
				 */
				bimg_jnl = FALSE;
				if (JNL_ENABLED(csd))
				{
					jb = jpc->jnl_buff;	/* jb is only dereferenced below when bimg_jnl is TRUE */
					if (jb->before_images)
						bimg_jnl = TRUE;
				}
				/* Note that if before-imaging and we haven't opened the journal file, then we
				 * can't write an epoch record here because opening the jnl file involves a
				 * heavyweight routine jnl_file_open() which is risky in this ast-prone code.
				 * Also, if before-imaging and the journal file has been switched since the time the
				 * dbsync timer started, we do not want to do any writes as they will go to the older
				 * generation journal file. It is ok not to write an EPOCH record in the older generation
				 * journal file because whichever process did the journal file switch would have done
				 * exactly that. And therefore there is no need to start a new dbsync timer in this case.
				 */
				if (cnl->wcs_active_lvl || (bimg_jnl && ((NOJNL == jpc->channel) || JNL_FILE_SWITCHED(jpc))))
					dbsync_defer_timer = FALSE;	/* don't/can't write epoch. */
				else if (0 == crqwip->fl)
				{
					if (!bimg_jnl)
					{	/* Entire wip queue is flushed. So sync the file-header now */
						assert(cnl->wc_in_free == csd->n_bts);
						BG_TRACE_PRO_ANY(csa, n_dbsync_writes);
						fileheader_sync(reg);	/* sync the fileheader to disk */
						dbsync_defer_timer = FALSE;
					} else if (jb->dskaddr == jb->freeaddr)
					{	/* Entire wip queue and jnl buffer is flushed. So write an epoch record now. */
						assert(cnl->wc_in_free == csd->n_bts);
						BG_TRACE_PRO_ANY(csa, n_dbsync_writes);
						fileheader_sync(reg);	/* sync the fileheader to disk */
						/* To avoid deadlocks (e.g. we waiting for a jnl_flush while someone
						 * is holding the io_in_prog lock) we use a kludge. Setting jb->blocked
						 * prevents others from picking up the io_in_prog lock. We then check
						 * whether there is anyone holding the lock. If so, we defer writing the
						 * epoch to the next round and if not go ahead with the flush. Note that
						 * "someone" above includes ourselves too since the qio we have done prior
						 * to entering wcs_wipchk_ast will again be delivered as a jnl_qio_end AST
						 * which will again be blocked.
						 */
						jb->blocked = process_id;
						if (!jb->io_in_prog)
						{
							assert(NOJNL != jpc->channel);
							/* Since the journal buffer is flushed to disk at this point
							 * we don't expect any other routines (like jnl_write_attempt etc.)
							 * to be called. Also since the epoch-record is less than a hundred
							 * bytes, we don't expect a jnl_qio_start to be called at the end
							 * of jnl_write(). We also assume that the check for extension of
							 * journal file takes into account space for an epoch + eof + align.
							 * Note that the assert below checks that the min_write_size (the value
							 * needed to trigger a jnl_qio_write) is less than the maximum number of
							 * bytes that will be written in the journal buffer by jnl_write_epoch_rec
							 * (= size of the epoch record + maximum size of align record if needed).
							 */
							/* Is there a correctness issue if the file gets extended? The assumption
							 * about space check for epoch + eof + align may not be correct. Also,
							 * now we may be writing a PINI as well. Vinaya, 2003, May 2. Check with
							 * Narayanan */
							assert(2 * EPOCH_RECLEN + PINI_RECLEN + 3 * MIN_ALIGN_RECLEN <
									jb->min_write_size);
							assert(csa->ti->curr_tn == csa->ti->early_tn);
							/* There is no need for jnl_ensure_open here since we have crit and
							 * have already determined that the journal file has not been switched.
							 */
							/* Initialize gbl_jrec_time if necessary before jnl_put_jrt_pini */
							if (!jgbl.dont_reset_gbl_jrec_time)
								SET_GBL_JREC_TIME;
							/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time (if needed) to
							 * maintain time order of jnl records. This needs to be done BEFORE
							 * writing any records to the journal file.
							 */
							ADJUST_GBL_JREC_TIME(jgbl, jb);
							if (0 == jpc->pini_addr)
							{	/* In the rare case when we haven't done any updates to the db (till
								 * now only db reads), but had to flush the jnl buffer and cache due to
								 * lack of cache buffer (flush trigger mechanism in t_qread) we may not
								 * have written our PINI record yet.
								 */
								jnl_put_jrt_pini(csa);
							}
							jnl_write_epoch_rec(csa);
							INCR_GVSTATS_COUNTER(csa, cnl, n_jrec_epoch_idle, 1);
							/* Need to flush this epoch record out */
							jnl_status = jnl_flush(reg);	/* handles calls from ASTs appropriately */
							if (SS_NORMAL == jnl_status)
							{
								assert(jb->dskaddr == jb->freeaddr);
								dbsync_defer_timer = FALSE;
								assert(0 == jb->blocked);    /* jnl_flush should have reset this.*/
								if (process_id == jb->blocked)
									jb->blocked = 0;
							} else
							{
								send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
									ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in wcsdbsyncast"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						} else
							jb->blocked = 0;	/* someone holds io_in_prog; undo the block, retry later */
					} else
						jnl_start_ast(jpc);	/* Start a journal write and defer epoch writing. */
				}
			}
			rel_crit(reg);
		}
	}
	if (FALSE != dbsync_defer_timer)
	{
		for (counter = 0; 1 > astq_dyn_avail; counter++)
		{	/* Wait until we have room to queue our timer AST for wcs_clean_dbsync_ast. */
			assert(FALSE);
			if (SS$_NORMAL == sys$setimr(efn_timer_ast, &pause, 0, 0, 0))
				sys$synch(efn_timer_ast, 0);
			if (counter > MAX_DBSYNC_LOOPS)
			{
				csa->dbsync_timer = FALSE;
				astq_dyn_avail++;
				return;		/* in this case, we skip syncing the db. */
			}
		}
		astq_dyn_avail--;	/* reserve a queue entry for the timer we are about to (re)queue */
		if (MAX_DBSYNC_DEFERS > csa->dbsync_timer++)
		{
			status = sys$setimr(efn_ignore, &defer_dbsync[0], wcs_clean_dbsync_ast, csa, 0);
			if (0 == (status & 1))
			{
				assert(FALSE);
				csa->dbsync_timer = FALSE;
				astq_dyn_avail++;	/* in this case too, we skip syncing the db. */
			}
		} else
		{	/* We have deferred the dbsync timer at least MAX_DBSYNC_DEFERS times (nearly 50 seconds). We cannot keep
			 * doing this indefinitely as it is possible that whatever is causing us to defer this timer (crit_count
			 * being non-zero etc.) is in turn blocked because it needs a timer queue entry but cannot find one due
			 * to wcs_clean_dbsync_ast eternally using up the same (eats up the TQELM job/process quota). Therefore
			 * to avoid a potential deadlock, we stop requeueing ourselves even though it means we will skip syncing
			 * the db. The only one that cares for this dbsync is journal recovery which anyways has been worked around
			 * to take care of indefinite deferring (equivalent to skipping the syncing) so that should not be an issue.
			 */
			csa->dbsync_timer = FALSE;	/* in this case, we skip syncing the db. */
			/* No timer gets queued on this path, so give back the entry reserved just above (same as the
			 * sys$setimr failure path). Without this the count of available entries drops by one for good.
			 */
			astq_dyn_avail++;
		}
	} else
		csa->dbsync_timer = FALSE;
	astq_dyn_avail++;	/* release the queue entry this invocation was using */
	return;
}