File: wcs_wterror.c

package info (click to toggle)
fis-gtm 7.1-006-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 32,908 kB
  • sloc: ansic: 344,906; asm: 5,184; csh: 4,859; sh: 2,000; awk: 294; makefile: 73; sed: 13
file content (150 lines) | stat: -rw-r--r-- 6,022 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/****************************************************************
 *								*
 * Copyright (c) 2016-2020 Fidelity National Information	*
 * Services, Inc. and/or its subsidiaries. All rights reserved.	*
 *								*
 *	This source code contains the intellectual property	*
 *	of its copyright holder(s), and is made available	*
 *	under a license.  If you do not know the terms of	*
 *	the license, please stop and do not read further.	*
 *								*
 ****************************************************************/

#include "mdef.h"

#include <errno.h>

#include "gdsroot.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsblk.h"
#include "gdsbml.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "interlock.h"
#include "relqueopi.h"
#include "gdsbgtr.h"
#include "aio_shim.h"
#include "gtmio.h"
#include "is_proc_alive.h"
#include "anticipatory_freeze.h"
#include "add_inter.h"
#include "wcs_wt.h"
#include "compswap.h"

/* Logging period for repeated write errors: wcs_wterror logs when the relevant error counter modulo this
 * value equals 1, i.e. on the 1st error and then on every 100th error after that.
 */
#define DBIOERR_LOGGING_PERIOD			100
/* Minimum interval between two ENOSPC reports issued by wcs_wterror; compared against time() values. */
#define DSKSPACE_MSG_INTERVAL 			60 	/* 60 seconds, epoch time */

#ifdef USE_LIBAIO
/* When non-NULL, wcs_wterror reports this string as the name of the call that failed and then resets
 * the pointer to NULL. It is defined and set outside this file.
 */
GBLREF char	*aio_shim_errstr;
#endif

/* Message codes referenced by wcs_wterror. */
error_def(ERR_DBFILERR);
error_def(ERR_DBIOERR);
error_def(ERR_SYSCALL);
error_def(ERR_ENOSPCQIODEFER);

/* Number of EAGAIN errors counted by wcs_wterror (when USE_LIBAIO is defined, only those whose
 * aio_shim_errstr is set and is not "io_submit()"); used to rate-limit the corresponding message.
 */
STATICDEF volatile uint4 	eagain_error_count;

/* Report a failed write to the database file on disk, rate-limiting the operator log messages.
 *
 * This function is called from wcs_wtstart (for noasyncio and asyncio cases) and/or wcs_wtfini (for asyncio)
 * when they each encounter an error in a write to the database file on disk. It could be ENOSPC or some
 * other IO error. Handle all of them by sending periodic syslog messages etc.
 *
 * Parameters:
 *	reg		- region whose database file write failed.
 *	save_errno	- error code saved from the failed write (an errno value or ERR_ENOSPCQIODEFER).
 *
 * Handling, by value of save_errno:
 *	ENOSPC			- DBFILERR is sent at most once per DSKSPACE_MSG_INTERVAL seconds; the process
 *				  that wins the compare-and-swap on cnl->dskspace_next_fire does the reporting.
 *	EAGAIN			- DBFILERR/SYSCALL is sent on the 1st and every DBIOERR_LOGGING_PERIOD-th counted
 *				  occurrence; with USE_LIBAIO an EAGAIN attributed to "io_submit()" is not counted.
 *	ERR_ENOSPCQIODEFER	- nothing is logged.
 *	anything else		- cnl->wtstart_errcnt is bumped and DBIOERR is sent on the 1st and every
 *				  DBIOERR_LOGGING_PERIOD-th occurrence.
 *
 * Returns nothing.
 */
void	wcs_wterror(gd_region *reg, int4 save_errno)
{
	unix_db_info		*udi;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	gtm_uint64_t		dskspace_next_fire;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	BG_TRACE_PRO_ANY(csa, wcs_wterror_invoked);
	cnl = csa->nl;
	if (ENOSPC == save_errno)
	{	/* Determine whether or not to ignore this error, based on when the last ENOSPC error was reported. */
		dskspace_next_fire = cnl->dskspace_next_fire;
		if ((dskspace_next_fire + DSKSPACE_MSG_INTERVAL <= time(NULL))
				/* We use a CAS instruction to ensure that concurrent accesses to this location don't fire
				 * multiple times by different processes; the first one to swap dskspace_next_fire is the
				 * only one to report the ENOSPC error. A blind interlock_add() would not prevent this.
				 */
				&& COMPSWAP_LOCK((sm_global_latch_ptr_t)&cnl->dskspace_next_fire,
						 dskspace_next_fire, 0, time(NULL), 0))
		{	/* Report ENOSPC errors for first time and every minute after that.
			 * The message is DBFILERR: only the database file name (2 FAO arguments) is supplied here,
			 * whereas DBIOERR is issued with 4 FAO arguments (region name and file name) further below.
			 */
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
				 ERR_TEXT, 2, RTS_ERROR_TEXT("Error during flush write"), save_errno);
			if (!IS_GTM_IMAGE)
			{	/* Images other than GTM also get the message through gtm_putmsg_csa. */
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Error during flush write"), save_errno);
			}
		}
	} else if (EAGAIN == save_errno)
	{ 	/* When using POSIX AIO we don't ever expect to see an EAGAIN error. */
		assert(IF_LIBAIO_ELSE(NULL != aio_shim_errstr, FALSE));
		/* If EAGAIN occurs from "io_submit", do not treat it as an ERROR. We know it can happen if more than the
		 * allocated aio slots are issued as writes concurrently by this process. In that case, the count of
		 * wcs_wterror_invoked in file header is enough to indicate how many times such events occurred.
		 */
#		ifdef USE_LIBAIO
		if ((NULL != aio_shim_errstr) && STRCMP(aio_shim_errstr, "io_submit()"))
		{
#		endif
			eagain_error_count++;
			if (1 == (eagain_error_count % DBIOERR_LOGGING_PERIOD))
			{	/* See below; every 100th failed attempt, issue a warning. We cannot issue a DBIOERR in
				 * the case of an EAGAIN because it is innocuous and can easily be retried -- a DBIOERR
				 * will freeze the database forcing us to not perform a retry at all.
				 */
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					LEN_AND_STR(IF_LIBAIO_ELSE(aio_shim_errstr, "aio_write()")), CALLFROM, save_errno);
			}
#		ifdef USE_LIBAIO
			/* The syscall name has been consumed (whether or not it was logged this time). */
			aio_shim_errstr = NULL;
		}
#		endif
	} else if (ERR_ENOSPCQIODEFER != save_errno)
	{
		cnl->wtstart_errcnt++;
		if (1 == (cnl->wtstart_errcnt % DBIOERR_LOGGING_PERIOD))
		{	/* Every 100th failed attempt, issue an operator log indicating an I/O error.
			 * wcs_wtstart is typically invoked during periodic flush timeout and since there
			 * cannot be more than 2 pending flush timers per region, number of concurrent
			 * processes issuing the below send_msg should be relatively small even if there
			 * are 1000s of processes.
			 */
			/* Below assert is to account for some white-box tests which exercise this code as
			 * well as tests which could trigger a CRYPTOPFAILED inside the encryption plugin.
			 * Neither of those are real IO errors.
			 */
			assert(gtm_white_box_test_case_enabled
				|| (SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTOPFAILED)) == save_errno));
#			ifdef USE_LIBAIO
			if (NULL == aio_shim_errstr)
			{
#			endif
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_DBIOERR, 4, REG_LEN_STR(reg), DB_LEN_STR(reg),
						save_errno);
#			ifdef USE_LIBAIO
			} else
			{	/* If the error string was set, then we can output the syscall that failed as well.
				 * Argument count is 14: 6 for the DBIOERR group (code, FAO count, 4 FAO arguments),
				 * 7 for the SYSCALL group (code, FAO count, 5 FAO arguments) and 1 for save_errno.
				 */
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(14) ERR_DBIOERR, 4, REG_LEN_STR(reg), DB_LEN_STR(reg),
						ERR_SYSCALL, 5, LEN_AND_STR(aio_shim_errstr), CALLFROM,
						save_errno);
				aio_shim_errstr = NULL;
			}
#			endif
		}
	}
	/* If (ERR_ENOSPCQIODEFER == save_errno): the write (DB_LSEEKWRITE, issued outside this function)
	 * encountered ENOSPC but could not trigger a freeze as it did not hold crit. It is okay to return
	 * as this is not a critical write.
	 * Eventually, some crit holding process will trigger a freeze and wait for space to be freed up.
	 * Analogously, if we detected that encryption settings have changed during a transaction, it is OK
	 * to skip this write because this transaction will be retried after encryption settings update in
	 * t_retry or tp_restart.
	 */
	return;
}