/****************************************************************
* *
* Copyright (c) 2006-2023 Fidelity National Information *
* Services, Inc. and/or its subsidiaries. All rights reserved. *
* *
* This source code contains the intellectual property *
* of its copyright holder(s), and is made available *
* under a license. If you do not know the terms of *
* the license, please stop and do not read further. *
* *
****************************************************************/
#include "mdef.h"
#include "gtm_string.h"
#include "gtm_inet.h"
#include <sys/time.h>
#include <errno.h>
#ifdef UNIX
#include <sys/sem.h>
#endif
#ifdef VMS
#include <descrip.h> /* Required for gtmsource.h */
#endif
#include "gdsroot.h"
#include "gdsblk.h"
#include "gtm_facility.h"
#include "fileinfo.h"
#include "gdsbt.h"
#include "gdsfhead.h"
#include "filestruct.h"
#include "jnl.h"
#include "repl_msg.h"
#include "gtmsource.h"
#include "repl_dbg.h"
#include "gtm_stdio.h"
#include "repl_shutdcode.h"
#include "repl_sem.h"
#include "repl_sp.h"
#include "repl_log.h"
#include "is_proc_alive.h"
#include "gtmmsg.h"
#include "sgtm_putmsg.h"
#include "util.h"
/* Variables declared via GBLREF (presumably defined in another module) that gtmsource_checkhealth() reads.
 * NOTE(review): gtmsource_options is declared here but is not referenced by the code visible in this file.
 */
GBLREF jnlpool_addrs_ptr_t jnlpool;
GBLREF gtmsource_options_t gtmsource_options;
GBLREF boolean_t holds_sem[NUM_SEM_SETS][NUM_SRC_SEMS];
GBLREF gd_addr *gd_header;
/* Message codes passed to gtm_putmsg_csa() / sgtm_putmsg() in gtmsource_checkhealth() */
error_def(ERR_NOTALLDBOPN);
error_def(ERR_REPLJNLCLOSED);
error_def(ERR_SRCSRVNOTEXIST);
error_def(ERR_TEXT);
/* Report the health of the source server(s) recorded in the journal pool.
 *
 * If jnlpool->gtmsource_local is non-NULL, only that source server slot is examined. Otherwise every slot of
 * jnlpool->gtmsource_local_array that has a secondary instance name and a non-zero pid is examined, and the
 * number of servers found alive is compared against the value of the SRC_SERV_COUNT_SEM semaphore.
 * Afterwards all database regions are opened; each region for which REPL_WAS_ENABLED() holds is reported
 * (REPLJNLCLOSED), and an instance freeze, if one is in effect, is reported as well.
 *
 * Entry : the caller holds JNL_POOL_ACCESS_SEM of the SOURCE semaphore set (asserted). This function releases
 *         it before opening the database regions.
 * Return: the bitwise OR of the SRV_ALIVE / SRV_DEAD / SRV_ERR conditions encountered, plus NORMAL_SHUTDOWN.
 */
int gtmsource_checkhealth(void)
{
	uint4			gtmsource_pid;
	int			status, semval, save_errno;
	boolean_t		srv_alive, all_files_open;
	gtmsource_local_ptr_t	gtmsourcelocal_ptr;
	int4			index, num_servers;
	seq_num			reg_seqno, jnlseqno;
	gd_region		*reg, *region_top;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			errtxt[OUT_BUFF_SIZE];
	char			*modestr;

	assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
	if (NULL != jnlpool->gtmsource_local)	/* Check health of a specific source server */
		gtmsourcelocal_ptr = jnlpool->gtmsource_local;
	else					/* Check health of all source servers : start at the first slot */
		gtmsourcelocal_ptr = &jnlpool->gtmsource_local_array[0];
	num_servers = 0;
	status = SRV_ALIVE;
	for (index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++)
	{
		if ('\0' == gtmsourcelocal_ptr->secondary_instname[0])
		{	/* Slot has no secondary instance name. The assert records that this is only expected
			 * when scanning all slots, not when a specific slot was selected.
			 */
			assert(NULL == jnlpool->gtmsource_local);
			continue;
		}
		gtmsource_pid = gtmsourcelocal_ptr->gtmsource_pid;
		/* If CHECKHEALTH on a specific secondary instance is requested, print the health information irrespective
		 * of whether a source server for that instance is alive or not. For CHECKHEALTH on ALL secondary instances
		 * print health information only for those instances that have an active or passive source server alive.
		 */
		if ((NULL == jnlpool->gtmsource_local) && (0 == gtmsource_pid))
			continue;
		repl_log(stdout, TRUE, TRUE, "Initiating CHECKHEALTH operation on source server pid [%d] for secondary instance"
			" name [%s]\n", gtmsource_pid, gtmsourcelocal_ptr->secondary_instname);
		/* A zero pid means no server is recorded for this slot, so skip the liveness probe in that case */
		srv_alive = (0 == gtmsource_pid) ? FALSE : is_proc_alive(gtmsource_pid, 0);
		if (srv_alive)
		{
			if (GTMSOURCE_MODE_ACTIVE == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE";
			else if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE REQUESTED";
			else if (GTMSOURCE_MODE_PASSIVE == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE";
			else if (GTMSOURCE_MODE_PASSIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE REQUESTED";
			else
			{	/* Unexpected mode. The field is compared with itself, so this assert is expected to
				 * fire wherever asserts are active; otherwise the mode is reported as "UNKNOWN".
				 */
				assert(gtmsourcelocal_ptr->mode != gtmsourcelocal_ptr->mode);
				modestr = "UNKNOWN";
			}
			repl_log(stderr, FALSE, TRUE, FORMAT_STR1, gtmsource_pid, "Source server", "", modestr);
			status |= SRV_ALIVE;
			num_servers++;
		} else
		{
			repl_log(stderr, FALSE, TRUE, FORMAT_STR, gtmsource_pid, "Source server", " NOT");
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_SRCSRVNOTEXIST, 2,
				LEN_AND_STR(gtmsourcelocal_ptr->secondary_instname));
			status |= SRV_DEAD;
		}
		if (NULL != jnlpool->gtmsource_local)
			break;	/* Only the one requested slot is examined */
	}
	if (NULL == jnlpool->gtmsource_local)
	{	/* Compare number of servers that were found alive with the current value of the COUNT semaphore.
		 * If they are not equal, report the discrepancy.
		 */
		semval = get_sem_info(SOURCE, SRC_SERV_COUNT_SEM, SEM_INFO_VAL);
		if (-1 == semval)
		{
			save_errno = errno;	/* capture errno before any further call can overwrite it */
			repl_log(stderr, FALSE, TRUE,
				"Error fetching source server count semaphore value : %s\n", STRERROR(save_errno));
			status |= SRV_ERR;
		} else if (semval != num_servers)
		{
			repl_log(stderr, FALSE, FALSE,
				"Error : Expected %d source server(s) to be alive but found %d actually alive\n",
				semval, num_servers);
			repl_log(stderr, FALSE, TRUE, "Error : Check if any pid reported above is NOT a source server process\n");
			status |= SRV_ERR;
		}
	}
	rel_sem(SOURCE, JNL_POOL_ACCESS_SEM);
	/* Check that there are no regions with replication state = WAS_ON (i.e. repl_was_open). If so report that.
	 * But to determine that, we need to attach to all the database regions.
	 */
	gvinit();
	/* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. */
	all_files_open = region_init(FALSE);
	if (!all_files_open)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN);
		status |= SRV_ERR;
	} else
	{
		for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++)
		{
			csa = &FILE_INFO(reg)->s_addrs;
			csd = csa->hdr;
			if (REPL_WAS_ENABLED(csd))
			{
				assert(!JNL_ENABLED(csd) || REPL_ENABLED(csd)); /* || is for turning replication on concurrently */
				reg_seqno = csd->reg_seqno;
				/* Fall back to MAX_SEQNO when no journal pool control structure is available */
				jnlseqno = (NULL != jnlpool->jnlpool_ctl) ? jnlpool->jnlpool_ctl->jnl_seqno : MAX_SEQNO;
				/* Each sequence number is passed twice, by address, to the message formatter */
				sgtm_putmsg(errtxt, OUT_BUFF_SIZE, VARLSTCNT(12) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(reg),
					&reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno, ERR_TEXT, 2,
					RTS_ERROR_TEXT("Replication will continue using records in the replication journal pool,"
					" but will fail if operation requires access to journal files"));
				/* errtxt embeds the database file name, so never use it as the format string itself */
				repl_log(stderr, FALSE, TRUE, "%s", errtxt);
				status |= SRV_ERR;
			}
		}
	}
	/* Guard the dereference the same way the jnl_seqno fetch above does */
	if ((NULL != jnlpool->jnlpool_ctl) && jnlpool->jnlpool_ctl->freeze)
	{
		repl_log(stderr, FALSE, FALSE, "Warning: Instance Freeze is ON\n");
		repl_log(stderr, FALSE, TRUE, " Freeze Comment: %s\n", jnlpool->jnlpool_ctl->freeze_comment);
		status |= SRV_ERR;
	}
	return (status + NORMAL_SHUTDOWN);
}