File: failsafe.c

package info (click to toggle)
netstd 3.07-2hamm.5
  • links: PTS
  • area: main
  • in suites: hamm
  • size: 6,384 kB
  • ctags: 9,087
  • sloc: ansic: 72,547; cpp: 6,141; makefile: 1,681; yacc: 1,615; sh: 1,220; perl: 303; awk: 46
file content (201 lines) | stat: -rw-r--r-- 4,592 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/*
 * failsafe.c
 *
 * Copyright (C) 1998, <okir@monad.swb.de>
 *
 * Implements fail-safe mode for nfsd/mountd.
 */

#include "system.h"
#include "logging.h"
#include "signals.h"
#include <sys/wait.h>

static const char *	get_signame(int signo);

/*
 * Turn the calling process into a supervisor for ncopies forked servers.
 *
 * Every forked child returns from this function and carries on with
 * whatever the caller does next. The parent never returns: it loops
 * waiting for children to terminate and decides per child what to do:
 *
 *   - killed by SIGTERM, or exited normally: the child is not replaced
 *     and the number of servers kept alive shrinks by one; when the last
 *     one is gone the parent calls exit(0);
 *   - killed by any other signal, or a wait status that is neither
 *     "signaled" nor "exited": a replacement child is forked.
 *
 * Parameters:
 *   level    not used by this function.
 *   ncopies  number of server processes to keep running.
 */
void
failsafe(int level, int ncopies)
{
	int	*servers, running, child, i;
	int	pid, signo, status;
	time_t	last_restart = 0, now;
	int	restarts = 0, backoff = 60;

	(void) level;

	/* servers[i] holds the pid running in slot i, or 0 for a free slot */
	servers = (int *) xmalloc(ncopies * sizeof(int));
	memset(servers, 0, ncopies * sizeof(int));

	/* Loop forever, until we get SIGTERM */
	running = 0;
	while (1) {
		/* Fork until every remaining slot is occupied */
		while (running < ncopies) {
			/*
			 * Restart throttling: count forks that happen within
			 * the same wall-clock second. More than 2 * ncopies
			 * of them doubles the delay (while it is below one
			 * hour) and sleeps before forking again.
			 *
			 * NOTE(review): after a sleep the next pass sees a
			 * new second and resets backoff to 60, so the delay
			 * appears unable to grow past 120 seconds - confirm
			 * whether that is intended.
			 */
			if ((now = time(NULL)) == last_restart) {
				if (++restarts > 2 * ncopies) {
					Dprintf(L_ERROR,
						"Servers restarting too "
						"quickly, backing off.");
					if (backoff < 60 * 60)
						backoff <<= 1;
					sleep(backoff);
				}
			} else {
				last_restart = now;
				restarts = 0;
				backoff = 60;
			}

			/* Locate a free pid slot */
			for (i = 0, child = -1; i < ncopies; i++) {
				if (servers[i] == 0) {
					child = i;
					break;
				}
			}

			if (child < 0) {
				Dprintf(L_FATAL, "failsafe: no pid slot?!");
				/*
				 * The fatal log is presumably not expected to
				 * return; stop here regardless so that
				 * servers[-1] is never written below.
				 */
				exit(1);
			}

			Dprintf(D_GENERAL,
				"starting server thread %d...\n", child + 1);

			pid = fork();
			if (pid < 0) {
				Dprintf(L_FATAL,
					"Unable to fork for failsafe: %s",
					strerror(errno));
				/* Never record a negative pid as a running server */
				exit(1);
			}

			if (pid == 0) {
				/* Child process: continue with execution. */
				return;
			}

			servers[child] = pid;
			running++;
		}

		/* Ignore some signals (re-applied on every pass) */
		ignore_signal(SIGTERM);
		ignore_signal(SIGHUP);
		ignore_signal(SIGINT);
		ignore_signal(SIGCHLD);

		if ((pid = wait(&status)) < 0) {
			if (errno == ECHILD) {
				Dprintf(L_FATAL,
					"failsafe: wait(): %s", strerror(errno));
				/*
				 * No child is left to wait for; retrying
				 * would fail the same way forever.
				 */
				exit(1);
			}
			Dprintf(L_WARNING,
				"failsafe: wait(): %s", strerror(errno));
			continue;
		}

		/* Locate the child */
		for (i = 0, child = -1; i < ncopies; i++) {
			if (servers[i] == pid) {
				child = i;
				break;
			}
		}

		if (child < 0) {
			Dprintf(L_WARNING,
				"failsafe: unknown child (pid %d) terminated",
				pid);
			continue;
		}

		/* Book-keeping */
		servers[child] = 0;
		running--;

		/*
		 * Decide whether to restart. From here on child doubles as
		 * a flag: it is set to -1 when the slot must be refilled and
		 * keeps its index when the server is to be retired.
		 */
		if (WIFSIGNALED(status)) {
			signo = WTERMSIG(status);
			if (signo == SIGTERM) {
				Dprintf(L_NOTICE, "failsafe: "
					"child %d terminated by SIGTERM. %s.",
					pid, running? "Continue" : "Exit");
			} else {
				Dprintf(L_WARNING, "failsafe: "
					"child %d terminated by %s. "
					"Restarting.",
					pid, get_signame(signo));
				child = -1; /* Restart */
			}
		} else if (WIFEXITED(status)) {
			Dprintf(L_NOTICE, "failsafe: "
				"child %d exited, status %d.",
				pid, WEXITSTATUS(status));
		} else {
			Dprintf(L_ERROR, "failsafe: "
				"abnormal child termination, "
				"pid=%d status=%d. Restarting.",
				pid, status);
			child = -1; /* Restart */
		}

		/* If child >= 0, we should not restart */
		if (child >= 0) {
			if (!running) {
				Dprintf(D_GENERAL,
					"No more children, exiting.");
				exit(0);
			}
			/*
			 * Close the gap left by the retired slot so the
			 * first ncopies-1 entries describe the survivors.
			 */
			for (i = child; i < ncopies-1; i++)
				servers[i] = servers[i+1];
			ncopies--; /* Make sure we start no new servers */
		}
	}
}

/*
 * Fail-safe session that is meant to catch a core file.
 *
 * Placeholder only: the function currently does nothing.
 *
 * Planned approach: fork first. nfsd changes uids frequently, and the
 * kernel won't write out a core file after that, whereas the forked
 * process starts out with a clean dumpable flag. After the fork it may
 * be worth making sure we end up in some common directory that the
 * failsafe loop knows about.
 */
void
failsafe_loop(int level, void (*function)(void))
{
	/* Not yet implemented; both arguments are ignored. */
	(void) level;
	(void) function;
}

/*
 * Map a signal number to a printable name.
 *
 * Signals listed in the table below yield their symbolic name as a
 * string literal. Any other number is formatted as "signal #<n>" into a
 * static buffer, so that result is overwritten by the next call that
 * takes the fallback path.
 */
static const char *
get_signame(int signo)
{
	static const struct {
		int		number;
		const char	*label;
	} known[] = {
		{ SIGHUP,	"SIGHUP" },
		{ SIGINT,	"SIGINT" },
		{ SIGQUIT,	"SIGQUIT" },
		{ SIGILL,	"SIGILL" },
		{ SIGTRAP,	"SIGTRAP" },
		{ SIGIOT,	"SIGIOT" },
		{ SIGBUS,	"SIGBUS" },
		{ SIGFPE,	"SIGFPE" },
		{ SIGKILL,	"SIGKILL" },
		{ SIGUSR1,	"SIGUSR1" },
		{ SIGSEGV,	"SIGSEGV" },
		{ SIGUSR2,	"SIGUSR2" },
		{ SIGPIPE,	"SIGPIPE" },
		{ SIGALRM,	"SIGALRM" },
		{ SIGTERM,	"SIGTERM" },
		{ SIGCHLD,	"SIGCHLD" },
		{ SIGCONT,	"SIGCONT" },
		{ SIGSTOP,	"SIGSTOP" },
		{ SIGTSTP,	"SIGTSTP" },
		{ SIGTTIN,	"SIGTTIN" },
		{ SIGTTOU,	"SIGTTOU" },
		{ SIGURG,	"SIGURG" },
		{ SIGXCPU,	"SIGXCPU" },
		{ SIGXFSZ,	"SIGXFSZ" },
		{ SIGVTALRM,	"SIGVTALRM" },
		{ SIGPROF,	"SIGPROF" },
		{ SIGWINCH,	"SIGWINCH" },
		{ SIGIO,	"SIGIO" },
		{ SIGPWR,	"SIGPWR" },
	};
	static char	fallback[30];
	unsigned int	idx;

	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
		if (known[idx].number == signo)
			return known[idx].label;
	}

	/* Not a signal we have a name for: report the raw number */
	sprintf(fallback, "signal #%d", signo);
	return fallback;
}