File: drbd-attr

package info (click to toggle)
drbd-utils 9.15.0-1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 6,464 kB
  • sloc: ansic: 47,782; xml: 11,374; cpp: 9,765; sh: 4,398; makefile: 1,020; perl: 353
file content (332 lines) | stat: -rwxr-xr-x 10,166 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
#!/bin/bash

# Locate and source the OCF shell function library.
# OCF_FUNCTIONS is only assigned here if it is not already set in the
# environment; the fallback path is derived from OCF_ROOT.
: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}
. ${OCF_FUNCTIONS}
# The requested action is the first command line argument,
# unless __OCF_ACTION is already set.
: ${__OCF_ACTION=$1}

# Default values of the resource parameters.  They are advertised by the
# meta-data action and used at the bottom of this script whenever the
# corresponding OCF_RESKEY_* variable is unset or empty.
OCF_RESKEY_dampening_delay_default=5
OCF_RESKEY_attr_name_prefix_default=drbd-promotion-score
OCF_RESKEY_record_event_details_default=false

# Attribute value written by update_name_to_minus_inf.
# see pacemaker:include/crm/crm.h CRM_SCORE_INFINITY
MINUS_INFINITY=-1000000

# Print the resource agent meta data (XML) on stdout.
# The heredoc is unquoted, so the $OCF_RESKEY_*_default variables are
# expanded into the default="..." attributes of the parameters.
meta-data() {
    cat <<___
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="drbd-attr" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This listens for DRBD state change events, and sets or deletes transient node
attributes based on the "promotion_score" and "may_promote" values as presented
by the DRBD events2 interface.

Optionally using a dampening delay, see attrd_updater for details.

To be used as a clone on all DRBD nodes.  The idea is to start DRBD outside of
pacemaker, use DRBD auto-promote, and add location constraints for the
Filesystem or other resource agents which are using DRBD.
</longdesc>
<shortdesc lang="en">import DRBD state change events as transient node attributes</shortdesc>

<parameters>
<parameter name="dampening_delay">
<longdesc lang="en">
To be used as dampening delay in attrd_updater.
</longdesc>
<shortdesc lang="en">attrd_updater --delay</shortdesc>
<content type="integer" default="$OCF_RESKEY_dampening_delay_default" />
</parameter>

<parameter name="attr_name_prefix">
<longdesc lang="en">
The attributes will be named "*prefix*-drbd_resource_name".
You can chose that prefix here.
</longdesc>
<shortdesc lang="en">attrd_updater --name *prefix*-drbd_resource_name</shortdesc>
<content type="string" default="$OCF_RESKEY_attr_name_prefix_default" />
</parameter>

<parameter name="record_event_details">
<longdesc lang="en">
It may be convenient to know which event lead to the current score.
This setting toggles the recording of the event.
The attributes will be named "*prefix*:event-details-drbd_resource_name".
</longdesc>
<shortdesc lang="en"></shortdesc>
<content type="boolean" default="$OCF_RESKEY_record_event_details_default" />
</parameter>
</parameters>

<actions>
<action name="start"        timeout="20s" />
<action name="stop"         timeout="20s" />
<action name="monitor"      timeout="20s" interval="60s" depth="0"/>
<action name="validate-all" timeout="20s" />
<action name="meta-data"    timeout="5s" />
</actions>
</resource-agent>
___
}

# Check that the required tools and configuration are present.
#
# Globals:  attr_name_prefix (read)
# Returns:  0 if everything looks usable,
#           $OCF_ERR_INSTALLED  if the drbd-utils version is too old or
#                               cannot be determined from "drbdadm -V",
#           $OCF_ERR_CONFIGURED if attr_name_prefix is empty.
validate-all()
{
	local version_code

	ocf_check_binary drbdadm
	ocf_check_binary drbdsetup

	# I think we can expect "coreutils"...
	# ocf_check_binary stat
	# ocf_check_binary sort
	# ocf_check_binary join

	# we need at least drbd utils 9.14.0 (DRBDADM_VERSION_CODE 0x090e00)
	version_code=$(drbdadm -V | sed -ne 's/^DRBDADM_VERSION_CODE=\(0x[0-9a-fA-F][0-9a-fA-F]*\)$/\1/p')
	# An empty value would turn the comparison below into an arithmetic
	# syntax error, which would make the version check pass silently.
	if [[ -z $version_code ]] ; then
		ocf_exit_reason "could not determine the drbd-utils version from 'drbdadm -V'"
		return $OCF_ERR_INSTALLED
	fi
	if (( version_code < 0x090e00 )) ; then
		ocf_exit_reason "need at least drbd-utils 9.14.0"
		return $OCF_ERR_INSTALLED
	fi

	# Or do we allow empty prefix? Resulting attribute names would be "-$DRBD_RESOURCE_NAME" ...
	[[ -n $attr_name_prefix ]] || { ocf_exit_reason "attr_name_prefix must not be empty"; return $OCF_ERR_CONFIGURED; }
}

monitor()
{
	test -e $PIDFILE || return $OCF_NOT_RUNNING
	test -s $PIDFILE || return $OCF_ERR_GENERIC
	# could even add a: test $PIDFILE -ef "/proc/$pid/fd/9"
	read pid < $PIDFILE &> /dev/null && kill -0 "$pid" && return $OCF_SUCCESS
	if [[ $__OCF_ACTION == monitor ]]; then
		# try to report the "last words of the previous instance" as exit reason
		tmp=$(crm_attribute --lifetime reboot --query --name $attr_name_prefix --quiet)
		[[ $tmp ]] && ocf_exit_reason ":: $tmp"
	fi
	return $OCF_ERR_GENERIC
}

# Print, one per line, the DRBD resource names for which this node currently
# has a transient "<attr_name_prefix>-<resource>" attribute in the CIB status.
# Globals:  node_id, attr_name_prefix (read)
list_existing_attributes()
{
	local nvpair_xpath extract_resource_name

	nvpair_xpath="/cib/status/node_state[@id='$node_id']/transient_attributes/instance_attributes/nvpair[starts-with(@name,'${attr_name_prefix}-')]"
	# keep only what follows "<prefix>-" inside the name="..." attribute
	extract_resource_name='s,^.* name="'"${attr_name_prefix}"'-\([^"]*\)".*,\1,p'

	cibadmin -Q --xpath "$nvpair_xpath" | sed -ne "$extract_resource_name"
}

# (Re)initialize the listener state and (re)connect stdin to a fresh
# "drbdsetup events2" stream.  Used at daemon start and as the HUP handler.
#
# Globals written:
#   resources_with_existing_attributes  names that already have an attribute
#   resources_known_to_drbdadm          names reported by "drbdadm sh-resources"
#   resources_seen_in_initial_dump      reset; filled while reading the dump
#   status_delay                        reset; empty means "still in initial dump"
re_init_daemon()
{
	resources_with_existing_attributes=$( list_existing_attributes )
	# was misspelled "drbadm", which left this list always empty
	resources_known_to_drbdadm=$( drbdadm sh-resources )
	resources_seen_in_initial_dump=''
	status_delay=''
	# replace stdin of this shell with the output of a new events2 process
	exec 0< <(exec drbdsetup events2 --timestamps)
}

# Delete the "<prefix>:event-details-<resource>" attribute.
# Arguments: $1 - DRBD resource name
# Globals:   attr_name_prefix (read)
delete_event_details()
{
	local name=$1
	# quoted, so that the configurable prefix always stays one argument
	attrd_updater -n "${attr_name_prefix}:event-details-${name}" --delete
}

# Store the event that led to the current score in the
# "<prefix>:event-details-<resource>" attribute, if recording is enabled.
# Arguments: $1 - DRBD resource name; remaining arguments form the message
# Globals:   record_event_details, attr_name_prefix, dampening_delay (read)
# Returns:   0 if recording is disabled, else the attrd_updater status
record_event_details()
{
	# the variable holds the word "true" or "false" and is run as a command
	$record_event_details || return 0
	local name=$1 ; shift # rest is "message"
	attrd_updater -n "${attr_name_prefix}:event-details-${name}" --delay "$dampening_delay" --update-both "$*"
}

# Remove the "<prefix>-<resource>" attribute, reset its dampening delay to 0,
# and remove the matching event-details attribute.
# Arguments: $1 - DRBD resource name
# Globals:   attr_name_prefix (read)
# Returns:   the status of delete_event_details
delete_name()
{
	local name=$1
	local attr="${attr_name_prefix}-${name}"
	# at the time of writing, there is only --update-both (delay and value),
	# but no --update-delay-and-delete...
	attrd_updater -n "$attr" --delete
	attrd_updater -n "$attr" --update-delay --delay 0
	delete_event_details "$name"
}

# Set the "<prefix>-<resource>" attribute to $MINUS_INFINITY, without dampening.
# Arguments: $1 - DRBD resource name
# Globals:   attr_name_prefix, MINUS_INFINITY (read)
update_name_to_minus_inf()
{
	local name=$1
	attrd_updater -n "${attr_name_prefix}-${name}" --update-both "$MINUS_INFINITY" --delay 0
}

# Translate a DRBD promotion_score into the "<prefix>-<resource>" attribute.
# Arguments: $1 - DRBD resource name
#            $2 - promotion score; may be empty or missing
# Globals:   attr_name_prefix, dampening_delay (read)
# Returns:   the status of the helper or attrd_updater call that was made
update_name_to_score()
{
	local name=$1 score=$2
	case $score in
	""|*[!0-9]*)
		# no score, or not a plain non-negative integer: drop the attribute
		delete_name "$name" ;;
	0)
		# a score of 0 is stored as -INFINITY
		update_name_to_minus_inf "$name" ;;
	*)
		attrd_updater -n "${attr_name_prefix}-${name}" --update-both "$score" --delay "$dampening_delay" ;;
	esac
}

# Clean up attributes left over from earlier runs, once the initial events2
# state dump is complete:
#  - resources known to drbdadm but not seen in the dump get -INFINITY,
#  - attributes of resources neither seen nor known are deleted.
# Globals:  resources_seen_in_initial_dump, resources_known_to_drbdadm,
#           resources_with_existing_attributes (read, then unset)
handle_stale_attributes()
{
	# r is local so the loops below do not clobber a global of the same name
	local stale_and_unknown stale_but_known r
	# the lists are expanded unquoted on purpose: one word per resource name
	stale_but_known=$(
		join -v 2 <(printf "%s\n" $resources_seen_in_initial_dump | sort -u ) \
			<(printf "%s\n" $resources_known_to_drbdadm | sort -u )
		)
	stale_and_unknown=$(
		join -v 2 <(printf "%s\n" $resources_seen_in_initial_dump $resources_known_to_drbdadm | sort -u ) \
			<(printf "%s\n" $resources_with_existing_attributes | sort )
		)
	for r in $stale_and_unknown ; do delete_name "$r";  done
	for r in $stale_but_known ; do update_name_to_minus_inf "$r" ; done
	unset resources_with_existing_attributes
	unset resources_seen_in_initial_dump
	unset resources_known_to_drbdadm
}

# Delete every "<prefix>:event-details-*" attribute this node has in the CIB.
# Globals:  node_id, attr_name_prefix (read)
remove_all_event_details()
{
	# !! NOT --delete --force --xpath,
	# that would only manipulate the CIB,
	# but the ATTRD would still remember.
	local attr
	local -a attrs=()
	# one attribute name per line; read them without word splitting or
	# globbing, so a name containing blanks stays a single argument
	mapfile -t attrs < <(
		cibadmin -Q --xpath "/cib/status/node_state[@id='$node_id']/transient_attributes/instance_attributes/nvpair[starts-with(@name,'${attr_name_prefix}:event-details-')]" |
			sed -ne 's,^.* name="\([^"]*\)".*,\1,p'
		)
	for attr in "${attrs[@]}"
	do
		attrd_updater -n "$attr" --delete
	done
}

# Flip the recording of event details (installed as the USR1 handler).
# Switching it off also removes the event-details attributes recorded so far.
toggle_event_details()
{
	# currently off: just switch it on
	if ! $record_event_details ; then
		record_event_details=true
		return 0
	fi

	# currently on: switch it off and clean up
	record_event_details=false
	remove_all_event_details
}

# Main loop of the background listener.  Reads "drbdsetup events2" lines from
# stdin (connected by re_init_daemon) and mirrors the promotion_score of each
# DRBD resource into a transient node attribute.
#
# Signals:  USR1 toggles the recording of event details,
#           HUP  re-initializes via re_init_daemon.
# The loop ends when reading stdin fails, when the pidfile is no longer the
# file open on fd 9, when a resource event carries no name, or when
# update_name_to_score fails.  A "FAILED ..." text is then stored in the
# $attr_name_prefix attribute.
the-daemon()
{
	node_id=$(crm_node -i)
	event=()

	trap toggle_event_details USR1

	re_init_daemon
	trap "re_init_daemon" HUP
	# prev[] keeps the previously read line for the failure report below;
	# event[] holds the current line, split into words
	while prev=( "${event[@]}" ); read -a event; do
		# Is this still *our* pidfile?
		# (start opens the pidfile on fd 9 before forking this subshell)
		test $PIDFILE -ef /proc/self/fd/9 || { event="lost PIDFILE"; break; }

		# handle end of initial state dump
		if [[ "${event[1]} ${event[2]}" == "exists -" ]]; then
			handle_stale_attributes
			attrd_updater -n $attr_name_prefix --update-both "(re)init completed at $(date "+%F %T") by $OCF_RESOURCE_INSTANCE [$BASHPID]" --delay 0
			# from now on status_delay is non-empty: the configured delay,
			# or the default delay if the configured one is 0
			[[ $dampening_delay != 0 ]] && status_delay=$dampening_delay || status_delay=$OCF_RESKEY_dampening_delay_default
			continue
		fi

		# ignore non-resource events for now
		[[ ${event[2]} == resource ]] || continue

		# parse the events line
		# (the words from index 3 on are split as key:value at the first colon)
		name=''
		promotion_score=''
		for f in ${event[@]:3}; do
			k=${f%%:*}; v=${f#*:}
			[[ $k == name ]]		&& name=$v
			[[ $k == promotion_score ]]	&& promotion_score=$v
		done

		# events lines must always contain the resource name
		[[ -n $name ]] || break

		# record name, if still in initial state dump
		[[ ${event[1]} == exists ]] && resources_seen_in_initial_dump+=" $name"

		# either this was a "destroy" event (and we remove the attribute),
		# or the event contained a promotion_score,
		# or we ignore this line.
		[[ ${event[1]} == destroy ]] || [[ $promotion_score ]] || continue

		# an empty promotion_score makes update_name_to_score delete the attribute
		update_name_to_score $name $promotion_score || {
			event+=( "// attrd_updater failed with exit code $?" )
			break
		}
		record_event_details $name "${event[*]}"

		# let them know when we saw the last event, unless still within the initial state dump
		[[ $status_delay ]] &&
		attrd_updater -n $attr_name_prefix --update-both "Last event at $(date "+%F %T") by $OCF_RESOURCE_INSTANCE [$BASHPID]" --delay $status_delay
	done

	# some means of "error reporting" ...
	attrd_updater -n $attr_name_prefix --update-both "FAILED at $(date "+%F %T") as $OCF_RESOURCE_INSTANCE, inactive; last input: '${prev[*]}' // '${event[*]}'" --delay 0

	# Let them know we failed, but only do this if this script survived for
	# a minute already, using the bash magic SECONDS variable.
	# If we did not make it that long, don't immediately report that failure,
	# I want to avoid a potentially "tight" recovery loop.
	# Pacemaker will notice on the next monitoring interval,
	(( $SECONDS >= 60 )) && crm_resource --fail --resource $OCF_RESOURCE_INSTANCE
}

# Start the background event listener, unless it is already running.
#
# Globals:  PIDFILE, attr_name_prefix, OCF_RESOURCE_INSTANCE (read)
# Returns:  the validate-all status if validation fails,
#           $OCF_SUCCESS     if the listener is (now) running,
#           $OCF_ERR_GENERIC if a stale pidfile exists or the pidfile
#                            cannot be created.
start()
{
	validate-all || return
	monitor && return $OCF_SUCCESS
	# A pidfile without a live process: refuse to start.
	# (This used the undefined $OCF_GENERIC, which made a bare "return"
	# report success although nothing had been started.)
	test -e "$PIDFILE" && return $OCF_ERR_GENERIC

	# fd 9 stays open in the listener; the-daemon compares it with the pidfile
	exec 9> "$PIDFILE" || return $OCF_ERR_GENERIC
	(
		echo $BASHPID >&9
		handle_SIGTERM() {
			attrd_updater -n "$attr_name_prefix" --update-both "STOPPED at $(date "+%F %T") as $OCF_RESOURCE_INSTANCE, inactive" --delay 0
			rm -f "$PIDFILE"
			exit
		}
		trap "handle_SIGTERM" TERM

		attrd_updater -n "$attr_name_prefix" --update-both "STARTED at $(date "+%F %T") as $OCF_RESOURCE_INSTANCE, initializing" --delay 0

		the-daemon
	) </dev/null >/dev/null 2>&1 &

	test -e "$PIDFILE" || return $OCF_ERR_GENERIC
	# wait until the listener has written its pid and can be signalled
	while ! monitor ; do sleep 1; done
	return $OCF_SUCCESS
}

# Stop the background event listener and make sure the pidfile is gone.
#
# Globals:  PIDFILE (read)
# Returns:  $OCF_SUCCESS
stop()
{
	local pid=''
	if monitor; then
		read -r pid < "$PIDFILE" && kill -TERM "$pid"
		# The listener removes the pidfile in its TERM handler.  Also stop
		# waiting once the process is gone, otherwise a listener that went
		# away without cleaning up would keep us here forever.
		while test -e "$PIDFILE" && kill -0 "$pid" 2>/dev/null ; do sleep 1; done
	fi
	rm -f "$PIDFILE"
	return $OCF_SUCCESS
}

# Effective parameter values: the OCF_RESKEY_* variable from the environment,
# or (":=") the default from the top of this script if that is unset or empty.
dampening_delay=${OCF_RESKEY_dampening_delay:=$OCF_RESKEY_dampening_delay_default}
attr_name_prefix=${OCF_RESKEY_attr_name_prefix:=$OCF_RESKEY_attr_name_prefix_default}
# Normalize the boolean parameter to the literal words "true" / "false";
# the functions above run $record_event_details as a command.
if ocf_is_true ${OCF_RESKEY_record_event_details:=$OCF_RESKEY_record_event_details_default} ; then
	record_event_details=true
else
	record_event_details=false
	# NOTE(review): this runs for every action, and node_id is only assigned
	# inside the-daemon, so at this point the xpath is built with an empty
	# node id -- confirm that this is intended.
	remove_all_event_details
fi

# ${HA_VARRUN%%/} drops one trailing slash, if present
PIDFILE=${HA_VARRUN%%/}/drbd-attr-${OCF_RESOURCE_INSTANCE}.pid


# Dispatch: every supported action is implemented by a function of the same
# name; anything else is rejected with $OCF_ERR_UNIMPLEMENTED.
case $__OCF_ACTION in
	validate-all|meta-data|start|stop|monitor)
		$__OCF_ACTION
		;;
	*)
		ocf_exit_reason "'$__OCF_ACTION' not implemented"
		exit $OCF_ERR_UNIMPLEMENTED
esac

# the exit status of the script is the return status of the action function
exit $? # that would happen implicitly anyways