1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
|
#!@BASH_PATH@
#
# ocf:pacemaker:HealthSMART resource agent
#
# Copyright 2009-2023 the Pacemaker project contributors
#
# The version control history for this file may have further details.
#
# This source code is licensed under the GNU General Public License version 2
# (GPLv2) WITHOUT ANY WARRANTY.
#
#
# Checks the S.M.A.R.T. status of all given drives and writes the #health-smart
# status into the CIB
#
#######################################################################
#######################################################################
# Initialization:
# Locate the OCF shell function library (OCF_FUNCTIONS may be preset by the
# caller; otherwise it is derived from OCF_ROOT) and source it.
: ${OCF_FUNCTIONS:="${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs"}
. "${OCF_FUNCTIONS}"
# The requested action defaults to the first command-line argument.
: ${__OCF_ACTION:="$1"}
# Explicitly list all environment variables used, to make static analysis happy
# (each ":=" below assigns the shown default only when the variable is unset
# or empty).
: ${OCF_RESKEY_CRM_meta_interval:=0}
: ${OCF_RESKEY_CRM_meta_globally_unique:="true"}
# The temperature settings default to empty here; init_smart substitutes the
# numeric defaults when they are empty.
: ${OCF_RESKEY_temp_warning:=""}
: ${OCF_RESKEY_temp_lower_limit:=""}
: ${OCF_RESKEY_temp_upper_limit:=""}
: ${OCF_RESKEY_drives:="/dev/sda"}
: ${OCF_RESKEY_devices:=""}
# An empty state path is replaced by a computed default further down the file.
: ${OCF_RESKEY_state:=""}
: ${OCF_RESKEY_smartctl:="/usr/sbin/smartctl"}
: ${OCF_RESKEY_dampen:="5s"}
# Turn these into arrays so we can iterate them later.
# The expansions are unquoted, so the shell word-splits each list into array
# elements (pathname expansion is applied to the resulting words as well).
DRIVES=(${OCF_RESKEY_drives})
DEVICES=(${OCF_RESKEY_devices})
#######################################################################
# Print the resource agent's OCF metadata (XML) on stdout.
#
# Globals: HA_VARRUN, OCF_RESOURCE_INSTANCE (read; expanded into the documented
#          default of the "state" parameter)
# Note:    the here-document body is emitted verbatim, so it must stay at
#          column 0 and its delimiter must stay unquoted for the expansions.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<resource-agent name="HealthSMART" version="@VERSION@">
<version>1.1</version>
<longdesc lang="en">
System health agent that checks the S.M.A.R.T. status of the given drives and
updates the #health-smart attribute.
</longdesc>
<shortdesc lang="en">SMART health status</shortdesc>
<parameters>
<parameter name="state" unique-group="state">
<longdesc lang="en">
Location to store the resource state in.
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>
<parameter name="drives" reloadable="1">
<longdesc lang="en">
The drive(s) to check as a SPACE separated list. Enter the full path to the device, e.g. "/dev/sda".
</longdesc>
<shortdesc lang="en">Drives to check</shortdesc>
<content type="string" default="/dev/sda" />
</parameter>
<parameter name="devices" reloadable="1">
<longdesc lang="en">
The device type(s) to assume for the drive(s) being tested as a SPACE separated list.
</longdesc>
<shortdesc lang="en">Device types</shortdesc>
<content type="string" />
</parameter>
<parameter name="temp_lower_limit" reloadable="1">
<longdesc lang="en">
Lower limit of the temperature in deg C of the drive(s). Below this limit the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Lower limit for the red smart attribute</shortdesc>
<content type="string" default="0"/>
</parameter>
<parameter name="temp_upper_limit" reloadable="1">
<longdesc lang="en">
Upper limit of the temperature in deg C of the drive(s). If the drive reports
a temperature higher than this value the status of #health-smart will be red.
</longdesc>
<shortdesc lang="en">Upper limit for red smart attribute</shortdesc>
<content type="string" default="60"/>
</parameter>
<parameter name="temp_warning" reloadable="1">
<longdesc lang="en">
Number of deg C below/above the upper/lower temp limits at which point the status of #health-smart will change to yellow.
</longdesc>
<shortdesc lang="en">Deg C below/above the upper limits for yellow smart attribute</shortdesc>
<content type="string" default="5"/>
</parameter>
<parameter name="smartctl" reloadable="1">
<longdesc lang="en">
The path to the smartctl program, used for querying device health.
</longdesc>
<shortdesc lang="en">The path to the smartctl program</shortdesc>
<content type="string" default="/usr/sbin/smartctl"/>
</parameter>
<parameter name="dampen" reloadable="1">
<longdesc lang="en">
The time to wait (dampening) for further changes to occur
</longdesc>
<shortdesc lang="en">Dampening interval</shortdesc>
<content type="string" default="5s"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="10s" />
<action name="stop" timeout="10s" />
<action name="monitor" timeout="10s" interval="10s" start-delay="0s" />
<action name="meta-data" timeout="5s" />
<action name="validate-all" timeout="10s" depth="0" />
<action name="reload-agent" timeout="20s" />
</actions>
</resource-agent>
END
}
#######################################################################
# Compare one drive temperature reading against the configured limits and
# publish a degraded "#health-smart" value when it is out of range.
#
# Arguments: $1 - temperature reading in deg C (may be empty)
# Globals:   lower_red_limit, upper_red_limit, lower_yellow_limit,
#            upper_yellow_limit, OCF_RESKEY_dampen, DRIVE, DEVICE (read)
# Returns:   1 if a red/yellow status was written, 0 otherwise (including
#            when no reading, or no usable integer reading, is available)
check_temperature() {
    local temp=$1

    # No reading available for this drive: nothing to evaluate.
    if [ -z "${temp}" ]; then
        return 0
    fi

    # A non-integer value would make every numeric test below fail with an
    # "integer expression expected" error; report it once instead.
    if ! [[ "${temp}" =~ ^-?[0-9]+$ ]]; then
        ocf_log warn "Drive ${DRIVE} ${DEVICE} reported an unparsable temperature: ${temp}"
        return 0
    fi

    # Red limits are checked before yellow so the more severe state wins.
    if [ "${temp}" -lt "${lower_red_limit}" ]; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} too cold: ${temp} C"
        attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}"
        return 1
    fi

    if [ "${temp}" -gt "${upper_red_limit}" ]; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} too hot: ${temp} C"
        attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}"
        return 1
    fi

    if [ "${temp}" -lt "${lower_yellow_limit}" ]; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} quite cold: ${temp} C"
        attrd_updater -n "#health-smart" -B "yellow" -d "${OCF_RESKEY_dampen}"
        return 1
    fi

    if [ "${temp}" -gt "${upper_yellow_limit}" ]; then
        ocf_log info "Drive ${DRIVE} ${DEVICE} quite hot: ${temp} C"
        attrd_updater -n "#health-smart" -B "yellow" -d "${OCF_RESKEY_dampen}"
        return 1
    fi

    return 0
}
# Sanity-check the drives/devices parameters shared by all actions.
#
# Globals: DRIVES, DEVICES (read), OCF_ERR_ARGS (read)
# Exits:   with $OCF_ERR_ARGS if the two lists disagree in length (unless
#          DEVICES is empty) or if a drive does not start with /dev/
common_checks() {
    local drives_len=${#DRIVES[@]}
    local devices_len=${#DEVICES[@]}
    local d

    # Each item in $OCF_RESKEY_drives must have a corresponding item in
    # $OCF_RESKEY_devices with the device type. Alternately,
    # $OCF_RESKEY_devices can be empty.
    if [ "${drives_len}" -ne "${devices_len}" ] && [ "${devices_len}" -gt 0 ]; then
        ocf_log err "OCF_RESKEY_devices must be empty or the same length as OCF_RESKEY_drives."
        exit "$OCF_ERR_ARGS"
    fi

    # Each item in $OCF_RESKEY_drives must look like a device node.
    for d in "${DRIVES[@]}"; do
        if [[ "$d" != /dev/* ]]; then
            # The list being validated here is the drives list, so name it.
            ocf_log err "Device in OCF_RESKEY_drives does not look like a device node: $d"
            exit "$OCF_ERR_ARGS"
        fi
    done
}
# Derive the temperature thresholds and verify that every configured drive
# reports S.M.A.R.T. support as enabled.
#
# Globals read:    OCF_RESKEY_temp_warning, OCF_RESKEY_temp_lower_limit,
#                  OCF_RESKEY_temp_upper_limit, OCF_RESKEY_devices,
#                  OCF_RESKEY_smartctl, DRIVES, DEVICES
# Globals written: yellow_threshold, lower_red_limit, lower_yellow_limit,
#                  upper_red_limit, upper_yellow_limit, DRIVE, DEVICE
# Exits:           with $OCF_ERR_INSTALLED if smartctl does not report
#                  "SMART support is: Enabled" for a drive
init_smart() {
    local ndx
    local -a type_args

    # Set temperature defaults (used when the parameter is empty)
    yellow_threshold=${OCF_RESKEY_temp_warning:-5}
    lower_red_limit=${OCF_RESKEY_temp_lower_limit:-0}
    upper_red_limit=${OCF_RESKEY_temp_upper_limit:-60}

    # Referencing the variables by name (instead of pasting their text into
    # the expression) keeps negative values from fusing with the operator.
    lower_yellow_limit=$((lower_red_limit + yellow_threshold))
    upper_yellow_limit=$((upper_red_limit - yellow_threshold))

    for ndx in "${!DRIVES[@]}"; do
        DRIVE=${DRIVES[$ndx]}

        # Pass an explicit device type only when a devices list was given.
        type_args=()
        if [ -n "${OCF_RESKEY_devices}" ]; then
            DEVICE=${DEVICES[$ndx]}
            type_args=(-d "${DEVICE}")
        fi

        if ! "${OCF_RESKEY_smartctl}" "${type_args[@]}" -i "${DRIVE}" \
                | grep -q "SMART support is: Enabled"; then
            ocf_log err "S.M.A.R.T. not enabled for drive ${DRIVE}"
            exit "$OCF_ERR_INSTALLED"
        fi
    done
}
# Print a short usage summary (script name plus supported actions) on stdout.
HealthSMART_usage() {
    printf '%s\n' \
        "usage: $0 {start|stop|monitor|validate-all|meta-data|reload-agent}" \
        "Expects to have a fully populated OCF RA-compliant environment set."
}
# Start action: if the monitor already reports success there is nothing to
# do; otherwise create the state file that marks the resource as running.
#
# Globals: OCF_RESKEY_state, OCF_SUCCESS (read)
# Returns: $OCF_SUCCESS when already running, else the exit status of touch
HealthSMART_start() {
    HealthSMART_monitor
    case $? in
        "$OCF_SUCCESS") return $OCF_SUCCESS ;;
    esac

    # Not running yet: mark it as started.
    touch "${OCF_RESKEY_state}"
}
# Stop action: delete the "#health-smart" node attribute and remove the
# state file.
#
# Globals: OCF_RESKEY_state, OCF_SUCCESS, OCF_ERR_GENERIC (read)
# Returns: $OCF_SUCCESS once the state file is gone (including when it never
#          existed), $OCF_ERR_GENERIC if it could not be removed
HealthSMART_stop() {
    attrd_updater -D -n "#health-smart"

    # -f: stopping an already-stopped resource must succeed, so a missing
    # state file is not an error.
    if rm -f -- "${OCF_RESKEY_state}"; then
        return $OCF_SUCCESS
    fi
    return $OCF_ERR_GENERIC
}
# Monitor action: evaluate every configured drive and publish the combined
# result as the "#health-smart" node attribute (red / yellow / green).
#
# Globals read:    DRIVES, DEVICES, OCF_RESKEY_devices, OCF_RESKEY_smartctl,
#                  OCF_RESKEY_state, OCF_RESKEY_dampen
# Globals written: DRIVE, DEVICE (read by check_temperature for logging)
# Returns: $OCF_NOT_RUNNING when the state file does not exist, otherwise
#          $OCF_SUCCESS -- also when a drive is unhealthy, since that is
#          reported through the attribute rather than the return code.
#          common_checks, check_binary and init_smart run first.
HealthSMART_monitor() {
    local ndx
    local -a type_args

    common_checks

    # Test for presence of the smartctl binary that is actually executed
    # below (the configured path), not just any "smartctl" found in PATH.
    check_binary "${OCF_RESKEY_smartctl}"

    init_smart

    # Monitor _MUST!_ differentiate correctly between running
    # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
    # That is THREE states, not just yes/no.
    if [ ! -f "${OCF_RESKEY_state}" ]; then
        return $OCF_NOT_RUNNING
    fi

    for ndx in "${!DRIVES[@]}"; do
        DRIVE=${DRIVES[$ndx]}

        # Pass an explicit device type only when a devices list was given.
        type_args=()
        if [ -n "${OCF_RESKEY_devices}" ]; then
            DEVICE=${DEVICES[$ndx]}
            type_args=(-d "${DEVICE}")
        fi

        # Check overall S.M.A.R.T. status
        if ! "${OCF_RESKEY_smartctl}" "${type_args[@]}" -H "${DRIVE}" \
                | grep -q "SMART overall-health self-assessment test result: PASSED"; then
            attrd_updater -n "#health-smart" -B "red" -d "${OCF_RESKEY_dampen}"
            return $OCF_SUCCESS
        fi

        # Check drive temperature(s): attribute 194, tenth column of the
        # "smartctl -A" table. check_temperature writes the attribute itself
        # when the value is out of range, so just stop at the first hit.
        if ! check_temperature "$("${OCF_RESKEY_smartctl}" "${type_args[@]}" -A "${DRIVE}" \
                | awk '/^194/ { print $10 }')"; then
            return $OCF_SUCCESS
        fi
    done

    # Every drive passed both checks.
    attrd_updater -n "#health-smart" -B "green" -d "${OCF_RESKEY_dampen}"
    return $OCF_SUCCESS
}
# validate-all action: check the configuration and, when OCF_CHECK_LEVEL is
# "10", the local host as well.
#
# Globals: OCF_CHECK_LEVEL, OCF_RESKEY_state, OCF_RESKEY_smartctl (read)
# Returns: $OCF_SUCCESS on success, $OCF_ERR_ARGS when the directory of the
#          state file is not writable. common_checks, check_binary and
#          init_smart run first.
HealthSMART_validate() {
    local state_dir

    common_checks

    # Host-specific checks
    if [ "$OCF_CHECK_LEVEL" = "10" ]; then
        # Test for presence of the smartctl binary that the agent actually
        # runs (the configured path), not just any "smartctl" found in PATH.
        check_binary "${OCF_RESKEY_smartctl}"

        init_smart

        # Is the state directory writable? Probe by creating and removing
        # a scratch file named after this shell's PID.
        state_dir=$(dirname "$OCF_RESKEY_state")
        if ! touch "$state_dir/$$"; then
            return $OCF_ERR_ARGS
        fi
        rm -f -- "$state_dir/$$"
    fi

    return $OCF_SUCCESS
}
# reload-agent action: performs no work here and simply reports
# $OCF_SUCCESS to the caller.
HealthSMART_reload_agent() {
    return ${OCF_SUCCESS}
}
# Compute the default state file location when none was configured.
if [ -z "$OCF_RESKEY_state" ]; then
    if [ "${OCF_RESKEY_CRM_meta_globally_unique}" = "false" ]; then
        state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
        # Strip off the trailing clone marker.
        # Both the path and the sed script are quoted: the path must not be
        # word-split, and the script must reach sed unmodified so that "\."
        # matches a literal dot and the brackets are never globbed.
        OCF_RESKEY_state=$(printf '%s\n' "$state" | sed 's/:[0-9][0-9]*\.state/.state/')
    else
        OCF_RESKEY_state="${HA_VARRUN%%/}/HealthSMART-${OCF_RESOURCE_INSTANCE}.state"
    fi
fi

# Dispatch the requested action. meta-data, usage/help and unknown actions
# exit directly; every other action falls through so its status is logged.
case "$__OCF_ACTION" in
    start)        HealthSMART_start;;
    stop)         HealthSMART_stop;;
    monitor)      HealthSMART_validate && HealthSMART_monitor;;
    validate-all) HealthSMART_validate;;
    reload-agent) HealthSMART_reload_agent;;
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        HealthSMART_usage
        exit $OCF_SUCCESS
        ;;
    *)
        HealthSMART_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac

# Report and propagate the status of the action that just ran.
rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc"
exit $rc
# vim: set filetype=sh expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80:
|