1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
|
# Terminal setup for the in-place progress display: hide the cursor while the
# script runs (ct_on_exit and ct_notify switch it back on).
setterm -cursor off
# Do not echo control characters, so Ctrl-C does not print "^C" over the output.
stty -echoctl # hide ^C
#######################################
# Exit handler: restores the terminal, cleans up and terminates the script.
# Installed with `trap` for EXIT/HUP/INT/TERM and also called directly as
# `ct_on_exit <status>`.
# Globals:   _CT_AVG_FILE (read) - sample file, removed when set
# Arguments: $1 - optional exit status. When omitted (trap invocation) the
#                 status that was in effect when the handler was entered is
#                 used.
# Outputs:   whatever ct_notify prints, only for an explicit non-zero status
# Returns:   never returns; exits with the selected status
#######################################
ct_on_exit() {
  # Must stay the first statement: $? still holds the status of the command
  # that ran right before the handler was entered.
  local rc=$?
  local explicit=0

  if [ $# -gt 0 ]; then
    explicit=1
    rc=$1
  fi
  # `exit` rejects non-numeric arguments; report such a call as a failure.
  case $rc in
    '' | *[!0-9]*) rc=1 ;;
  esac

  # Calling `exit` from a signal trap would fire the EXIT trap and run this
  # handler a second time; disarm the traps first.
  trap - EXIT HUP INT TERM

  setterm -cursor on
  if [ -n "${_CT_AVG_FILE:-}" ]; then
    rm -f -- "$_CT_AVG_FILE"
  fi

  # Only an explicitly requested failure is reported, never a trap invocation.
  if [ "$explicit" -eq 1 ] && [ "$rc" -gt 0 ]; then
    ct_notify "$(basename "$0") Finish with errors"
  fi
  exit "$rc"
}
# Run ct_on_exit when the script ends normally or receives HUP/INT/TERM.
trap ct_on_exit EXIT HUP INT TERM

# Baseline for _ct_test_osd_down: numerically sorted first column of every
# "osd." line reported by `ceph osd tree down` at start-up.
ceph osd tree down \
  | awk '/osd\./ {print $1}' \
  | sort -n -o /tmp/OSDs_Down

# Defaults; ct_help overrides them from the command line.
CT_SCRIPT_NAME="$0"
CT_MAX_REMAPPED=64 # remapped-PG threshold used by _ct_test_remapped
CT_OSD_DOWN=0      # 1 enables the check in _ct_test_osd_down
CT_MAX_TEMP=70     # pause threshold used by _ct_test_temp
# Option reference, example invocation and one-line synopsis for the options
# parsed by ct_help. Presumably printed by help_msg, which is defined outside
# this file section - TODO confirm.
# Kept in sync with ct_help: -w/--wait_for_nobackfill, -d and -k are plain
# flags; -M, -t and -S take a value.
CT_HELP_MSG="
-h, --help show this help message and exit
-M MAX_REMAPPED, --max_remapped MAX_REMAPPED
Increasing PG_NUM waits until there are fewer than 'MAX_REMAPPED' remapped PGs in progress to resume,
default: $CT_MAX_REMAPPED
-d, --osd_down Suspend operation when an OSD is down
-k, --exit_on_critical Exit script if critical event
-w, --wait_for_nobackfill
Wait to start until the cluster has no more backfill
-t MAX_TEMP, --max_temp MAX_TEMP
Suspend operation if server Temp >= MAX_TEMP
-S SCHEDULE, --schedule SCHEDULE
Schedule, eg : 21-07 for 21h to 07h or 08-16 for 8h to 16h
"
CT_HELP_EXAMPLE="-M 100 -d -k -w -t 45 -S 21-07"
CT_HELP_DESCR="[-M|--max_remapped MAX_REMAPPED] [-d|--osd_down] [-k|--exit_on_critical] [-w|--wait_for_nobackfill] [-t|--max_temp MAX_TEMP] [-S|--schedule SCHEDULE]"
#######################################
# Show the help text and exit with status 1 when too few arguments were given.
# Arguments: $1 - number of arguments received (missing/empty counts as 0)
#            $2 - minimum number of arguments required (missing/empty = 0)
# Returns:   0 when enough arguments were given; otherwise does not return
#            (ct_on_exit 1 terminates the script)
#######################################
ct_help_min () {
  # Quoted, defaulted operands: unquoted empty values turned the test into
  # `[ -lt ]` (always true) or into a syntax error.
  if [ "${1:-0}" -lt "${2:-0}" ]; then
    help_msg
    ct_on_exit 1
  fi
}
#######################################
# Parse the leading common options of a command line.
# Parsing stops at the first argument that is not one of the options below.
# Globals:   CT_MAX_REMAPPED, CT_MAX_TEMP, CT_SCHEDULE (set from option values)
#            CT_OSD_DOWN, CT_EXIT_ON_ERROR, CT_WAIT (set to 1 by their flags)
#            _shift (written) - number of arguments consumed
# Arguments: the command-line arguments to parse
# Outputs:   help text via help_msg for -h/--help or a missing option value
# Returns:   the number of arguments consumed, so the caller can `shift $?`.
#            -h/--help ends the script through ct_on_exit 0; a value option
#            without its value ends it through ct_on_exit 1.
#######################################
ct_help () {
  local key
  _shift=0
  while [ $# -ge 1 ]; do
    key=$1
    case "$key" in
      -h|--help)
        help_msg
        ct_on_exit 0
        shift 1
        ;;
      -M|--max_remapped|-t|--max_temp|-S|--schedule)
        # These options need a value; without this check an option given
        # last stored an empty value and over-counted the consumed arguments.
        if [ $# -lt 2 ]; then
          printf '%s: option %s requires a value\n' "${0##*/}" "$key" >&2
          help_msg
          ct_on_exit 1
          return "$_shift"
        fi
        case "$key" in
          -M|--max_remapped) CT_MAX_REMAPPED=$2 ;;
          -t|--max_temp) CT_MAX_TEMP=$2 ;;
          -S|--schedule) CT_SCHEDULE=$2 ;;
        esac
        _shift=$((_shift + 2))
        shift 2
        ;;
      -d|--osd_down)
        CT_OSD_DOWN=1
        _shift=$((_shift + 1))
        shift 1
        ;;
      -k|--exit_on_critical)
        CT_EXIT_ON_ERROR=1
        _shift=$((_shift + 1))
        shift 1
        ;;
      -w|--wait_for_nobackfill)
        CT_WAIT=1
        _shift=$((_shift + 1))
        shift 1
        ;;
      *)
        # First argument that is not a common option: leave it to the caller.
        break
        ;;
    esac
  done
  return "$_shift"
}
# Scratch file for remapped-PG samples: ct_get_current_remapped appends one
# line per measurement and _check_variation_remapped reads each line back as
# "<epoch-seconds> <remapped-count>" and prunes entries older than 600 seconds.
_CT_AVG_FILE=$(mktemp -t ct_avg_file.XXXXX)
#######################################
# Detect a stalled recovery: report whether the remapped-PG count has stayed
# the same over the samples of the last 10 minutes.
# Samples older than 600 seconds are removed from the sample file first.
# Globals:   _CT_AVG_FILE (read, pruned in place) - lines of
#            "<epoch-seconds> <remapped-count>", oldest first
# Arguments: none
# Returns:   0 when at least 30 recent samples exist and every one carries
#              the same count (no variation: stalled)
#            1 otherwise (too few samples, no usable count, or the count
#              changed)
#######################################
_check_variation_remapped() {
  local cutoff stale=0 stamp count first_count

  cutoff=$(( $(date "+%s") - 600 ))

  # The file is in chronological order: count the leading expired samples.
  while read -r stamp count; do
    if [ "${stamp}" -lt "${cutoff}" ]; then
      stale=$((stale + 1))
    else
      break
    fi
  done < "$_CT_AVG_FILE"

  # Only prune when something expired: the range "1,0d" would still hit
  # line 1 and drop a valid sample.
  if [ "$stale" -gt 0 ]; then
    sed -i "1,${stale}d" "$_CT_AVG_FILE"
  fi

  # Fewer than 30 samples is not enough history to call it a stall.
  if [ "$(wc -l < "$_CT_AVG_FILE")" -lt 30 ]; then
    return 1
  fi

  # Reference value: the count (second column) of the oldest kept sample.
  first_count=$(awk 'NR == 1 {print $2; exit}' "$_CT_AVG_FILE")
  if [ -z "$first_count" ]; then
    return 1
  fi

  while read -r stamp count; do
    if [ "$count" != "$first_count" ]; then
      return 1
    fi
  done < "$_CT_AVG_FILE"
  return 0
}
#######################################
# Query the cluster for the current number of remapped PGs, record the
# measurement in the sample file and print it.
# Globals:   _CT_AVG_FILE (appended) - "<epoch-seconds> <remapped-count>"
# Arguments: none
# Outputs:   the value extracted by jq on stdout (empty or "null" when the
#            status could not be read)
#######################################
ct_get_current_remapped() {
  local now remapped

  now=$(date "+%s")
  # The original path is .osdmap.osdmap.num_remapped_pgs; fall back to
  # .osdmap.num_remapped_pgs for status output where the counters sit
  # directly under "osdmap".
  remapped=$(ceph -s -f json | jq '(.osdmap.osdmap // .osdmap).num_remapped_pgs')

  # Record numeric readings only, so a failed query cannot pollute the
  # history used for stall detection.
  case $remapped in
    '' | *[!0-9]*) ;;
    *) printf '%s %s\n' "$now" "$remapped" >> "$_CT_AVG_FILE" ;;
  esac

  printf '%s\n' "$remapped"
}
# Write the sorted ids of the OSDs currently reported down to the file "$1".
_ct_snapshot_osds_down() {
  ceph osd tree down | grep "osd\." | awk '{print $1}' | sort -n -o "$1"
}

#######################################
# Pause while the set of down OSDs differs from the start-up baseline.
# Does nothing unless CT_OSD_DOWN is 1.
# Globals:   CT_OSD_DOWN (read)
# Files:     /tmp/OSDs_Down (baseline, read), /tmp/OSDs_Down_now (rewritten)
# Outputs:   resume instructions on stdout, plus ct_notify / ct_logger
#            messages, when a difference is detected
# Returns:   0 when nothing had to be waited for, otherwise the status of the
#            final ct_logger call
#######################################
_ct_test_osd_down() {
  local baseline=/tmp/OSDs_Down
  local current=/tmp/OSDs_Down_now

  # Check disabled (or CT_OSD_DOWN not a number): nothing to do.
  [ $CT_OSD_DOWN -eq 1 ] || return 0

  _ct_snapshot_osds_down "$current"
  if diff -q "$baseline" "$current" > /dev/null; then
    return 0
  fi

  echo "OSDs down, operation paused"
  echo "For resume,"
  echo "You can reinit OSDs Down list by execute"
  echo "'ceph osd tree down |grep \"osd\\.\"| awk '{print \$1}'|sort -n -o /tmp/OSDs_Down'"
  ct_notify "Detected OSD Down"

  # Re-sample once a minute until the list matches the baseline again
  # (the operator may also refresh the baseline as suggested above).
  until diff -q "$baseline" "$current" > /dev/null; do
    sleep 60
    _ct_snapshot_osds_down "$current"
  done
  ct_logger "OSDs OK: Resume"
}
# Print the reading of the hwmon input file "$1" divided by 1000 (the kernel
# hwmon sysfs interface reports millidegrees Celsius). Fails when the file
# cannot be read or does not hold an integer.
_ct_read_temp() {
  local raw
  raw=$(cat -- "$1" 2>/dev/null) || return 1
  case $raw in
    '' | - | *[!0-9-]* | ?*-*) return 1 ;;
  esac
  printf '%s\n' $((raw / 1000))
}

#######################################
# Pause while a temperature sensor reads CT_MAX_TEMP or more.
# Checks temp1_input of every hwmon device; for the first sensor found at or
# above the limit it re-reads that sensor once a minute until it drops below
# the limit, then returns without looking at the remaining sensors.
# Globals:   CT_MAX_TEMP (read)
# Arguments: $1 - optional hwmon directory, default /sys/class/hwmon
# Outputs:   pause / resume messages through ct_logger
#######################################
_ct_test_temp () {
  local hwmon_root=${1:-/sys/class/hwmon}
  local sensor temp

  # Glob instead of parsing `ls`; the -f test covers the no-match case.
  for sensor in "$hwmon_root"/*/temp1_input; do
    [ -f "$sensor" ] || continue
    # Skip sensors that cannot be read instead of failing in arithmetic.
    temp=$(_ct_read_temp "$sensor") || continue

    if [[ $temp -ge $CT_MAX_TEMP ]]; then
      ct_logger "High temperature (${temp} >= ${CT_MAX_TEMP}): Pause"
      while [[ $temp -ge $CT_MAX_TEMP ]]; do
        sleep 60
        if ! temp=$(_ct_read_temp "$sensor"); then
          # The sensor vanished while waiting: do not block forever.
          ct_logger "Temperature sensor ${sensor} unreadable: Resume"
          return
        fi
      done
      ct_logger "Temperature OK: Resume"
      return
    fi
  done
}
#######################################
# Pause while the current hour lies outside the CT_SCHEDULE window.
# CT_SCHEDULE has the form "<start>-<end>" in hours; the start hour is
# inside the window, the end hour is not. A window whose end is smaller than
# its start wraps around midnight (e.g. 21-07).
# Globals:   CT_SCHEDULE (read); _CT_T, _CT_D, _CT_F (written: current,
#            start and end hour)
# Outputs:   pause / resume messages through ct_logger
# Returns:   0 when no schedule is set or no wait was needed, otherwise the
#            status of the final ct_logger call
#######################################
_ct_test_schedule () {
  # No schedule configured: never pause.
  if [ -z "$CT_SCHEDULE" ]; then
    return 0
  fi

  # True when hour _CT_T is inside [_CT_D, _CT_F).
  _ct_schedule_open() {
    if [ $_CT_F -ge $_CT_D ]; then
      # Same-day window: both bounds must hold.
      [ $_CT_T -ge $_CT_D ] && [ $_CT_T -lt $_CT_F ]
    else
      # Window across midnight: either bound is enough.
      [ $_CT_T -ge $_CT_D ] || [ $_CT_T -lt $_CT_F ]
    fi
  }

  _CT_T=$(date "+%H")
  _CT_D=$(echo $CT_SCHEDULE | cut -d '-' -f 1)
  _CT_F=$(echo $CT_SCHEDULE | cut -d '-' -f 2)

  if _ct_schedule_open; then
    return 0
  fi

  ct_logger "Outside of hours range: Pause"
  # Look at the clock again once per hour.
  until _ct_schedule_open; do
    sleep 3600
    _CT_T=$(date "+%H")
  done
  ct_logger "Inside of hours range: Resume"
}
#######################################
# Check whether the number of remapped PGs allows the operation to continue.
# Globals:   CT_MAX_REMAPPED (read)
# Arguments: $1 - optional. Empty: threshold is CT_MAX_REMAPPED and the
#                 progress bar is scaled to twice that value.
#                 0: threshold is 0 (wait until nothing is remapped) and the
#                 bar is scaled to CT_MAX_REMAPPED.
#                 Any other number: threshold stays CT_MAX_REMAPPED, the
#                 value is only used as the progress-bar scale.
# Outputs:   a "\r"-prefixed progress line on stdout while above the
#            threshold; ct_notify messages for the blocking conditions
# Returns:   0 when the count is at or below the threshold
#            1 otherwise: above the threshold, after waiting out a very high
#              count (more than 3 x CT_MAX_REMAPPED) or a stall reported by
#              _check_variation_remapped, or when the count is unreadable
#######################################
_ct_test_remapped(){
  local threshold=$CT_MAX_REMAPPED
  local very_high=$((CT_MAX_REMAPPED * 3))
  local bar_max current headroom

  if [ -z "$1" ]; then
    bar_max=$((CT_MAX_REMAPPED * 2))
  elif [[ $1 -eq 0 ]]; then
    threshold=0
    # This branch used to leave the bar scale unset, handing _ct_progress an
    # empty maximum. With a threshold of 0 the headroom below runs from 0 to
    # CT_MAX_REMAPPED, so that is the natural scale.
    bar_max=$CT_MAX_REMAPPED
  else
    bar_max=$1
  fi

  current=$(ct_get_current_remapped)
  # An empty or non-numeric reading (failed query) would compare as 0 and be
  # taken for a healthy cluster; report "not ready" so the caller retries.
  case $current in
    '' | *[!0-9]*)
      printf 'Unable to read the number of remapped PGs (got "%s")\n' "$current" >&2
      return 1
      ;;
  esac

  if [[ $current -gt $very_high ]]; then
    ct_notify "Currently remapped PGs is very high"
    while [[ $current -gt $very_high ]]; do
      sleep 60
      current=$(ct_get_current_remapped)
    done
    return 1
  elif _check_variation_remapped ; then
    ct_notify "Remapped PG don't change for ~ 10 minutes"
    while _check_variation_remapped; do
      sleep 60
      # Keep sampling so the history can show a change.
      current=$(ct_get_current_remapped)
    done
    return 1
  elif [[ $current -gt $threshold ]]; then
    # Headroom shrinks as the count rises above the threshold; never negative.
    headroom=$((CT_MAX_REMAPPED - (current - threshold)))
    if [ "$headroom" -lt 0 ]; then
      headroom=0
    fi
    printf '\r[%s] ==> Current remapped > Max remapped (%s>%s) ' \
      "$(_ct_progress "$headroom" "$bar_max")" "$current" "$threshold"
    return 1
  fi
  return 0
}
#######################################
# Print a text progress bar: '#' for the part done, '_' for the part left.
# One character stands for max/64 units (at least 1). Each part always
# contains at least one character, so for max >= 64 the bar is about 66
# characters wide.
# Arguments: $1 - current value (default 0)
#            $2 - maximum value (default 64)
# Outputs:   the bar on stdout, without a trailing newline
#######################################
_ct_progress() {
  local current=${1:-0}
  local max=${2:-64}
  local step=$((max / 64))
  local done_len left_len pad

  # max < 64 used to give a step of 0, i.e. a zero increment for seq.
  if [ "$step" -lt 1 ]; then
    step=1
  fi

  # Same lengths as counting 0..current and current..max in steps of `step`
  # (bounds included), with a minimum of one character per part.
  if [ "$current" -ge 0 ]; then
    done_len=$((current / step + 1))
  else
    done_len=1
  fi
  if [ "$max" -ge "$current" ]; then
    left_len=$(((max - current) / step + 1))
  else
    left_len=1
  fi

  printf -v pad '%*s' "$done_len" ''
  printf '%s' "${pad// /#}"
  printf -v pad '%*s' "$left_len" ''
  printf '%s' "${pad// /_}"
}
#######################################
# Report a critical event through ct_logger.
# When CT_EXIT_ON_ERROR is non-empty the event is fatal: the cursor is
# switched back on, an empty line is printed and the script exits with
# status 1. Otherwise the event is only logged as a pause.
# Globals:   CT_EXIT_ON_ERROR (read)
# Arguments: $1 - message
# Returns:   the status of ct_logger in the non-fatal case
#######################################
ct_notify() {
  # Non-fatal mode: log and hand control back to the caller.
  if [ -z "$CT_EXIT_ON_ERROR" ]; then
    ct_logger "Critical -> $1: Script paused"
    return
  fi

  # Fatal mode.
  ct_logger "Critical -> $1: Script stopped"
  setterm -cursor on
  echo
  exit 1
}
#######################################
# Print a message and send it to syslog, prefixed with the script name.
# Arguments: $1 - message
# Outputs:   the message, unchanged, on stdout
# Returns:   the status of the `logger` command
#######################################
ct_logger() {
  # Quoted printf: an unquoted echo collapsed runs of spaces and expanded
  # glob characters contained in the message.
  printf '%s\n' "$1"
  # ${0##*/} is the script name without its directory.
  logger -i "${0##*/}: $1"
}
#######################################
# Block until the cluster is ready for the next step.
# Runs the OSD-down, temperature and schedule checks once, then polls
# _ct_test_remapped every 10 seconds until it succeeds, and finally prints a
# newline.
# Arguments: $1 - optional, forwarded to _ct_test_remapped
#######################################
ct_healthy_wait() {
  local check

  for check in _ct_test_osd_down _ct_test_temp _ct_test_schedule; do
    "$check"
  done

  # $1 is deliberately unquoted: when it is empty, no argument is passed.
  until _ct_test_remapped $1; do
    sleep 10
  done
  echo
}
# When CT_WAIT is already set at this point, wait until no PG is remapped
# before going on (argument 0 = threshold of zero).
# NOTE(review): ct_help, which sets CT_WAIT for -w, is not called in the
# visible part of the file, so here CT_WAIT can only come from the caller or
# the environment - confirm this is intended.
if [ -n "$CT_WAIT" ]; then
  ct_healthy_wait 0
fi
|