File: _ct-common

package info (click to toggle)
ceph-tools 0.0.40
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 112 kB
  • sloc: python: 703; sh: 626; makefile: 15
file content (285 lines) | stat: -rw-r--r-- 7,391 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# Terminal setup for the progress display: hide the cursor and stop the
# terminal from echoing control characters such as ^C.
# Both commands only make sense on an interactive terminal, so they are
# skipped when stdout / stdin is not a tty (cron, pipes); stty otherwise
# fails with "Inappropriate ioctl for device".
if [ -t 1 ]; then
  setterm -cursor off
fi

if [ -t 0 ]; then
  stty -echoctl # hide ^C
fi

# Exit handler: restore the cursor, remove the temporary history file and
# leave the script.
# Installed by the trap on EXIT/HUP/INT/TERM (called without argument) and
# also called directly with an explicit status.
# Param 1 (optional): exit status; a value > 0 sends an error notification
ct_on_exit() {
    local _ct_status=${1:-}
    setterm -cursor on
    # The sample history is created with mktemp and was never cleaned up
    if [ -n "${_CT_AVG_FILE:-}" ]; then
      rm -f -- "$_CT_AVG_FILE"
    fi
    if [ -n "$_ct_status" ]; then
      if [ "$_ct_status" -gt 0 ]; then
        ct_notify "$(basename -- "$0") Finish with errors"
      fi
      exit "$_ct_status"
    fi
    # No explicit status (trap invocation): plain exit, as before
    exit
}

# Route normal termination and the HUP/INT/TERM signals through ct_on_exit
trap ct_on_exit TERM INT HUP EXIT

# Baseline: ids of the OSDs already down when the script starts, sorted
# numerically; _ct_test_osd_down later compares the live list against it
ceph osd tree down \
  | awk '/osd\./ {print $1}' \
  | sort -n -o /tmp/OSDs_Down

# Defaults and help texts for the common options parsed by ct_help.
CT_SCRIPT_NAME=$0
CT_MAX_REMAPPED=64   # remapped-PG threshold (-M)
CT_OSD_DOWN=0        # 1 = pause when the list of down OSDs changes (-d)
CT_MAX_TEMP=70       # compared with hwmon temp1_input / 1000 (-t)
# Option summary. The wording follows what ct_help actually parses:
# -w is a plain flag (no value) and the long options are spelled as in the
# case patterns of ct_help.
CT_HELP_MSG="
  -h, --help                show this help message and exit
  -M MAX_REMAPPED, --max_remapped MAX_REMAPPED
                            Increasing PG_NUM waits until there is less 'MAX_REMAPPED' in progress to resume,
                          default: $CT_MAX_REMAPPED
  -d, --osd_down            Suspend operation when an OSD is down
  -k, --exit_on_critical    Exit script if critical event
  -w, --wait_for_nobackfill
                            Wait to start until the cluster has no more backfill
  -t MAX_TEMP, --max_temp MAX_TEMP
                            Suspend operation if server Temp >= MAX_TEMP
  -S SCHEDULE, --schedule SCHEDULE
                            Schedule, eg : 21-07 for 21h to 07h or 08-16 for 8h to 16h
"


CT_HELP_EXAMPLE="-M 100 -d -k -w -t 45 -S 21-07"

CT_HELP_DESCR="[-M|--max_remapped MAX_REMAPPED] [-w|--wait_for_nobackfill] [-d|--osd_down]"

# Show the help text and leave with status 1 when a value is below a minimum.
# Param 1: value to check (presumably the caller's argument count)
# Param 2: minimum accepted value
ct_help_min () {
    [ $1 -lt $2 ] || return 0
    help_msg
    ct_on_exit 1
}

# Parse the common command-line options and set the matching CT_* globals.
# Parsing stops at the first argument that is not one of the options below.
# Params: the script arguments
# Globals written: CT_MAX_REMAPPED, CT_OSD_DOWN, CT_EXIT_ON_ERROR,
#                  CT_MAX_TEMP, CT_WAIT, CT_SCHEDULE, _shift
# Returns: the number of arguments consumed (presumably used by callers to
#          shift them away)
# An option that needs a value but is the last argument is reported as an
# error (help text, ct_on_exit 1) instead of silently replacing the default
# with an empty string and counting an argument that does not exist.
ct_help () {
  _shift=0
  while [ $# -ge 1 ]; do
    case "$1" in
      -h|--help)
          help_msg
          ct_on_exit 0
          shift 1
          ;;
      -M|--max_remapped|-t|--max_temp|-S|--schedule)
          if [ $# -lt 2 ]; then
            echo "$(basename -- "$0"): option $1 requires a value" >&2
            help_msg
            ct_on_exit 1
            return $_shift
          fi
          case "$1" in
            -M|--max_remapped) CT_MAX_REMAPPED=$2 ;;
            -t|--max_temp)     CT_MAX_TEMP=$2 ;;
            -S|--schedule)     CT_SCHEDULE=$2 ;;
          esac
          _shift=$((_shift + 2))
          shift 2
          ;;
      -d|--osd_down)
          CT_OSD_DOWN=1
          _shift=$((_shift + 1))
          shift 1
          ;;
      -k|--exit_on_critical)
          CT_EXIT_ON_ERROR=1
          _shift=$((_shift + 1))
          shift 1
          ;;
      -w|--wait_for_nobackfill)
          CT_WAIT=1
          _shift=$((_shift + 1))
          shift 1
          ;;
      *)
          # First argument that is not a common option: leave it to the caller
          return $_shift
          ;;
    esac
  done
  return $_shift
}

# History file of remapped-PG samples (appended by ct_get_current_remapped,
# read by _check_variation_remapped). Abort early when it cannot be created:
# every later append or read would otherwise use an empty file name.
_CT_AVG_FILE=$(mktemp -t ct_avg_file.XXXXX) || {
  echo "$(basename -- "$0"): unable to create temporary file" >&2
  exit 1
}

# Detect a stalled rebalance from the sample history in $_CT_AVG_FILE.
# Each line of the file is expected to be "<epoch seconds> <remapped PGs>".
# Samples older than 10 minutes are first removed from the head of the file.
# Returns: 0 when at least 30 valid samples remain and they all carry the
#          same remapped count (no progress), which is how both callers in
#          _ct_test_remapped interpret success;
#          1 otherwise (count changed, or not enough data to decide).
_check_variation_remapped() {
  local _past _stale=0 _time _remap _first="" _samples=0
  [ -f "$_CT_AVG_FILE" ] || return 1
  _past=$(($(date "+%s") - 600))

  # Count the leading samples that fell out of the 10 minute window
  while read -r _time _remap; do
    if [[ $_time =~ ^[0-9]+$ ]] && [ "$_time" -lt "$_past" ]; then
      _stale=$((_stale + 1))
    else
      break
    fi
  done < "$_CT_AVG_FILE"

  # Only prune when there is something to prune: with GNU sed the range
  # "1,0d" still deletes line 1
  if [ "$_stale" -gt 0 ]; then
    sed -i "1,${_stale}d" "$_CT_AVG_FILE"
  fi

  while read -r _time _remap; do
    # Ignore samples without a numeric count (e.g. a failed ceph/jq call)
    [[ $_remap =~ ^[0-9]+$ ]] || continue
    if [ "$_samples" -eq 0 ]; then
      _first=$_remap
    elif [ "$_remap" -ne "$_first" ]; then
      return 1
    fi
    _samples=$((_samples + 1))
  done < "$_CT_AVG_FILE"

  [ "$_samples" -ge 30 ]
}

# Sample the current number of remapped PGs.
# Appends "<epoch seconds> <remapped PGs>" to $_CT_AVG_FILE and prints the
# remapped count on stdout.
# Globals written: _DATE, _REMAP
ct_get_current_remapped() {
  _DATE=$(date "+%s")
  # Accept both the nested status layout (.osdmap.osdmap) and a flat one
  # (.osdmap). NOTE(review): the flat layout is assumed for newer Ceph
  # releases - confirm against the deployed version
  _REMAP=$(ceph -s -f json | jq '.osdmap | (.osdmap // .) | .num_remapped_pgs')
  # The count was previously written as ${REMAP} (never set), which left the
  # history file without any remapped value
  echo "${_DATE} ${_REMAP}" >> "$_CT_AVG_FILE"
  echo "${_REMAP}"
}

# Pause while the list of down OSDs differs from the start-up baseline.
# Only active when CT_OSD_DOWN is 1. The current list is written to
# /tmp/OSDs_Down_now and compared with /tmp/OSDs_Down; on a difference the
# operator is told how to reset the baseline, a notification is sent and the
# list is re-read every 60 seconds until both files match again.
_ct_test_osd_down() {
  [ $CT_OSD_DOWN -eq 1 ] || return 0

  ceph osd tree down | awk '/osd\./ {print $1}' | sort -n -o /tmp/OSDs_Down_now
  if diff -q /tmp/OSDs_Down /tmp/OSDs_Down_now > /dev/null; then
    return 0
  fi

  printf '%s\n' \
    "OSDs down, operation paused" \
    "For resume," \
    "You can reinit OSDs Down list by execute" \
    "'ceph osd tree down |grep \"osd\\.\"| awk '{print \$1}'|sort -n -o /tmp/OSDs_Down'"
  ct_notify "Detected OSD Down"
  until diff -q /tmp/OSDs_Down /tmp/OSDs_Down_now > /dev/null; do
    sleep 60
    ceph osd tree down | awk '/osd\./ {print $1}' | sort -n -o /tmp/OSDs_Down_now
  done
  ct_logger "OSDs OK: Resume"
}

# Print the value of sensor file $1 divided by 1000 (hwmon exposes
# temp*_input in millidegrees Celsius); fail when the file cannot be read or
# does not hold an integer.
_ct_read_temp () {
  local _ct_raw=""
  { read -r _ct_raw < "$1"; } 2>/dev/null
  [[ $_ct_raw =~ ^-?(0|[1-9][0-9]*)$ ]] || return 1
  echo $((_ct_raw / 1000))
}

# Pause while a hardware sensor reports a temperature >= CT_MAX_TEMP.
# Looks at temp1_input of every hwmon device. For the first sensor found at
# or above the limit it logs the pause, polls that sensor every 60 seconds
# until it is below the limit, logs the resume and returns without checking
# the remaining sensors.
# Param 1 (optional): hwmon base directory, default /sys/class/hwmon
# Globals written: _CT_TEMP
_ct_test_temp () {
  local _ct_hwmon_dir=${1:-/sys/class/hwmon}
  local _ct_sensor
  # Glob instead of parsing ls; an unmatched glob is filtered by the -f test
  for _ct_sensor in "$_ct_hwmon_dir"/*/temp1_input; do
    [ -f "$_ct_sensor" ] || continue
    _CT_TEMP=$(_ct_read_temp "$_ct_sensor") || continue
    if [[ $_CT_TEMP -ge $CT_MAX_TEMP ]]; then
      ct_logger "High temperature (${_CT_TEMP} > ${CT_MAX_TEMP}): Pause"
      while [[ $_CT_TEMP -ge $CT_MAX_TEMP ]]; do
        sleep 60
        # A sensor that becomes unreadable ends the pause rather than
        # looping forever on a stale value
        _CT_TEMP=$(_ct_read_temp "$_ct_sensor") || break
      done
      ct_logger "Temperature OK: Resume"
      return
    fi
  done
}

# Succeed when hour $1 lies inside the window [$2, $3). A window whose begin
# is greater than its end wraps around midnight (e.g. 21 to 7).
# All three values are plain decimal integers.
_ct_hour_in_range () {
  if (( $2 < $3 )); then
    (( $1 >= $2 && $1 < $3 ))
  else
    (( $1 >= $2 || $1 < $3 ))
  fi
}

# Pause while the current hour is outside the CT_SCHEDULE window.
# CT_SCHEDULE has the form BEGIN-END in hours (e.g. 21-07 or 08-16); nothing
# is done when it is empty.
# A malformed or empty window is reported and ends the script through
# ct_on_exit 1: it used to make every comparison fail, which paused the
# script forever.
# While paused the clock is re-read every 60 seconds, like the other pause
# loops, so the work resumes close to the start of the window.
_ct_test_schedule () {
  [ -n "$CT_SCHEDULE" ] || return 0

  local _ct_begin _ct_end _ct_hour _ct_valid=1
  if [[ $CT_SCHEDULE =~ ^([0-9]{1,2})-([0-9]{1,2})$ ]]; then
    # 10# forces base 10 so that "08" and "09" are not read as octal
    _ct_begin=$((10#${BASH_REMATCH[1]}))
    _ct_end=$((10#${BASH_REMATCH[2]}))
    if (( _ct_begin > 23 || _ct_end > 24 || _ct_begin == _ct_end )); then
      _ct_valid=0
    fi
  else
    _ct_valid=0
  fi
  if (( ! _ct_valid )); then
    ct_logger "Invalid schedule '${CT_SCHEDULE}' (expected HH-HH, eg 21-07)"
    ct_on_exit 1
    return 1
  fi

  _ct_hour=$((10#$(date "+%H")))
  if ! _ct_hour_in_range "$_ct_hour" "$_ct_begin" "$_ct_end"; then
    ct_logger "Outside of hours range: Pause"
    until _ct_hour_in_range "$_ct_hour" "$_ct_begin" "$_ct_end"; do
      sleep 60
      _ct_hour=$((10#$(date "+%H")))
    done
    ct_logger "Inside of hours range: Resume"
  fi
  return
}

# Check the number of remapped PGs against the allowed threshold.
# Param 1 (optional):
#   empty  - threshold CT_MAX_REMAPPED, progress bar scaled to 2 x CT_MAX_REMAPPED
#   0      - threshold 0 (wait until nothing is remapped), default bar scale
#   other  - threshold CT_MAX_REMAPPED, progress bar scaled to this value
# Returns: 0 when the remapped count is within the threshold,
#          1 when the caller has to wait and call again. Before returning 1
#          the function itself blocks while the count exceeds
#          3 x CT_MAX_REMAPPED or while _check_variation_remapped succeeds.
_ct_test_remapped(){
  local _ct_mr=$CT_MAX_REMAPPED
  local _ct_very_high=$((CT_MAX_REMAPPED * 3))
  # Always initialised: the "0" case used to leave the bar scale unset, so
  # _ct_progress was called without its second argument
  local _ct_max=$((CT_MAX_REMAPPED * 2))
  local _ct_cbf _ct_cu
  if [ -n "$1" ]; then
    if [[ $1 -eq 0 ]]; then
      _ct_mr=0
    else
      _ct_max=$1
    fi
  fi
  _ct_cbf=$(ct_get_current_remapped)
  if [[ $_ct_cbf -gt $_ct_very_high ]]; then
    ct_notify "Currently remapped PGs is very high"
    while [[ $_ct_cbf -gt $_ct_very_high ]]; do
      sleep 60
      _ct_cbf=$(ct_get_current_remapped)
    done
    return 1
  elif _check_variation_remapped ; then
    ct_notify "Remapped PG don't change for ~ 10 minutes"
    while _check_variation_remapped; do
      sleep 60
      _ct_cbf=$(ct_get_current_remapped)
    done
    return 1
  elif [[ $_ct_cbf -gt $_ct_mr ]]; then
    # Remaining headroom shown on the bar, never negative
    _ct_cu=$((CT_MAX_REMAPPED - (_ct_cbf - _ct_mr)))
    if [ "$_ct_cu" -lt 0 ]; then
      _ct_cu=0
    fi
    echo -ne "\r[$(_ct_progress "${_ct_cu}" "${_ct_max}")] ==> Current remapped > Max remapped (${_ct_cbf}>${_ct_mr})      "
    return 1
  fi
  return 0
}

# Print a 64 character progress bar ('#' done, '_' remaining) on stdout,
# without a trailing newline.
# Param 1: current value (values above the maximum fill the bar)
# Param 2: maximum value, default 64
# The bar is computed proportionally, so it also works for a maximum below
# 64; the former seq-based version derived a step of 0 in that case, which
# seq rejects.
_ct_progress() {
  local _ct_width=64
  local _ct_current=${1:-0}
  local _ct_max=${2:-64}
  local _ct_filled _ct_done _ct_todo
  # Fall back to safe values on missing or non-numeric input
  [[ $_ct_current =~ ^[0-9]+$ ]] || _ct_current=0
  [[ $_ct_max =~ ^[0-9]+$ ]] || _ct_max=64
  # 10# forces base 10 so that a leading zero is not read as octal
  _ct_current=$((10#$_ct_current))
  _ct_max=$((10#$_ct_max))
  (( _ct_max > 0 )) || _ct_max=64
  (( _ct_current <= _ct_max )) || _ct_current=$_ct_max
  _ct_filled=$((_ct_current * _ct_width / _ct_max))
  # Build runs of spaces of the right length, then turn them into bar glyphs
  printf -v _ct_done '%*s' "$_ct_filled" ''
  printf -v _ct_todo '%*s' "$((_ct_width - _ct_filled))" ''
  printf '%s%s' "${_ct_done// /#}" "${_ct_todo// /_}"
}

# Report a critical event.
# Param 1: message
# With CT_EXIT_ON_ERROR set, the event is logged as "Script stopped", the
# cursor is restored and the script exits with status 1; otherwise it is
# logged as "Script paused" and control returns to the caller.
ct_notify() {
  if [ -z "$CT_EXIT_ON_ERROR" ]; then
    ct_logger "Critical ->  $1: Script paused"
    return
  fi
  ct_logger "Critical -> $1: Script stopped"
  setterm -cursor on
  echo
  exit 1
}

# Print a message on stdout and send it to syslog, prefixed with the script
# name.
# Param 1: message
ct_logger() {
  # Quoted on purpose: an unquoted $1 is re-split and glob-expanded by the
  # shell, which collapses repeated spaces and expands any '*' in the message
  printf '%s\n' "$1"
  logger -i "$(basename -- "$0"): $1"
}

# Block until the cluster checks allow the operation to continue.
# Runs the OSD-down, temperature and schedule checks once, then repeats the
# remapped-PG check every 10 seconds until it succeeds, and finally prints a
# newline.
# Param 1: forwarded unchanged to _ct_test_remapped
ct_healthy_wait() {
  local _ct_check
  for _ct_check in _ct_test_osd_down _ct_test_temp _ct_test_schedule; do
    "$_ct_check"
  done
  until _ct_test_remapped $1; do
    sleep 10
  done
  echo
}
 

# Optional start-up gate: when CT_WAIT is non-empty, run ct_healthy_wait with
# argument 0 before going on.
# NOTE(review): ct_help, which sets CT_WAIT for -w, is not called before this
# point in this file - confirm where CT_WAIT is expected to come from
[ -z "$CT_WAIT" ] || ct_healthy_wait 0