File: gr_autorejoin_monitoring_test.test

package info (click to toggle)
mysql-8.0 8.0.43-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 1,273,924 kB
  • sloc: cpp: 4,684,605; ansic: 412,450; pascal: 108,398; java: 83,641; perl: 30,221; cs: 27,067; sql: 26,594; sh: 24,181; python: 21,816; yacc: 17,169; php: 11,522; xml: 7,388; javascript: 7,076; makefile: 2,194; lex: 1,075; awk: 670; asm: 520; objc: 183; ruby: 97; lisp: 86
file content (232 lines) | stat: -rw-r--r-- 7,909 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
################################################################################
#
# The auto-rejoin process is a mechanism that allows a server that left the
# group due to a flaky network (either on its side or on the other members'
# side) to try to rejoin up to group_replication_autorejoin_tries times.
#
# This test shall verify that the auto-rejoin process is monitorable through
# thread stage events, i.e. it will verify that it is possible to:
#   * Observe that an auto-rejoin is running or not
#   * See the number of retries being attempted
#   * Infer the time remaining for the next retry
#   * See the timestamp of the last retry
#   * See the number of times the auto-rejoin process was triggered
#
# Test:
# 0) The test requires three servers.
# 1) Provoke a member expel.
# 2) Verify that the auto-rejoin process is ongoing.
# 2.1) Verify that we can see the number of retries being incremented until
#      depleted.
# 2.2) Verify that we can infer the time remaining until the next retry.
# 3) Verify that we see the timestamp of the last auto-rejoin process.
# 4) Verify that we can see the number of times the auto-rejoin process was
#    run.
# 5) Cleanup.
#
################################################################################
#
# Test requirements and group setup.
#
# The test drives the auto-rejoin thread through DEBUG_SYNC signals and
# @@GLOBAL.DEBUG flags (see the steps below), hence the debug-sync requirement.
#
--source include/have_debug_sync.inc
# NOTE(review): presumably flagged as a big test because every retry waits on
# the (test-shortened) retry interval - confirm.
--source include/big_test.inc
# NOTE(review): presumably restricts the test to Linux hosts - confirm against
# the include file.
--source include/linux.inc
#
# We'll need to verify the remaining time between each retry and, with valgrind,
# we may easily go over our estimated time, so we have to disable this test
# from running in valgrind jobs.
#
--source include/not_valgrind.inc
#
# We manually start the group because we need to set the rejoin timeout before
# the START GROUP_REPLICATION command. In this test that is done through the
# debug flags set on server1 in step 0, right before the group is bootstrapped.
#
--source include/have_group_replication_plugin.inc
--let $rpl_skip_group_replication_start= 1
# Step 0 of the test plan: three servers are required.
--let $rpl_server_count = 3
--source include/group_replication.inc

--echo
--echo ####
--echo # 0) The test requires three servers.
--echo ####
--echo
--let $rpl_connection_name = server1
--source include/rpl_connection.inc

# Register the error-log messages this test expects on server1. Binary logging
# is switched off for the session around the calls and switched back on after.
SET sql_log_bin = 0;
call mtr.add_suppression("Member was expelled from the group due to network failures, changing member status to ERROR.");
call mtr.add_suppression("The server was automatically set into read only mode after an error was detected.");
call mtr.add_suppression("\\[GCS\\] read failed");
call mtr.add_suppression("Started auto-rejoin procedure attempt*");
call mtr.add_suppression("Auto-rejoin procedure attempt*");
call mtr.add_suppression("Unable to confirm whether the server has left the group or not. Check performance_schema.replication_group_members to check group membership information.");
SET sql_log_bin = 1;

# NOTE(review): presumably creates the monitoring helpers used below
# (IS_AUTOREJOIN_RUNNING, GET_NUMBER_RETRIES, GET_TIME_UNTIL_NEXT_RETRY,
# GET_LAST_AUTOREJOIN, GET_COUNT_AUTOREJOIN) - confirm against the include file.
--source include/gr_autorejoin_monitoring.inc

# Save the current debug settings (restored in the cleanup step) and arm the
# debug flags on server1 before group replication is started.
# NOTE(review): judging by their names and by the DEBUG_SYNC handshakes in
# step 2, the "stop_before_*" flags make the auto-rejoin thread pause at sync
# points, "fail_rejoin" makes every attempt fail and "long_retry" selects the
# test retry interval - confirm against the plugin source.
SET @debug_saved = @@GLOBAL.DEBUG;
SET @@GLOBAL.DEBUG='+d,group_replication_rejoin_long_retry';
SET @@GLOBAL.DEBUG='+d,group_replication_fail_rejoin';
SET @@GLOBAL.DEBUG='+d,group_replication_stop_before_rejoin_loop';
SET @@GLOBAL.DEBUG='+d,group_replication_stop_before_rejoin';

# server1 bootstraps the group; server2 and server3 then join it.
--source include/start_and_bootstrap_group_replication.inc

--let $rpl_connection_name = server2
--source include/rpl_connection.inc

--source include/start_group_replication.inc

--let $rpl_connection_name = server3
--source include/rpl_connection.inc

--source include/start_group_replication.inc

--echo
--echo ####
--echo # 1) Provoke a member expel.
--echo ####
--echo
# All remaining checks are executed on server1, the member that gets expelled.
--let $rpl_connection_name = server1
--source include/rpl_connection.inc

# First, enable auto-rejoin. The value 3 must match the number of iterations
# of the retry loop in step 2 and $num_retries in step 3.
SET GLOBAL group_replication_autorejoin_tries = 3;

#
# Save timestamp before we start the auto-rejoin process, so we can validate the
# time it took. The value is in seconds (UNIX_TIMESTAMP()) and is taken before
# the expel, so the expel itself is included in the measured interval.
#
--let $timestamp_before_autorejoin = `SELECT UNIX_TIMESTAMP()`

# Force expel on member 1: pass this server's UUID to the expel helper.
--let $member_id = `SELECT @@GLOBAL.server_uuid`
--source include/gr_expel_member_from_group.inc

--echo
--echo ####
--echo # 2) Verify that the auto-rejoin process is ongoing.
--echo ####
--echo
# Block until the auto-rejoin thread signals that it is about to enter its
# retry loop.
SET DEBUG_SYNC = "now WAIT_FOR signal.autorejoin_entering_loop";

# Verify that it is currently running
--let $assert_text = Auto-rejoin should be running
--let $assert_cond = [SELECT IS_AUTOREJOIN_RUNNING()] = TRUE
--source include/assert.inc

# Let the auto-rejoin thread proceed into the retry loop.
SET DEBUG_SYNC = "now SIGNAL signal.autorejoin_enter_loop";

#
# We go through each retry to observe the retry counter and remaining time being
# updated. The loop bound (3) matches group_replication_autorejoin_tries set in
# step 1. Each iteration waits for the thread to reach its per-retry sync point,
# runs the checks and then releases the thread.
#
--let $num_retries = 1
while ($num_retries <= 3)
{
  SET DEBUG_SYNC = "now WAIT_FOR signal.autorejoin_waiting";

  # Verify that it is currently running
  --let $assert_text = Auto-rejoin should be running
  --let $assert_cond = [SELECT IS_AUTOREJOIN_RUNNING()] = TRUE
  --source include/assert.inc

  # While the rejoin is in progress the member must stay read-only.
  # No trailing ';' here: "--let" is terminated by the end of the line, so a
  # ';' would become part of the condition text.
  --let $assert_text = super_read_only should be enabled
  --let $assert_cond = [SELECT @@GLOBAL.super_read_only] = 1
  --source include/assert.inc

  --echo
  --echo ####
  --echo # 2.1) Verify that we can see the number of retries being incremented
  --echo # until depleted.
  --echo ####
  --echo
  --let $assert_text = We should have attempted $num_retries rejoins
  --let $assert_cond = [SELECT GET_NUMBER_RETRIES()] = $num_retries
  --source include/assert.inc

  --echo
  --echo ####
  --echo # 2.2) Verify that we can infer the time remaining until the next
  --echo # retry.
  --echo ####
  --echo
  # Basically the remaining time for the next retry should be under 5 minutes
  # Note that even though in production we sleep for 5 minutes, for testing
  # purposes we lower that number to 1 minute, which is why the condition below
  # checks against 60 seconds although the assert text still mentions 5 mins.
  # NOTE(review): the assert text is left unchanged on purpose, on the
  # assumption that assert.inc prints it into the recorded result.
  --let $assert_text = Time remaining should be less than 5 mins
  --let $assert_cond = [SELECT GET_TIME_UNTIL_NEXT_RETRY(60)] <= 60
  --source include/assert.inc

  # Unblock rejoin loop
  SET DEBUG_SYNC = "now SIGNAL signal.autorejoin_continue";

  --inc $num_retries
}

# Verify that the auto-rejoin process has terminated
--let $wait_condition = SELECT IS_AUTOREJOIN_RUNNING() = FALSE
--source include/wait_condition_or_abort.inc

--echo
--echo ####
--echo # 3) Verify that we see the timestamp of the last auto-rejoin process.
--echo ####
--echo
# NOTE(review): assumes GET_LAST_AUTOREJOIN() returns a UNIX timestamp in
# seconds, comparable with $timestamp_before_autorejoin taken in step 1 -
# confirm against include/gr_autorejoin_monitoring.inc.
--let $last_autorejoin_time = `SELECT GET_LAST_AUTOREJOIN()`

#
# We set an upper bound for execution time comprising of the number of retries
# times the sleep interval and add a small overhead factor (one extra sleep
# interval).
#
--let $num_retries = 3
--let $sleep_interval = 60
--let $overhead = $sleep_interval

#
# Then we calculate the time we are expecting the auto-rejoin process to run, so
# that we can validate if the auto-rejoin process execution time deviates from
# this expected interval.
#
# Both sides of the comparison below must be durations in seconds. The bound
# must therefore be built from $expected_spent_time and not from an absolute
# timestamp, otherwise the check can never fail.
#
--expr $expected_spent_time = $num_retries * $sleep_interval
--expr $max_time = $expected_spent_time + $overhead
--expr $actual_time_delta = $last_autorejoin_time - $timestamp_before_autorejoin

# If it deviates too much (i.e. more than the overhead), we fail the test.
if ($actual_time_delta > $max_time)
{
  --echo The auto-rejoin process took longer than expected or its timestamp is wrong
  --die
}

--echo
--echo ####
--echo # 4) Verify that we can see the number of times the auto-rejoin process
--echo # was run.
--echo ####
--echo
# We should have already one auto-rejoin there
--let $assert_text = There should only be one auto-rejoin process triggered
--let $assert_cond = [SELECT GET_COUNT_AUTOREJOIN()] = 1
--source include/assert.inc

#
# Disarm the debug flags armed in step 0 before group replication is restarted.
# The flag names must match the ones used with '+d' exactly; removing a name
# that was never set leaves the real flag active.
# group_replication_fail_rejoin is not removed here; it is cleared when
# @@GLOBAL.DEBUG is restored from @debug_saved in the cleanup step.
# NOTE: these statements are echoed into the test output, so the recorded
# .result file has to be re-recorded after the flag name correction below.
#
SET @@GLOBAL.DEBUG='-d,group_replication_stop_before_rejoin';
SET @@GLOBAL.DEBUG='-d,group_replication_stop_before_rejoin_loop';
SET @@GLOBAL.DEBUG='-d,group_replication_rejoin_long_retry';

#
# Reset GR so that the member returns to a clean slate and disables
# super_read_only mode
#
--source include/stop_group_replication.inc
--source include/start_group_replication.inc

--echo
--echo ####
--echo # 5) Cleanup.
--echo ####
--echo
# Restore the debug settings saved in step 0; this drops any debug flag that is
# still set at this point.
SET @@GLOBAL.DEBUG = @debug_saved;
# Put the auto-rejoin setting changed in step 1 back to its default.
SET GLOBAL group_replication_autorejoin_tries = default;
# NOTE(review): presumably the counterpart of gr_autorejoin_monitoring.inc that
# removes the monitoring helpers - confirm against the include file.
--source include/gr_end_autorejoin_monitoring.inc
--source include/group_replication_end.inc