# Integration tests for the FAILOVER command, run against three servers
# (referred to below as node 0, node 1 and node 2).
start_server {tags {"failover external:skip"} overrides {save {}}} {
start_server {overrides {save {}}} {
start_server {overrides {save {}}} {
# Capture the client handle, host, port and pid of each of the three servers
# (srv levels 0, -1 and -2) under the names node_0, node_1 and node_2 so the
# tests below can refer to them directly.
lassign [list [srv 0 client] [srv 0 host] [srv 0 port] [srv 0 pid]] \
    node_0 node_0_host node_0_port node_0_pid
lassign [list [srv -1 client] [srv -1 host] [srv -1 port] [srv -1 pid]] \
    node_1 node_1_host node_1_port node_1_pid
lassign [list [srv -2 client] [srv -2 host] [srv -2 port] [srv -2 pid]] \
    node_2 node_2_host node_2_port node_2_pid
# Assert that three nodes report the same DEBUG DIGEST.
#
# n1, n2, n3 - client handles of the nodes to compare.
#
# The digests are compared pairwise (n1 vs n2, then n2 vs n3); each
# comparison fetches fresh digests from both nodes involved.
proc assert_digests_match {n1 n2 n3} {
    foreach {left right} [list $n1 $n2 $n2 $n3] {
        assert_equal [$left debug digest] [$right debug digest]
    }
}
test {failover command fails without connected replica} {
    # Replication has not been configured yet at this point in the file, so
    # the command is expected to come back with an error reply.
    catch {$node_0 failover to $node_1_host $node_1_port} err
    set got_error [string match "ERR*" $err]
    if {!$got_error} {
        fail "failover command succeeded when replica not connected"
    }
}
test {setup replication for following tests} {
    # Attach node 1 and node 2 to node 0 first, then wait for each of them
    # to report a completed sync.
    foreach replica [list $node_1 $node_2] {
        $replica replicaof $node_0_host $node_0_port
    }
    foreach replica [list $node_1 $node_2] {
        wait_for_sync $replica
    }

    # The replicas being synced is not enough: node 0 itself must list both
    # of them as online in its INFO replication output.
    wait_for_condition 50 100 {
        [string match "*slave0:*,state=online*slave1:*,state=online*" [$node_0 info replication]]
    } else {
        fail "replica didn't online in time"
    }
}
test {failover command fails with invalid host} {
    # The literal "invalidhost" as the target host must produce an error reply.
    catch {$node_0 failover to invalidhost $node_1_port} err
    assert_match "ERR*" $err
}
test {failover command fails with invalid port} {
    # The literal "invalidport" as the target port must produce an error reply.
    catch {$node_0 failover to $node_1_host invalidport} err
    assert_match "ERR*" $err
}
test {failover command fails with just force and timeout} {
    # FORCE and TIMEOUT given without a TO <host> <port> target must produce
    # an error reply.
    catch {$node_0 FAILOVER FORCE TIMEOUT 100} err
    assert_match "ERR*" $err
}
test {failover command fails when sent to a replica} {
    # Node 1 was made a replica of node 0 earlier in this file; issuing
    # FAILOVER on it must produce an error reply.
    catch {$node_1 failover to $node_1_host $node_1_port} err
    assert_match "ERR*" $err
}
test {failover command fails with force without timeout} {
    # FORCE given without an accompanying TIMEOUT must produce an error reply.
    catch {$node_0 failover to $node_1_host $node_1_port FORCE} err
    assert_match "ERR*" $err
}
test {failover command to specific replica works} {
    # Baselines for the sync counters of node 1 (srv -1), the failover
    # target; the assertions at the end check the deltas against these.
    set initial_psyncs [s -1 sync_partial_ok]
    set initial_syncs [s -1 sync_full]

    # Generate a delta between primary and replica: node 1 is paused while
    # the write load runs against node 0.
    set load_handler [start_write_load $node_0_host $node_0_port 5]
    pause_process [srv -1 pid]
    wait_for_condition 50 100 {
        [s 0 total_commands_processed] > 100
    } else {
        fail "Node 0 did not accept writes"
    }
    resume_process [srv -1 pid]

    # Execute the failover
    $node_0 failover to $node_1_host $node_1_port

    # Wait for failover to end
    wait_for_condition 50 100 {
        [s 0 master_failover_state] == "no-failover"
    } else {
        fail "Failover from node 0 to node 1 did not finish"
    }

    # stop the write load and make sure no more commands processed
    stop_write_load $load_handler
    wait_load_handlers_disconnected

    # Node 2 is repointed at node 1 explicitly.
    $node_2 replicaof $node_1_host $node_1_port
    wait_for_sync $node_0
    wait_for_sync $node_2

    assert_match *slave* [$node_0 role]
    assert_match *master* [$node_1 role]
    assert_match *slave* [$node_2 role]

    # We should accept psyncs from both nodes, and no full syncs.
    # The full-sync delta is measured against the full-sync baseline
    # (initial_syncs), not the partial-sync one.
    assert_equal [expr {[s -1 sync_partial_ok] - $initial_psyncs}] 2
    assert_equal [expr {[s -1 sync_full] - $initial_syncs}] 0
    assert_digests_match $node_0 $node_1 $node_2
}
test {failover command to any replica works} {
    # Baselines for the sync counters of node 2 (srv -2), the node expected
    # to take over; the assertions at the end check the deltas against these.
    set initial_psyncs [s -2 sync_partial_ok]
    set initial_syncs [s -2 sync_full]

    wait_for_ofs_sync $node_1 $node_2

    # We pause node 0 to make sure node 2 is selected
    pause_process $node_0_pid

    $node_1 set CASE 1
    $node_1 FAILOVER

    # Wait for failover to end
    wait_for_condition 50 100 {
        [s -1 master_failover_state] == "no-failover"
    } else {
        fail "Failover from node 1 to node 2 did not finish"
    }

    resume_process $node_0_pid
    $node_0 replicaof $node_2_host $node_2_port
    wait_for_sync $node_0
    wait_for_sync $node_1

    assert_match *slave* [$node_0 role]
    assert_match *slave* [$node_1 role]
    assert_match *master* [$node_2 role]

    # We should accept psyncs from both nodes, and no full syncs.
    # Both deltas are read from node 2 (srv -2), where the baselines were
    # taken, and the full-sync delta uses the full-sync baseline.
    assert_equal [expr {[s -2 sync_partial_ok] - $initial_psyncs}] 2
    assert_equal [expr {[s -2 sync_full] - $initial_syncs}] 0
    assert_digests_match $node_0 $node_1 $node_2
}
test {failover to a replica with force works} {
    # Sync counter baselines on node 0 (srv 0), the failover target.
    set initial_psyncs [s 0 sync_partial_ok]
    set initial_syncs [s 0 sync_full]

    pause_process $node_0_pid

    # Node 0 is paused, so it will never acknowledge this write
    $node_2 set case 2
    $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE

    # Wait for node 2 to stop waiting for the target to sync and move on to
    # the failover itself
    wait_for_condition 50 100 {
        [s -2 master_failover_state] eq "failover-in-progress"
    } else {
        fail "Failover from node 2 to node 0 did not timeout"
    }

    # Quick check that everyone is a replica, we never want a
    # state where there are two masters.
    assert_match *slave* [$node_1 role]
    assert_match *slave* [$node_2 role]

    resume_process $node_0_pid

    # Wait for failover to end
    wait_for_condition 50 100 {
        [s -2 master_failover_state] eq "no-failover"
    } else {
        fail "Failover from node 2 to node 0 did not finish"
    }

    # Node 1 is repointed at node 0 explicitly.
    $node_1 replicaof $node_0_host $node_0_port
    wait_for_sync $node_1
    wait_for_sync $node_2

    assert_match *master* [$node_0 role]
    assert_match *slave* [$node_1 role]
    assert_match *slave* [$node_2 role]

    # Node 2's log must contain the forced-failover message exactly once.
    assert_equal [count_log_message -2 "time out exceeded, failing over."] 1

    # We should accept both psyncs, although this is the scenario in which
    # we might not, since the target had not caught up.
    assert_equal [expr {[s 0 sync_partial_ok] - $initial_psyncs}] 2
    assert_equal [expr {[s 0 sync_full] - $initial_syncs}] 0
    assert_digests_match $node_0 $node_1 $node_2
}
test {failover with timeout aborts if replica never catches up} {
    # Sync counter baselines on node 0 (srv 0).
    set initial_psyncs [s 0 sync_partial_ok]
    set initial_syncs [s 0 sync_full]

    # Pause the target replica so it never catches up
    pause_process [srv -1 pid]

    $node_0 SET CASE 1
    $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500

    # Wait for failover to end; without FORCE the state is expected to go
    # back to no-failover once the timeout passes.
    wait_for_condition 50 20 {
        [s 0 master_failover_state] eq "no-failover"
    } else {
        fail "Failover from node_0 to replica did not finish"
    }

    resume_process [srv -1 pid]

    # We need to make sure the nodes actually sync back up
    wait_for_ofs_sync $node_0 $node_1
    wait_for_ofs_sync $node_0 $node_2

    # Roles are unchanged: node 0 is still the master.
    assert_match *master* [$node_0 role]
    assert_match *slave* [$node_1 role]
    assert_match *slave* [$node_2 role]

    # Since we never caught up, there should be no syncs
    assert_equal [expr {[s 0 sync_partial_ok] - $initial_psyncs}] 0
    assert_equal [expr {[s 0 sync_full] - $initial_syncs}] 0
    assert_digests_match $node_0 $node_1 $node_2
}
test {failovers can be aborted} {
    # Sync counter baselines on node 0 (srv 0).
    set initial_psyncs [s 0 sync_partial_ok]
    set initial_syncs [s 0 sync_full]

    # Stop replica so it never catches up
    pause_process [srv -1 pid]

    $node_0 SET CASE 2
    $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000

    # Exact comparison, with the live value first and the expected literal
    # second, so a failure reports the two the right way round.
    assert_equal [s 0 master_failover_state] "waiting-for-sync"

    # Sanity check that read commands are still accepted
    $node_0 GET CASE

    $node_0 failover abort
    assert_equal [s 0 master_failover_state] "no-failover"

    resume_process [srv -1 pid]

    # Just make sure everything is still synced
    wait_for_ofs_sync $node_0 $node_1
    wait_for_ofs_sync $node_0 $node_2

    assert_match *master* [$node_0 role]
    assert_match *slave* [$node_1 role]
    assert_match *slave* [$node_2 role]

    # The failover was aborted, so there should be no syncs
    assert_equal [expr {[s 0 sync_partial_ok] - $initial_psyncs}] 0
    assert_equal [expr {[s 0 sync_full] - $initial_syncs}] 0
    assert_digests_match $node_0 $node_1 $node_2
}
test {failover aborts if target rejects sync request} {
    # Sync counter baselines on node 0 (srv 0).
    set initial_psyncs [s 0 sync_partial_ok]
    set initial_syncs [s 0 sync_full]

    # We block psync on node 1, so the failover will fail
    $node_1 acl setuser default -psync

    # We pause the target long enough to send a write command
    # during the pause. This write will not be interrupted.
    pause_process [srv -1 pid]
    set rd [redis_deferring_client]
    $rd SET FOO BAR
    $node_0 failover to $node_1_host $node_1_port
    resume_process [srv -1 pid]

    # Wait for failover to end
    wait_for_condition 50 100 {
        [s 0 master_failover_state] eq "no-failover"
    } else {
        fail "Failover from node_0 to replica did not finish"
    }

    # The deferred write must have completed successfully.
    assert_equal [$rd read] "OK"
    $rd close

    # restore access to psync
    $node_1 acl setuser default +psync

    # We need to make sure the nodes actually sync back up
    wait_for_sync $node_1
    wait_for_sync $node_2

    assert_match *master* [$node_0 role]
    assert_match *slave* [$node_1 role]
    assert_match *slave* [$node_2 role]

    # We will cycle all of our replicas here and force a psync.
    assert_equal [expr {[s 0 sync_partial_ok] - $initial_psyncs}] 2
    assert_equal [expr {[s 0 sync_full] - $initial_syncs}] 0

    # Node 0's log must record the rejection exactly once.
    assert_equal [count_log_message 0 "Failover target rejected psync request"] 1
    assert_digests_match $node_0 $node_1 $node_2
}
}
}
}
# End of FAILOVER tests.