1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
|
import tests.pgautofailover_utils as pgautofailover
from nose.tools import raises, eq_
import time
import os.path
# Module-level handles shared by the tests in this file. They start as None:
# `cluster` is assigned in setup_module(), `monitor` in
# test_000_create_monitor(), and node1/node2/node3 in test_001, test_002 and
# test_003 respectively.
cluster = None
monitor = None
node1 = None
node2 = None
node3 = None
def setup_module():
    """Build the pgautofailover Cluster object used by every test below."""
    global cluster

    cluster = pgautofailover.Cluster()
def teardown_module():
    """Destroy the module-level cluster once the tests are done."""
    cluster.destroy()
def test_000_create_monitor():
    """Create the monitor, run it, and wait until its Postgres is running."""
    global monitor

    monitor_dir = "/tmp/multi_ifdown/monitor"
    monitor = cluster.create_monitor(monitor_dir)

    monitor.run()
    monitor.wait_until_pg_is_running()
def test_001_init_primary():
    """Create and run node1, then expect it to reach the "single" state."""
    global node1

    node1_dir = "/tmp/multi_ifdown/node1"
    node1 = cluster.create_datanode(node1_dir)

    node1.create()
    node1.run()

    reached_single = node1.wait_until_state(target_state="single")
    assert reached_single
def test_002_add_standby():
    """Add node2 and expect node2 "secondary" / node1 "primary"."""
    global node2

    node2 = cluster.create_datanode("/tmp/multi_ifdown/node2")
    node2.create()
    node2.run()

    assert node2.wait_until_pg_is_running()

    # the states are awaited in this exact order: standby first, then primary
    expected_states = ((node2, "secondary"), (node1, "primary"))
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    for node in (node1, node2):
        assert node.has_needed_replication_slots()

    # wait once more for node1 to be primary before moving to the next tests
    assert node1.wait_until_state(target_state="primary")
def test_003_add_standby():
    """Add node3 as a second standby and check number_sync_standbys is 1."""
    global node3

    node3 = cluster.create_datanode("/tmp/multi_ifdown/node3")
    node3.create()
    node3.run()

    # awaited in this exact order: new standby, older standby, primary
    expected_states = (
        (node3, "secondary"),
        (node2, "secondary"),
        (node1, "primary"),
    )
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    for node in (node1, node2, node3):
        assert node.has_needed_replication_slots()

    # with two standbys, number_sync_standbys is expected to be 1 now
    sync_standbys = node1.get_number_sync_standbys()
    assert sync_standbys == 1

    # wait once more for node1 to be primary before moving to the next tests
    assert node1.wait_until_state(target_state="primary")
def test_004_write_into_primary():
    """Create table t1 on node1, insert four rows, and read them back."""
    statements = (
        "CREATE TABLE t1(a int)",
        "INSERT INTO t1 VALUES (1), (2), (3), (4)",
        "CHECKPOINT",
    )
    for sql in statements:
        node1.run_sql_query(sql)

    rows = node1.run_sql_query("SELECT * FROM t1")
    assert rows == [(1,), (2,), (3,), (4,)]
def test_005_set_candidate_priorities():
    """Set candidate priorities so that node3 is the known failover candidate."""
    print()
    assert node1.wait_until_state(target_state="primary")

    # node1 (current primary) and node3 get 90; node2 gets 0 and is thus
    # not a candidate anymore, which leaves node3 as the known candidate
    priorities = ((node1, 90), (node2, 0), (node3, 90))
    for node, priority in priorities:
        node.set_candidate_priority(priority)

    # setting a candidate priority goes through apply_settings, then primary
    print()
    expected_states = (
        (node1, "primary"),
        (node2, "secondary"),
        (node3, "secondary"),
    )
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    # number_sync_standbys is unchanged and node2 is still in the quorum
    assert node1.get_number_sync_standbys() == 1
    assert node2.get_replication_quorum()

    # synchronous_standby_names is sorted by candidate priority, then name
    expected_ssn = "ANY 1 (pgautofailover_standby_3, pgautofailover_standby_2)"
    node1.check_synchronous_standby_names(expected_ssn)
def test_006_ifdown_node3():
    """Call ifdown() on node3; test_008 later calls ifup() to reconnect it."""
    node3.ifdown()
def test_007_insert_rows():
    """Insert 10000 rows on node1 and wait until node2 received that WAL."""
    insert_sql = (
        "INSERT INTO t1 SELECT x+10 FROM generate_series(1, 10000) as gs(x)"
    )
    node1.run_sql_query(insert_sql)
    node1.run_sql_query("CHECKPOINT")

    lsn1 = node1.run_sql_query("select pg_current_wal_lsn()")[0][0]
    print("%s " % lsn1, end="", flush=True)

    # node2 is sync and should get the WAL: poll its receive LSN once per
    # second until it equals the primary's current LSN.
    # NOTE(review): this loop has no upper bound on the number of polls.
    receive_lsn_sql = "select pg_last_wal_receive_lsn()"
    while True:
        lsn2 = node2.run_sql_query(receive_lsn_sql)[0][0]
        print("%s " % lsn2, end="", flush=True)

        if lsn2 == lsn1:
            break

        time.sleep(1)

    eq_(lsn1, lsn2)
def test_008_failover():
    """Fail node1, reconnect node3, and expect node3 to become primary."""
    print()
    print("Injecting failure of node1")
    node1.fail()

    # bring node3 back on the network so that it can reconnect
    print("Reconnecting node3 (ifconfig up)")
    node3.ifup()

    # the failover can now continue, with node3 fetching the missing WAL
    # bits from node2
    assert node3.wait_until_pg_is_running()
    assert node3.wait_until_state(target_state="wait_primary", timeout=120)
    assert node2.wait_until_state(target_state="secondary")

    # node2 has a candidate priority of 0, yet it can still be used for
    # node3 to reach primary
    assert node3.wait_until_state(target_state="primary")

    for node in (node3, node2):
        assert node.has_needed_replication_slots()

    # number_sync_standbys is still expected to be 1 on the new primary
    assert node3.get_number_sync_standbys() == 1

    expected_ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)"
    node3.check_synchronous_standby_names(ssn=expected_ssn)
def test_009_read_from_new_primary():
    """Check that node3 sees all 10004 rows inserted into t1 so far."""
    row_count = node3.run_sql_query("SELECT count(*) FROM t1")
    assert row_count == [(10004,)]
def test_010_start_node1_again():
    """Run node1 again and expect it to join as a secondary of node3."""
    node1.run()

    # awaited in this exact order
    expected_states = (
        (node1, "secondary"),
        (node2, "secondary"),
        (node3, "primary"),
    )
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    for node in (node1, node2, node3):
        assert node.has_needed_replication_slots()

    # node3 is back to primary: check that sync rep is set up again
    expected_ssn = "ANY 1 (pgautofailover_standby_1, pgautofailover_standby_2)"
    node3.check_synchronous_standby_names(expected_ssn)
# test_011_XXX, test_012_XXX, test_013_XXX, test_014_XXX and test_015_XXX
# are meant to test the scenario where the most advanced secondary
# becomes inaccessible at the same time as the primary
def test_011_prepare_candidate_priorities():
    """Give node2 the highest candidate priority (100) for the next failover."""
    # node2 is the node we are aiming to promote
    assert node2.set_candidate_priority(100)

    # the other nodes are already candidates, with a lower priority of 90
    for node in (node1, node3):
        assert node.get_candidate_priority() == 90
def test_012_prepare_replication_quorums():
    """Set number_sync_standbys to 0 and take node2 out of the quorum."""
    # this test needs one async node; to allow that, the number of sync
    # standbys is decremented first
    node3.set_number_sync_standbys(0)

    # making node2 async is the easy way to emulate a node that is behind
    # the others
    assert node2.set_replication_quorum("false")

    # the other two nodes should still be sync
    for node in (node1, node3):
        assert node.get_replication_quorum()
def test_013_secondary_gets_behind_primary():
    """Make node2 lag behind, then check the monitor knows node1's LSN.

    node2 is taken down with ifdown(), two rows are inserted on the primary
    (node3), and node1 is expected to see them. The monitor's reportedlsn
    for nodeid 1 is then polled, with a bounded number of retries, until it
    matches the LSN received by node1.
    """
    # make sure that node2 gets behind of the primary
    node2.ifdown()

    # primary ingests some data
    node3.run_sql_query("INSERT INTO t1 VALUES (5), (6)")
    node3.run_sql_query("CHECKPOINT")

    # ensure that the healthy secondary gets the change
    results = node1.run_sql_query("SELECT count(*) FROM t1")
    assert results == [(10006,)]

    lsn1 = node1.run_sql_query("select pg_last_wal_receive_lsn()")[0][0]
    print("%s " % lsn1, end="", flush=True)

    # ensure the monitor received this lsn
    node1.pg_autoctl.sighup()  # wake up from the 10s node_active delay
    time.sleep(1)

    q = "select reportedlsn from pgautofailover.node where nodeid = 1"
    lsn1m = monitor.run_sql_query(q)[0][0]
    print("%s " % lsn1m, end="", flush=True)

    # Poll at most 3 more times. The counter must be incremented here,
    # otherwise the `retry < 3` guard never trips and the test hangs when
    # the monitor never reports the expected LSN.
    retry = 0
    while lsn1 != lsn1m and retry < 3:
        time.sleep(1)
        lsn1m = monitor.run_sql_query(q)[0][0]
        print("%s " % lsn1m, end="", flush=True)
        retry += 1

    eq_(lsn1, lsn1m)
def test_014_secondary_reports_lsn():
    """Trigger a failover while only node2 is reachable; expect report_lsn.

    The primary and the most advanced secondary are made inaccessible and
    the failover candidate accessible, so node2 cannot fetch the missing WAL
    and stays blocked until the other secondary is back up.
    """
    expected_states = ((node1, "secondary"), (node3, "primary"))
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    node3.ifdown()  # primary
    node1.ifdown()  # most advanced standby
    node2.ifup()  # failover candidate

    print()
    print("Calling pgautofailover.failover() on the monitor")
    monitor.failover()

    # node2 reports its LSN while the other nodes are inaccessible
    reached_report_lsn = node2.wait_until_state(target_state="report_lsn")
    assert reached_report_lsn
def test_015_finalize_failover_after_most_advanced_secondary_gets_back():
    """Reconnect node1 and node3; expect node2 primary with all 10006 rows."""
    # once accessible again, both nodes should become secondaries
    node1.ifup()  # old most advanced secondary, now secondary
    node3.ifup()  # old primary, now secondary

    # awaited in this exact order
    expected_states = (
        (node1, "secondary"),
        (node3, "secondary"),
        (node2, "primary"),
    )
    for node, state in expected_states:
        assert node.wait_until_state(target_state=state)

    row_count = node2.run_sql_query("SELECT count(*) FROM t1")
    eq_(row_count, [(10006,)])
|