File: rpl_parallel_optimistic_error_stop.test

package info (click to toggle)
mariadb 1%3A11.8.3-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 772,520 kB
  • sloc: ansic: 2,414,714; cpp: 1,791,394; asm: 381,336; perl: 62,905; sh: 49,647; pascal: 40,897; java: 39,363; python: 20,791; yacc: 20,432; sql: 17,907; xml: 12,344; ruby: 8,544; cs: 6,542; makefile: 6,145; ada: 1,879; lex: 1,193; javascript: 996; objc: 80; tcl: 73; awk: 46; php: 22
file content (180 lines) | stat: -rw-r--r-- 6,899 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# Requirements: InnoDB, plus a debug build with DEBUG_SYNC support, because
# the test uses a debug_dbug keyword and a DEBUG_SYNC signal further below.
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
# Topology: server_1 is the master, server_2 is its slave.
--let $rpl_topology=1->2
--source include/rpl_init.inc

# Suppress the slave errors that this test provokes on purpose.
call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit");
call mtr.add_suppression("Slave: Duplicate entry '99'");

# Master: create the test table and the four rows that the workload below
# updates. The trailing comments name the role each row plays later.
--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES(1,1);  # hit a dup entry on slave
INSERT INTO t1 VALUES(2,1);  # races to "win" the last exit
INSERT INTO t1 VALUES(3,1);
INSERT INTO t1 VALUES(4,1);  # make W3 race over W1
--save_master_pos

# Slave: catch up with the initial data, then stop replication and switch to
# optimistic parallel mode with 4 worker threads. The previous values of the
# changed globals are saved in user variables and restored in the clean-up.
--connection server_2
--sync_with_master
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
--source include/stop_slave.inc
SET @old_debug_dbug = @@global.debug_dbug;
# In a group of W1,W2,W3 of the same batch, W2 simulates slowness.
SET @@global.debug_dbug = "d,hold_worker2_favor_worker3";
SET GLOBAL slave_parallel_threads=4;
CHANGE MASTER TO master_use_gtid=slave_pos;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
SET GLOBAL slave_parallel_mode='optimistic';

# MDEV-30780: optimistic parallel slave hangs after hitting an error.
# Test the worker-hang scenario to prove that neither the hang nor the
# out-of-order access to the active gco list happens any more.
#
# The test also shows how to reproduce the problem on the OLD version;
# that branch is disabled (0) by default.
# It approximates the original hang with an assert that confirms
# the OLD version could indeed access an already reclaimed gco.
--let $old_version_regression=0


--connection server_1

# Let W1,W2,W3,W4 be the parallel workers that are going to execute
#    the following transactions, in this order.
# W1 is held on the 1st statement,
#    then with the 2nd statement it forces W3 into a retry,
#    and finally with the 3rd statement it hits a dup entry on the slave.
SET @@gtid_seq_no = 2001;
BEGIN;
  UPDATE t1 SET b = 11  WHERE a = 4;
  UPDATE t1 SET b = 11  WHERE a = 3;
  UPDATE t1 SET a = 99  WHERE a = 1;
COMMIT;
# In the buggy version W2 races to "win" the last exit (among W1..W3)
# and thereby is the last to access a gco struct that was garbage-collected.
UPDATE t1 SET b = 2 WHERE a = 2;
# W3 garbage-collects the gco struct in the buggy version.
UPDATE t1 SET b = 3 WHERE a = 3;
# W4 resides in the following "singleton" batch, as a replacement for W2
# in the buggy version, to allow W3 to reclaim the batch's gco.
DROP TABLE IF EXISTS phantom_1;

# Remember the master GTID position; the slave syncs to it near the end of
# the test, after the provoked error has been repaired.
--source include/save_master_gtid.inc

# Open four local connections to the slave. Each one starts a transaction and
# leaves it open; the rows touched are the same rows the replicated events
# touch, so the workers presumably block on them until the matching
# rollback/commit later in the test releases them.

# slave_local_0: same row (a = 4) as the 1st statement of W1's transaction.
--connect (slave_local_0, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
begin;
  UPDATE t1 set b = 11 where a = 4;
# slave_local_1: inserts key 99, the value that W1's 3rd statement assigns
# to column a; once committed it produces the duplicate entry for W1.
--connect (slave_local_1, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
begin;
  INSERT INTO t1 VALUES (99, 11);

# slave_local_2: same row (a = 2) as W2's update.
--connect (slave_local_2, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
begin;
  UPDATE t1 SET b = 12 WHERE a = 2;

# slave_local_3: same row (a = 3) as W3's update.
--connect (slave_local_3, 127.0.0.1, root,, test, $SLAVE_MYPORT,)
begin;
  UPDATE t1 SET b = 13 WHERE a = 3;

# With all four local transactions open, start replication on the slave.
--connection server_2
--source include/start_slave.inc

--echo # W4 is waiting to start its DROP

# Poll the processlist until exactly one thread reports the state above.
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to start commit%"
--source include/wait_condition.inc

--connection slave_local_3
# Release the local transaction on a = 3 to make W3 set E.cc <- 1,
# then wait until one thread is waiting for the prior transaction to commit.
  rollback;
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit%"
--source include/wait_condition.inc

--connection slave_local_0
# Force W3 into a retry and delay it to let W1 hit a duplicate error first,
# see 'commit' by slave_local_1.
  rollback;
# Wait until one thread is parked on a debug sync point, then record in the
# result file that no thread is left in the waiting-for-prior-commit state.
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state LIKE "debug sync point: now"
--source include/wait_condition.inc
SELECT count(*) = 0 as "W3 undid its commit state" FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit%";


--connection slave_local_2
  rollback;
# Wait for W2 to start committing, E.cc <- 2.
--let $wait_condition= SELECT count(*) = 1 FROM information_schema.processlist WHERE state like "Waiting for prior transaction to commit"
--source include/wait_condition.inc

--connection slave_local_1

# Committing the local INSERT of key 99 makes W1 error out, which is expected
#  A. to alert W3;
#  B. W3 will *not* wake up W4 in the fixed version, having to wait for W2's demise;
#  C. W2 will notify W3, which then releases W4 as it would do in normal cases.
commit;

if (!$old_version_regression)
{
# Default branch: verifies the behavior of the fixed version.
# A. In the fixed version the processlist shows that W4 is still in the
#    ordered wait. The SQL thread status is echoed into the result file too.
SELECT COUNT(*) = 1 as "W4 remains with the same status" FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to start commit%";
--let $status= query_get_value("show slave status", Slave_SQL_Running, 1)
--echo #  Slave_SQL_Running YES = $status

# B. In the fixed version W3 is waiting for W2,...
# NOTE(review): the column alias below reads "W4 is waiting" although this
# comment attributes the matched state to W3 - confirm which one is intended.
--let $wait_condition= SELECT count(*) = 1 as "W4 is waiting" FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit%"
--source include/wait_condition.inc
--echo # while W2 is held back ...
--let $wait_condition= SELECT count(*) = 1 as "W2 simulates slowness" FROM information_schema.processlist WHERE state LIKE "debug sync point: now"
--source include/wait_condition.inc

# C. ...until NOW: send the signal that lets the held worker continue.
SET DEBUG_SYNC = 'now SIGNAL cont_worker2';

}

# To reproduce the hang on the OLD version ...
# (this branch runs only when $old_version_regression is set to non-zero)
if ($old_version_regression)
{
  # Replace the fixed-version block with checks that W3 and W4 have actually
  # moved on, followed by signaling W2 as if on behalf of W4, which would end
  # up in the hang.
  # NOTE(review): unlike the other filters on this state, the pattern below has
  # no trailing % - confirm that an exact match is intended.
  --let $wait_condition= SELECT COUNT(*) = 0 as "W4 has moved on" FROM information_schema.processlist WHERE state like "Waiting for prior transaction to start commit"
  --source include/wait_condition.inc
  --let $wait_condition= SELECT count(*) = 0 as "W3 does not wait on W2" FROM information_schema.processlist WHERE state LIKE "Waiting for prior transaction to commit%"
--source include/wait_condition.inc

  # One thread must still be parked on the debug sync point.
  --let $wait_condition= SELECT count(*) = 1 as "W2 simulates slowness" FROM information_schema.processlist WHERE state LIKE "debug sync point: now"
  --source include/wait_condition.inc

  # Like in the fixed-version block, but the signal is sent after W4 is done,
  # which violates the commit order and must fire a debug assert.
  SET DEBUG_SYNC = 'now SIGNAL cont_worker2';
}

# The slave SQL thread is expected to stop with error 1062, the duplicate
# entry provoked for key 99.
--let $slave_sql_errno= 1062
--source include/wait_for_slave_sql_error.inc

# Restore the slave data and resume replication.
# The DELETE runs on the slave through the still-current slave_local_1
# connection and removes the row that caused the duplicate entry.
DELETE FROM t1 WHERE a=99;
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc

#
# Clean up.
#
# Restore the slave globals saved at the beginning of the test; the slave is
# stopped while they are changed and restarted afterwards.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_mode=@old_parallel_mode;
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
SET @@global.debug_dbug = @old_debug_dbug;
SET debug_sync = RESET;
--source include/start_slave.inc

# Drop the test table on the master and let the slave replicate the drop.
--connection server_1
DROP TABLE t1;
--source include/save_master_gtid.inc

--connection server_2
--source include/sync_with_master_gtid.inc

--source include/rpl_end.inc