Skip to content

Commit 0cecc25

Browse files
committed
test: fix flaky router-luatest/router_3_3_test
This commit fixes the flakiness of the router_3_3_test. There were a lot of different fails, but in general they all are related to one problem: the tests didn't wait for failover to end properly, which caused skip of replicas in `callbro` request. In order to wait for failover to end we need to fix the statuses, which were incorrectly updated in the service itself. Closes #534 NO_DOC=testfix
1 parent 5faf050 commit 0cecc25

File tree

2 files changed

+37
-22
lines changed

2 files changed

+37
-22
lines changed

test/router-luatest/router_3_3_test.lua

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,32 @@ local function router_wait_prioritized(g, replica)
112112
end)
113113
end
114114

115+
local function router_wait_failover_new_ok(g, rs_uuid)
116+
g.router:exec(function(rs_uuid)
117+
local router = ivshard.router.internal.static_router
118+
local rs = router.replicasets[rs_uuid]
119+
120+
-- Wait for successful pings. No need to wait for new ones,
121+
-- any good ping is enough.
122+
for _, r in pairs(rs.replicas) do
123+
local s = r.worker.services['replica_failover']
124+
local opts = {on_yield = function()
125+
r.worker:wakeup_service('replica_failover')
126+
end}
127+
ivtest.wait_for_not_nil(s.data, 'info', opts)
128+
ivtest.service_wait_for_new_ok(s.data.info, opts)
129+
end
130+
131+
-- Wait for prioritized replica change if needed.
132+
local s = rs.worker.services['replicaset_failover']
133+
local opts = {on_yield = function()
134+
rs.worker:wakeup_service('replicaset_failover')
135+
end}
136+
ivtest.wait_for_not_nil(s.data, 'info', opts)
137+
ivtest.service_wait_for_new_ok(s.data.info, opts)
138+
end, {rs_uuid})
139+
end
140+
115141
local function router_test_callbro(g, bid, skip_uuids)
116142
g.router:exec(function(bid, skip)
117143
local uuid = ivshard.router.callbro(bid, 'get_uuid')
@@ -214,6 +240,7 @@ local function failover_health_check_broken_upstream(g)
214240
g.replica_1_a:exec(function()
215241
box.space.test:truncate()
216242
end)
243+
router_wait_failover_new_ok(g, g.replica_1_c:replicaset_uuid())
217244
end
218245

219246
local function failover_health_check_broken_upstream_not_switch(g)
@@ -264,6 +291,7 @@ local function failover_health_check_broken_upstream_not_switch(g)
264291
g.replica_1_a:exec(function()
265292
box.space.test:truncate()
266293
end)
294+
router_wait_failover_new_ok(g, g.replica_1_c:replicaset_uuid())
267295
end
268296

269297
local function failover_health_check_big_lag(g)
@@ -325,6 +353,7 @@ local function failover_health_check_small_failover_timeout(g)
325353
-- Since all nodes are broken, prioritized replica is not changed.
326354
router_assert_prioritized(g, g.replica_1_c)
327355
vtest.router_cfg(g.router, old_cfg)
356+
router_wait_failover_new_ok(g, g.replica_1_c:replicaset_uuid())
328357
end
329358

330359
local function failover_health_check_master_down(g)
@@ -335,21 +364,6 @@ local function failover_health_check_master_down(g)
335364
g.replica_1_a:stop()
336365

337366
local rs_uuid = g.replica_1_c:replicaset_uuid()
338-
local instance_uuid = g.replica_1_c:instance_uuid()
339-
g.router:exec(function(rs_uuid, uuid)
340-
rawset(_G, 'wait_for_failover', function(rs_uuid, uuid)
341-
local router = ivshard.router.internal.static_router
342-
local rs = router.replicasets[rs_uuid]
343-
local r = rs.replicas[uuid]
344-
local s = r.worker.services['replica_failover']
345-
local opts = {on_yield = function()
346-
r.worker:wakeup_service('replica_failover')
347-
end}
348-
ivtest.wait_for_not_nil(s.data, 'info', opts)
349-
ivtest.service_wait_for_new_ok(s.data.info, opts)
350-
end)
351-
_G.wait_for_failover(rs_uuid, uuid)
352-
end, {rs_uuid, instance_uuid})
353367
-- Since all nodes are broken, prioritized replica is not changed.
354368
router_assert_prioritized(g, g.replica_1_c)
355369
g.replica_1_a:start()
@@ -358,10 +372,7 @@ local function failover_health_check_master_down(g)
358372
end, {old_cfg})
359373
router_wait_prioritized(g, g.replica_1_c)
360374
-- Restore connection
361-
g.router:exec(function(rs_uuid, uuid)
362-
_G.wait_for_failover(rs_uuid, uuid)
363-
_G.wait_for_failover = nil
364-
end, {rs_uuid, g.replica_1_a:instance_uuid()})
375+
router_wait_failover_new_ok(g, rs_uuid)
365376
end
366377

367378
local function failover_health_check_missing_master(g)
@@ -386,6 +397,7 @@ local function failover_health_check_missing_master(g)
386397
end)
387398

388399
-- Not changed. Not enough info.
400+
router_wait_failover_new_ok(g, g.replica_1_c:replicaset_uuid())
389401
router_assert_prioritized(g, g.replica_1_c)
390402
router_test_callbro(g, vtest.storage_first_bucket(g.replica_1_b), {})
391403
g.replica_1_c:exec(function(cfg)
@@ -441,7 +453,7 @@ end
441453
-- earlier than storages.
442454
--
443455
local function failover_health_check_missing_health(g)
444-
router_wait_prioritized(g, g.replica_1_c)
456+
router_assert_prioritized(g, g.replica_1_c)
445457
g.replica_1_c:exec(function()
446458
rawset(_G, 'old_call', ivshard.storage._call)
447459
ivshard.storage._call = function(...)

vshard/replicaset.lua

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1769,6 +1769,7 @@ local function replica_failover_service_step(replica, data)
17691769
if not replica.conn or replica.down_ts ~= nil then
17701770
-- Nothing to ping. Connection is either dead or missing.
17711771
data.info:set_activity('idling')
1772+
data.info:set_status_error('Connection is down or missing')
17721773
return replica.failover_interval
17731774
end
17741775
local opts = {timeout = replica.failover_ping_timeout}
@@ -1884,9 +1885,11 @@ local function replicaset_failover_service_step(replicaset, data)
18841885
'Error during failovering: %s',
18851886
lerror.make(replica_is_changed)))
18861887
replica_is_changed = true
1887-
elseif not data.prev_was_ok then
1888-
log.info('All replicas are ok')
1888+
else
18891889
data.info:set_status_ok()
1890+
if not data.prev_was_ok then
1891+
log.info('All replicas are ok')
1892+
end
18901893
end
18911894
data.prev_was_ok = not replica_is_changed
18921895
local logf

0 commit comments

Comments
 (0)