Skip to content

Commit acd7606

Browse files
committed
test: introduce tests for skipping unhealthy replicas in callbro
This commit adds tests that check whether frozen replicas and masters are properly skipped in `callbro` requests. Follow-up #505 NO_DOC=test
1 parent 0cecc25 commit acd7606

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed

test/router-luatest/router_3_3_test.lua

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
local t = require('luatest')
22
local vtest = require('test.luatest_helpers.vtest')
3+
local vutil = require('vshard.util')
34
local wait_timeout = vtest.wait_timeout
45

56
local g = t.group()
@@ -59,6 +60,14 @@ g.before_all(function(g)
5960
})
6061
end)
6162
vtest.cluster_wait_fullsync(g)
63+
g.router:exec(function()
64+
-- Wait for buckets to be resolved and for a rs.replica to be assigned.
65+
ilt.helpers.retrying({timeout = iwait_timeout}, function()
66+
ivshard.router.discovery_wakeup()
67+
_G.failover_wakeup()
68+
ilt.assert_equals(#ivshard.router.info().alerts, 0)
69+
end)
70+
end)
6271
end)
6372

6473
g.after_all(function(g)
@@ -532,3 +541,106 @@ end
532541
-- Runs the shared failover health check scenario with `true` as its second
-- argument (presumably the "auto master" switch - confirm against
-- failover_health_check()).
function g.test_failover_health_check_auto_master(g)
    failover_health_check(g, true)
end
544+
545+
--
546+
-- gh-505: test that a frozen replica is skipped automatically if the
547+
-- ping managed to fail before user request.
548+
--
549+
-- Scenario: `replica` gets frozen, the router's failover service notices the
-- failed ping, and after that `callbro` must not be routed to the instance.
-- Once the instance is thawed it has to become usable again.
--
-- @param g luatest group with the cluster and the router.
-- @param replica server object of the instance to freeze.
local function test_callbro_with_freezed_replica_after_ping(g, replica)
    -- Older Tarantool versions (< 2.4.1) have no freeze()/thaw().
    local can_freeze = vutil.version_is_at_least(2, 4, 1, nil, 0, 0)
    t.run_only_if(can_freeze)

    -- Gather everything needed from the instance before it stops responding.
    local uuid = replica:instance_uuid()
    local rs_uuid = replica:replicaset_uuid()
    local bid = vtest.storage_first_bucket(replica)
    replica:freeze()

    -- A failed ping makes the router recreate the connection. The new one
    -- is not reported as `connected`, so the replica is skipped without even
    -- reaching `replicaset_check_replica_health()`.
    g.router:exec(function(instance_uuid, replicaset_uuid)
        local router = ivshard.router.internal.static_router
        local node = router.replicasets[replicaset_uuid].replicas[instance_uuid]
        local service = node.worker.services['replica_failover']
        local wait_opts = {
            -- Keep kicking the failover service while waiting, so the ping
            -- is retried without waiting for its regular interval.
            on_yield = function()
                node.worker:wakeup_service('replica_failover')
            end,
        }
        ivtest.wait_for_not_nil(service.data, 'info', wait_opts)
        ivtest.service_wait_for_error(service.data.info, 'Ping error',
                                      wait_opts)
        ilt.assert_not(node:is_connected())
    end, {uuid, rs_uuid})

    -- The frozen instance must not serve the request.
    router_test_callbro(g, bid, {uuid})

    -- Bring the instance back.
    replica:thaw()
    -- Failover only checks that the replica can be pinged. The thawed
    -- instance may lag a lot (or be a master that was dead for a while),
    -- hence the explicit wait for full synchronization.
    vtest.cluster_wait_fullsync(g)
    router_wait_failover_new_ok(g, rs_uuid)
    router_test_callbro(g, bid, {})
end
586+
587+
-- Failed-ping scenario executed against the instance `g.replica_1_b`.
function g.test_callbro_with_freezed_replica_after_ping(g)
    local target = g.replica_1_b
    test_callbro_with_freezed_replica_after_ping(g, target)
end
590+
591+
-- Failed-ping scenario executed against `g.replica_1_a` (presumably the
-- master of the replicaset, judging by the test name - confirm in the
-- cluster config).
function g.test_callbro_with_freezed_master_after_ping(g)
    local target = g.replica_1_a
    test_callbro_with_freezed_replica_after_ping(g, target)
end
594+
595+
--
596+
-- gh-505: test that a frozen replica is skipped after
597+
-- failover_sequential_fail_count failed requests.
598+
--
599+
-- Scenario: failover is paused, so the router learns that `replica` is
-- unhealthy only from failed user requests. With
-- `failover_sequential_fail_count = 1` a single failed `callbro` must be
-- enough to make the router skip the frozen instance afterwards.
--
-- @param g luatest group with the cluster and the router.
-- @param replica server object of the instance to freeze.
local function test_callbro_with_freezed_replica_without_ping(g, replica)
    -- freeze() and thaw() are available only since Tarantool 2.4.1.
    t.run_only_if(vutil.version_is_at_least(2, 4, 1, nil, 0, 0))

    -- Block failover, so that pings are not sent at all.
    g.router:eval('_G.failover_pause()')
    -- Set the maximum number of failed requests to 1.
    local new_global_cfg = table.deepcopy(global_cfg)
    new_global_cfg.failover_sequential_fail_count = 1
    vtest.router_cfg(g.router, new_global_cfg)

    -- Freeze the replica. Its identifiers and bucket are fetched first,
    -- while the instance still responds.
    local uuid = replica:instance_uuid()
    local rs_uuid = replica:replicaset_uuid()
    local bid = vtest.storage_first_bucket(replica)
    replica:freeze()

    -- Make several requests: exactly one of them is expected to fail, the
    -- rest are expected to succeed.
    g.router:exec(function(bid)
        local errors = 0
        for _ = 1, 3 do
            -- callbro() takes (bucket_id, func, args, opts). The timeout is
            -- an option, so it has to be the 4th argument; passing it 3rd
            -- would turn it into the argument list of 'get_uuid'.
            local res = ivshard.router.callbro(bid, 'get_uuid', {},
                                               {timeout = 0.5})
            if not res then
                errors = errors + 1
            end
        end
        ilt.assert_equals(errors, 1)
    end, {bid})

    -- From now on requests don't fail.
    router_test_callbro(g, bid, {uuid})

    -- Restore: thaw the instance, wait until it is usable again, then bring
    -- failover and the router config back to their initial state.
    replica:thaw()
    vtest.cluster_wait_fullsync(g)
    router_wait_failover_new_ok(g, rs_uuid)
    router_test_callbro(g, bid, {})
    g.router:eval('_G.failover_continue()')
    vtest.router_cfg(g.router, global_cfg)
end
639+
640+
-- Paused-failover scenario executed against the instance `g.replica_1_c`.
function g.test_callbro_with_freezed_replica_without_ping(g)
    local target = g.replica_1_c
    test_callbro_with_freezed_replica_without_ping(g, target)
end
643+
644+
-- Paused-failover scenario executed against `g.replica_1_a` (presumably the
-- master of the replicaset, judging by the test name - confirm in the
-- cluster config).
function g.test_callbro_with_freezed_master_without_ping(g)
    local target = g.replica_1_a
    test_callbro_with_freezed_replica_without_ping(g, target)
end

0 commit comments

Comments
 (0)