|
1 | 1 | local t = require('luatest') |
2 | 2 | local vtest = require('test.luatest_helpers.vtest') |
| 3 | +local vutil = require('vshard.util') |
3 | 4 | local wait_timeout = vtest.wait_timeout |
4 | 5 |
|
5 | 6 | local g = t.group() |
@@ -59,6 +60,14 @@ g.before_all(function(g) |
59 | 60 | }) |
60 | 61 | end) |
61 | 62 | vtest.cluster_wait_fullsync(g) |
| 63 | + g.router:exec(function() |
| 64 | + -- Wait for buckets to be resolved and for a rs.replica to be assigned. |
| 65 | + ilt.helpers.retrying({timeout = iwait_timeout}, function() |
| 66 | + ivshard.router.discovery_wakeup() |
| 67 | + _G.failover_wakeup() |
| 68 | + ilt.assert_equals(#ivshard.router.info().alerts, 0) |
| 69 | + end) |
| 70 | + end) |
62 | 71 | end) |
63 | 72 |
|
64 | 73 | g.after_all(function(g) |
@@ -532,3 +541,106 @@ end |
-- Runs the shared failover health check scenario with its second argument
-- set to true (the flag selected by this "auto master" variant).
g.test_failover_health_check_auto_master = function(g)
    local auto_master = true
    failover_health_check(g, auto_master)
end
| 544 | + |
--
-- gh-505: check that a frozen replica is skipped automatically when the
-- failover ping manages to fail before the user request is made.
--
-- @param g        Test group; `g.router` is the router under test.
-- @param replica  Storage instance which is frozen during the test.
--
local function test_callbro_with_freezed_replica_after_ping(g, replica)
    -- freeze() and thaw() exist only since Tarantool 2.4.1.
    local can_freeze = vutil.version_is_at_least(2, 4, 1, nil, 0, 0)
    t.run_only_if(can_freeze)

    -- Remember the identifiers first, then freeze the instance.
    local instance_uuid = replica:instance_uuid()
    local replicaset_uuid = replica:replicaset_uuid()
    local bucket_id = vtest.storage_first_bucket(replica)
    replica:freeze()

    -- Wait until a ping request fails. The failure recreates the
    -- connection, which is not reported as `connected`, so the replica is
    -- skipped even without entering `replicaset_check_replica_health()`.
    local function wait_for_ping_error(uuid, rs_uuid)
        local router = ivshard.router.internal.static_router
        local target = router.replicasets[rs_uuid].replicas[uuid]
        local svc = target.worker.services['replica_failover']
        -- Keep waking the failover service up on every yield of the wait.
        local function kick_failover()
            target.worker:wakeup_service('replica_failover')
        end
        local wait_opts = {on_yield = kick_failover}
        ivtest.wait_for_not_nil(svc.data, 'info', wait_opts)
        ivtest.service_wait_for_error(svc.data.info, 'Ping error',
                                      wait_opts)
        ilt.assert_not(target:is_connected())
    end
    g.router:exec(wait_for_ping_error, {instance_uuid, replicaset_uuid})

    -- The frozen instance is skipped while serving the request.
    router_test_callbro(g, bucket_id, {instance_uuid})

    -- Restore. The fullsync wait is required, because the instance may
    -- have accumulated a too big lag, or it may be the master, which was
    -- dead for some time. Failover only checks that it can be pinged.
    replica:thaw()
    vtest.cluster_wait_fullsync(g)
    router_wait_failover_new_ok(g, replicaset_uuid)
    router_test_callbro(g, bucket_id, {})
end
| 586 | + |
-- gh-505: the "after ping" scenario with `g.replica_1_b` being frozen.
g.test_callbro_with_freezed_replica_after_ping = function(g)
    local frozen = g.replica_1_b
    test_callbro_with_freezed_replica_after_ping(g, frozen)
end
| 590 | + |
-- gh-505: the "after ping" scenario with `g.replica_1_a` being frozen
-- (presumably the master, judging by the test name).
g.test_callbro_with_freezed_master_after_ping = function(g)
    local frozen = g.replica_1_a
    test_callbro_with_freezed_replica_after_ping(g, frozen)
end
| 594 | + |
--
-- gh-505: check that a frozen replica is skipped after
-- failover_sequential_fail_count failed requests.
--
-- @param g        Test group; `g.router` is the router under test.
-- @param replica  Storage instance which is frozen during the test.
--
local function test_callbro_with_freezed_replica_without_ping(g, replica)
    -- freeze() and thaw() exist only since Tarantool 2.4.1.
    local can_freeze = vutil.version_is_at_least(2, 4, 1, nil, 0, 0)
    t.run_only_if(can_freeze)

    -- Pause failover, so that no pings are sent at all.
    g.router:eval('_G.failover_pause()')
    -- Allow at most one failed request in a row.
    local single_fail_cfg = table.deepcopy(global_cfg)
    single_fail_cfg.failover_sequential_fail_count = 1
    vtest.router_cfg(g.router, single_fail_cfg)

    -- Remember the identifiers first, then freeze the instance.
    local instance_uuid = replica:instance_uuid()
    local replicaset_uuid = replica:replicaset_uuid()
    local bucket_id = vtest.storage_first_bucket(replica)
    replica:freeze()

    -- Issue three requests; exactly one of them is expected to fail.
    local function count_failed_calls(bid)
        local failed = 0
        local attempts_left = 3
        while attempts_left > 0 do
            local ok = ivshard.router.callbro(bid, 'get_uuid', {timeout = 0.5})
            -- A nil/false first result counts as a failed request.
            failed = failed + (ok and 0 or 1)
            attempts_left = attempts_left - 1
        end
        ilt.assert_equals(failed, 1)
    end
    g.router:exec(count_failed_calls, {bucket_id})

    -- From now on the requests do not fail.
    router_test_callbro(g, bucket_id, {instance_uuid})

    -- Restore the instance, the failover and the original configuration.
    replica:thaw()
    vtest.cluster_wait_fullsync(g)
    router_wait_failover_new_ok(g, replicaset_uuid)
    router_test_callbro(g, bucket_id, {})
    g.router:eval('_G.failover_continue()')
    vtest.router_cfg(g.router, global_cfg)
end
| 639 | + |
-- gh-505: the "without ping" scenario with `g.replica_1_c` being frozen.
g.test_callbro_with_freezed_replica_without_ping = function(g)
    local frozen = g.replica_1_c
    test_callbro_with_freezed_replica_without_ping(g, frozen)
end
| 643 | + |
-- gh-505: the "without ping" scenario with `g.replica_1_a` being frozen
-- (presumably the master, judging by the test name).
g.test_callbro_with_freezed_master_without_ping = function(g)
    local frozen = g.replica_1_a
    test_callbro_with_freezed_replica_without_ping(g, frozen)
end
0 commit comments