Skip to content

Commit ec228f8

Browse files
authored
redis: fix segfault at cluster removing (#40488)
This fixes #38585 again. --------- Signed-off-by: Chanhun Jeong <[email protected]>
1 parent 6ba543a commit ec228f8

File tree

4 files changed

+87
-10
lines changed

4 files changed

+87
-10
lines changed

source/extensions/clusters/redis/redis_cluster.cc

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,7 @@ RedisCluster::RedisCluster(
7474
context.serverFactoryContext().mainThreadDispatcher(),
7575
context.serverFactoryContext().clusterManager(),
7676
context.serverFactoryContext().api().timeSource())),
77-
registration_handle_(refresh_manager_->registerCluster(
78-
cluster_name_, redirect_refresh_interval_, redirect_refresh_threshold_,
79-
failure_refresh_threshold_, host_degraded_refresh_threshold_, [&]() {
80-
redis_discovery_session_->resolve_timer_->enableTimer(std::chrono::milliseconds(0));
81-
})) {
77+
registration_handle_(nullptr) {
8278
const auto& locality_lb_endpoints = load_assignment_.endpoints();
8379
for (const auto& locality_lb_endpoint : locality_lb_endpoints) {
8480
for (const auto& lb_endpoint : locality_lb_endpoint.lb_endpoints()) {
@@ -87,6 +83,32 @@ RedisCluster::RedisCluster(
8783
*this, host.socket_address().address(), host.socket_address().port_value()));
8884
}
8985
}
86+
87+
// Register the cluster callback using weak_ptr to avoid use-after-free
88+
std::weak_ptr<RedisDiscoverySession> weak_session = redis_discovery_session_;
89+
registration_handle_ = refresh_manager_->registerCluster(
90+
cluster_name_, redirect_refresh_interval_, redirect_refresh_threshold_,
91+
failure_refresh_threshold_, host_degraded_refresh_threshold_, [weak_session]() {
92+
// Try to lock the weak pointer to ensure the session is still alive
93+
auto session = weak_session.lock();
94+
if (session && session->resolve_timer_) {
95+
session->resolve_timer_->enableTimer(std::chrono::milliseconds(0));
96+
}
97+
});
98+
}
99+
100+
RedisCluster::~RedisCluster() {
101+
// Set flag to prevent any callbacks from executing during destruction
102+
is_destroying_.store(true);
103+
104+
// Reset redis_discovery_session_ before other members are destroyed
105+
// to ensure any pending callbacks from refresh_manager_ don't access it.
106+
// This matches the approach in PR #39625.
107+
redis_discovery_session_.reset();
108+
109+
// Also clear DNS discovery targets to prevent their callbacks from
110+
// accessing the destroyed cluster.
111+
dns_discovery_resolve_targets_.clear();
90112
}
91113

92114
void RedisCluster::startPreInit() {
@@ -198,7 +220,7 @@ RedisCluster::DnsDiscoveryResolveTarget::~DnsDiscoveryResolveTarget() {
198220
active_query_->cancel(Network::ActiveDnsQuery::CancelReason::QueryAbandoned);
199221
}
200222
// Disable timer for mock tests.
201-
if (resolve_timer_) {
223+
if (resolve_timer_ && resolve_timer_->enabled()) {
202224
resolve_timer_->disableTimer();
203225
}
204226
}
@@ -220,8 +242,13 @@ void RedisCluster::DnsDiscoveryResolveTarget::startResolveDns() {
220242
}
221243

222244
if (!resolve_timer_) {
223-
resolve_timer_ =
224-
parent_.dispatcher_.createTimer([this]() -> void { startResolveDns(); });
245+
resolve_timer_ = parent_.dispatcher_.createTimer([this]() -> void {
246+
// Check if the parent cluster is being destroyed
247+
if (parent_.is_destroying_.load()) {
248+
return;
249+
}
250+
startResolveDns();
251+
});
225252
}
226253
// if the initial dns resolved to empty, we'll skip the redis discovery phase and
227254
// treat it as an empty cluster.
@@ -244,7 +271,13 @@ RedisCluster::RedisDiscoverySession::RedisDiscoverySession(
244271
Envoy::Extensions::Clusters::Redis::RedisCluster& parent,
245272
NetworkFilters::Common::Redis::Client::ClientFactory& client_factory)
246273
: parent_(parent), dispatcher_(parent.dispatcher_),
247-
resolve_timer_(parent.dispatcher_.createTimer([this]() -> void { startResolveRedis(); })),
274+
resolve_timer_(parent.dispatcher_.createTimer([this]() -> void {
275+
// Check if the parent cluster is being destroyed
276+
if (parent_.is_destroying_.load()) {
277+
return;
278+
}
279+
startResolveRedis();
280+
})),
248281
client_factory_(client_factory), buffer_timeout_(0),
249282
redis_command_stats_(
250283
NetworkFilters::Common::Redis::RedisCommandStats::createRedisCommandStats(

source/extensions/clusters/redis/redis_cluster.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ namespace Redis {
9090

9191
class RedisCluster : public Upstream::BaseDynamicClusterImpl {
9292
public:
93+
~RedisCluster();
9394
static absl::StatusOr<std::unique_ptr<RedisCluster>>
9495
create(const envoy::config::cluster::v3::Cluster& cluster,
9596
const envoy::extensions::clusters::redis::v3::RedisClusterConfig& redis_cluster,
@@ -302,7 +303,10 @@ class RedisCluster : public Upstream::BaseDynamicClusterImpl {
302303
const std::string auth_password_;
303304
const std::string cluster_name_;
304305
const Common::Redis::ClusterRefreshManagerSharedPtr refresh_manager_;
305-
const Common::Redis::ClusterRefreshManager::HandlePtr registration_handle_;
306+
Common::Redis::ClusterRefreshManager::HandlePtr registration_handle_;
307+
308+
// Flag to prevent callbacks during destruction
309+
std::atomic<bool> is_destroying_{false};
306310
};
307311

308312
class RedisClusterFactory : public Upstream::ConfigurableClusterFactoryBase<

test/extensions/clusters/redis/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ envoy_extension_cc_test(
3131
"//source/server:transport_socket_config_lib",
3232
"//test/common/upstream:utility_lib",
3333
"//test/extensions/clusters/redis:redis_cluster_mocks",
34+
"//test/extensions/common/redis:mocks_lib",
3435
"//test/extensions/filters/network/common/redis:redis_mocks",
3536
"//test/extensions/filters/network/common/redis:test_utils_lib",
3637
"//test/extensions/filters/network/redis_proxy:redis_mocks",

test/extensions/clusters/redis/redis_cluster_test.cc

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "test/common/upstream/utility.h"
2020
#include "test/extensions/clusters/redis/mocks.h"
21+
#include "test/extensions/common/redis/mocks.h"
2122
#include "test/extensions/filters/network/common/redis/mocks.h"
2223
#include "test/mocks/common.h"
2324
#include "test/mocks/protobuf/mocks.h"
@@ -1487,6 +1488,44 @@ TEST_F(RedisClusterTest, HostRemovalAfterHcFail) {
14871488
*/
14881489
}
14891490

1491+
// Test that verifies cluster destruction does not cause segfault when refresh manager
1492+
// triggers callback after cluster is destroyed. This reproduces the issue from #38585.
1493+
TEST_F(RedisClusterTest, NoSegfaultOnClusterDestructionWithPendingCallback) {
1494+
// This test verifies that destroying the cluster properly cleans up resources
1495+
// and doesn't cause a segfault. The key protection is in the destructor that
1496+
// sets is_destroying_ flag and cleans up the redis_discovery_session_.
1497+
1498+
// Create the cluster with basic configuration
1499+
setupFromV3Yaml(BasicConfig);
1500+
const std::list<std::string> resolved_addresses{"127.0.0.1"};
1501+
expectResolveDiscovery(Network::DnsLookupFamily::V4Only, "foo.bar.com", resolved_addresses);
1502+
expectRedisResolve(true);
1503+
1504+
cluster_->initialize([&]() {
1505+
initialized_.ready();
1506+
return absl::OkStatus();
1507+
});
1508+
1509+
EXPECT_CALL(membership_updated_, ready());
1510+
EXPECT_CALL(initialized_, ready());
1511+
EXPECT_CALL(*cluster_callback_, onClusterSlotUpdate(_, _));
1512+
std::bitset<ResponseFlagSize> single_slot_primary(0xfff);
1513+
std::bitset<ResponseReplicaFlagSize> no_replica(0);
1514+
expectClusterSlotResponse(createResponse(single_slot_primary, no_replica));
1515+
expectHealthyHosts(std::list<std::string>({"127.0.0.1:22120"}));
1516+
1517+
// Now destroy the cluster. With the fix in place (destructor setting is_destroying_
1518+
// and resetting redis_discovery_session_), this should not crash.
1519+
// Without the fix, accessing resolve_timer_ after destruction would segfault.
1520+
cluster_.reset();
1521+
1522+
// If we reach here without crashing, the test passes.
1523+
// The fix ensures that:
1524+
// 1. The destructor sets is_destroying_ = true
1525+
// 2. The destructor resets redis_discovery_session_
1526+
// 3. Timer callbacks check is_destroying_ before accessing cluster members
1527+
}
1528+
14901529
} // namespace Redis
14911530
} // namespace Clusters
14921531
} // namespace Extensions

0 commit comments

Comments (0)