Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions source/extensions/clusters/redis/redis_cluster.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,7 @@ RedisCluster::RedisCluster(
context.serverFactoryContext().mainThreadDispatcher(),
context.serverFactoryContext().clusterManager(),
context.serverFactoryContext().api().timeSource())),
registration_handle_(refresh_manager_->registerCluster(
cluster_name_, redirect_refresh_interval_, redirect_refresh_threshold_,
failure_refresh_threshold_, host_degraded_refresh_threshold_, [&]() {
redis_discovery_session_->resolve_timer_->enableTimer(std::chrono::milliseconds(0));
})) {
registration_handle_(nullptr) {
const auto& locality_lb_endpoints = load_assignment_.endpoints();
for (const auto& locality_lb_endpoint : locality_lb_endpoints) {
for (const auto& lb_endpoint : locality_lb_endpoint.lb_endpoints()) {
Expand All @@ -87,6 +83,32 @@ RedisCluster::RedisCluster(
*this, host.socket_address().address(), host.socket_address().port_value()));
}
}

// Register the cluster callback using weak_ptr to avoid use-after-free
std::weak_ptr<RedisDiscoverySession> weak_session = redis_discovery_session_;
registration_handle_ = refresh_manager_->registerCluster(
cluster_name_, redirect_refresh_interval_, redirect_refresh_threshold_,
failure_refresh_threshold_, host_degraded_refresh_threshold_, [weak_session]() {
// Try to lock the weak pointer to ensure the session is still alive
auto session = weak_session.lock();
if (session && session->resolve_timer_) {
session->resolve_timer_->enableTimer(std::chrono::milliseconds(0));
}
});
}

RedisCluster::~RedisCluster() {
  // Mark the cluster as tearing down. Timer callbacks created elsewhere in
  // this file check this flag and return early, so any callback that fires on
  // the dispatcher after this point will not touch members that are about to
  // be destroyed.
  is_destroying_.store(true);

  // Destroy the discovery session explicitly, ahead of the implicit member
  // destruction order. The refresh-manager callback registered in the
  // constructor holds only a std::weak_ptr to this session, so once the
  // shared_ptr is reset here the callback's lock() yields null and it cannot
  // reach the session's resolve timer.
  redis_discovery_session_.reset();

  // Tear down the DNS resolve targets as well; their destructors cancel any
  // in-flight query and disable their resolve timer, preventing those
  // callbacks from touching the partially-destroyed cluster.
  dns_discovery_resolve_targets_.clear();
}

void RedisCluster::startPreInit() {
Expand Down Expand Up @@ -198,7 +220,7 @@ RedisCluster::DnsDiscoveryResolveTarget::~DnsDiscoveryResolveTarget() {
active_query_->cancel(Network::ActiveDnsQuery::CancelReason::QueryAbandoned);
}
// Disable timer for mock tests.
if (resolve_timer_) {
if (resolve_timer_ && resolve_timer_->enabled()) {
resolve_timer_->disableTimer();
}
}
Expand All @@ -220,8 +242,13 @@ void RedisCluster::DnsDiscoveryResolveTarget::startResolveDns() {
}

if (!resolve_timer_) {
resolve_timer_ =
parent_.dispatcher_.createTimer([this]() -> void { startResolveDns(); });
resolve_timer_ = parent_.dispatcher_.createTimer([this]() -> void {
// Check if the parent cluster is being destroyed
if (parent_.is_destroying_.load()) {
return;
}
startResolveDns();
});
}
// if the initial dns resolved to empty, we'll skip the redis discovery phase and
// treat it as an empty cluster.
Expand All @@ -244,7 +271,13 @@ RedisCluster::RedisDiscoverySession::RedisDiscoverySession(
Envoy::Extensions::Clusters::Redis::RedisCluster& parent,
NetworkFilters::Common::Redis::Client::ClientFactory& client_factory)
: parent_(parent), dispatcher_(parent.dispatcher_),
resolve_timer_(parent.dispatcher_.createTimer([this]() -> void { startResolveRedis(); })),
resolve_timer_(parent.dispatcher_.createTimer([this]() -> void {
// Check if the parent cluster is being destroyed
if (parent_.is_destroying_.load()) {
return;
}
startResolveRedis();
})),
client_factory_(client_factory), buffer_timeout_(0),
redis_command_stats_(
NetworkFilters::Common::Redis::RedisCommandStats::createRedisCommandStats(
Expand Down
6 changes: 5 additions & 1 deletion source/extensions/clusters/redis/redis_cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ namespace Redis {

class RedisCluster : public Upstream::BaseDynamicClusterImpl {
public:
~RedisCluster();
static absl::StatusOr<std::unique_ptr<RedisCluster>>
create(const envoy::config::cluster::v3::Cluster& cluster,
const envoy::extensions::clusters::redis::v3::RedisClusterConfig& redis_cluster,
Expand Down Expand Up @@ -302,7 +303,10 @@ class RedisCluster : public Upstream::BaseDynamicClusterImpl {
const std::string auth_password_;
const std::string cluster_name_;
const Common::Redis::ClusterRefreshManagerSharedPtr refresh_manager_;
const Common::Redis::ClusterRefreshManager::HandlePtr registration_handle_;
Common::Redis::ClusterRefreshManager::HandlePtr registration_handle_;

// Flag to prevent callbacks during destruction
std::atomic<bool> is_destroying_{false};
};

class RedisClusterFactory : public Upstream::ConfigurableClusterFactoryBase<
Expand Down
1 change: 1 addition & 0 deletions test/extensions/clusters/redis/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ envoy_extension_cc_test(
"//source/server:transport_socket_config_lib",
"//test/common/upstream:utility_lib",
"//test/extensions/clusters/redis:redis_cluster_mocks",
"//test/extensions/common/redis:mocks_lib",
"//test/extensions/filters/network/common/redis:redis_mocks",
"//test/extensions/filters/network/common/redis:test_utils_lib",
"//test/extensions/filters/network/redis_proxy:redis_mocks",
Expand Down
39 changes: 39 additions & 0 deletions test/extensions/clusters/redis/redis_cluster_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include "test/common/upstream/utility.h"
#include "test/extensions/clusters/redis/mocks.h"
#include "test/extensions/common/redis/mocks.h"
#include "test/extensions/filters/network/common/redis/mocks.h"
#include "test/mocks/common.h"
#include "test/mocks/protobuf/mocks.h"
Expand Down Expand Up @@ -1487,6 +1488,44 @@ TEST_F(RedisClusterTest, HostRemovalAfterHcFail) {
*/
}

// Verifies that destroying the cluster after a full discovery cycle does not
// segfault when the refresh manager could still trigger its callback
// (regression test for issue #38585). The guarded behavior under test: the
// destructor sets is_destroying_ and resets redis_discovery_session_, and the
// refresh-manager callback holds only a weak_ptr to the session.
TEST_F(RedisClusterTest, NoSegfaultOnClusterDestructionWithPendingCallback) {
  // Bring the cluster all the way through DNS resolution and an initial Redis
  // CLUSTER SLOTS discovery so that timers and the refresh-manager
  // registration are live before destruction.
  setupFromV3Yaml(BasicConfig);
  const std::list<std::string> resolved_addresses{"127.0.0.1"};
  expectResolveDiscovery(Network::DnsLookupFamily::V4Only, "foo.bar.com", resolved_addresses);
  expectRedisResolve(true);

  cluster_->initialize([&]() {
    initialized_.ready();
    return absl::OkStatus();
  });

  // A single-primary, no-replica slot response must complete initialization
  // and publish the resolved host.
  EXPECT_CALL(membership_updated_, ready());
  EXPECT_CALL(initialized_, ready());
  EXPECT_CALL(*cluster_callback_, onClusterSlotUpdate(_, _));
  std::bitset<ResponseFlagSize> single_slot_primary(0xfff);
  std::bitset<ResponseReplicaFlagSize> no_replica(0);
  expectClusterSlotResponse(createResponse(single_slot_primary, no_replica));
  expectHealthyHosts(std::list<std::string>({"127.0.0.1:22120"}));

  // Destroy the cluster while discovery state is live. With the destructor
  // fix in place (is_destroying_ flag plus redis_discovery_session_ reset)
  // this must not crash; without it, a pending callback dereferencing the
  // destroyed session's resolve_timer_ would segfault.
  cluster_.reset();

  // Reaching this point without a crash is the pass condition: the weak_ptr
  // callback can no longer lock a live session, and timer callbacks bail out
  // on the is_destroying_ flag.
}

} // namespace Redis
} // namespace Clusters
} // namespace Extensions
Expand Down