Skip to content

Commit f364725

Browse files
committed
clientv3: implement exponential backoff mechanism
The current clientv3 backoff behavior is a flat backoff with jitter. A backoff wait time that is too low can amplify cascading failures, because client requests can be retried many times with only a short wait between each request. Operators of large etcd clusters can increase the backoff wait time, but for large clusters that wait time needs to be quite large in order to safely protect the cluster from a large number of clients retrying. A very high backoff time means that retries in a non-cascading failure will wait longer than needed. A better solution, which both handles cascading failures and keeps retry times low in non-cascading failures, is to implement exponential backoff within the etcd clients. This commit implements the mechanism for exponential backoff in clients with two new parameters: 1. BackoffExponent: configures the exponential backoff factor. For example, a BackoffExponent of 2.0 doubles the backoff time between each retry. The default value of BackoffExponent is 1.0, which disables exponential backoff for backward compatibility. 2. BackoffMaxWaitBetween: configures the maximum wait time when performing exponential backoff. The default value is 5 seconds. Signed-off-by: Elias Carter <[email protected]>
1 parent d3f136a commit f364725

File tree

6 files changed

+141
-4
lines changed

6 files changed

+141
-4
lines changed

client/v3/client.go

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,20 @@ func (c *Client) dialSetupOpts(creds grpccredentials.TransportCredentials, dopts
264264
backoffJitterFraction = c.cfg.BackoffJitterFraction
265265
}
266266

267+
backoffExponent := defaultBackoffExponent
268+
if c.cfg.BackoffExponent > 0 {
269+
backoffExponent = c.cfg.BackoffExponent
270+
}
271+
272+
backoffMaxWaitBetween := defaultBackoffMaxWaitBetween
273+
if c.cfg.BackoffMaxWaitBetween > 0 {
274+
backoffMaxWaitBetween = c.cfg.BackoffMaxWaitBetween
275+
}
276+
267277
// Interceptor retry and backoff.
268278
// TODO: Replace all of clientv3/retry.go with RetryPolicy:
269279
// https://github.com/grpc/grpc-proto/blob/cdd9ed5c3d3f87aef62f373b93361cf7bddc620d/grpc/service_config/service_config.proto#L130
270-
rrBackoff := withBackoff(c.roundRobinQuorumBackoff(backoffWaitBetween, backoffJitterFraction))
280+
rrBackoff := withBackoff(c.roundRobinQuorumBackoff(backoffWaitBetween, backoffJitterFraction, backoffExponent, backoffMaxWaitBetween))
271281
opts = append(opts,
272282
// Disable stream retry by default since go-grpc-middleware/retry does not support client streams.
273283
// Streams that are safe to retry are enabled individually.
@@ -502,14 +512,22 @@ func newClient(cfg *Config) (*Client, error) {
502512

503513
// roundRobinQuorumBackoff retries against quorum between each backoff.
504514
// This is intended for use with a round robin load balancer.
505-
func (c *Client) roundRobinQuorumBackoff(waitBetween time.Duration, jitterFraction float64) backoffFunc {
515+
func (c *Client) roundRobinQuorumBackoff(waitBetween time.Duration, jitterFraction float64, backoffExponent float64, maxWaitBetween time.Duration) backoffFunc {
506516
return func(attempt uint) time.Duration {
507517
// after each round robin across quorum, backoff for our wait between duration
508518
n := uint(len(c.Endpoints()))
509519
quorum := (n/2 + 1)
510520
if attempt%quorum == 0 {
511-
c.lg.Debug("backoff", zap.Uint("attempt", attempt), zap.Uint("quorum", quorum), zap.Duration("waitBetween", waitBetween), zap.Float64("jitterFraction", jitterFraction))
512-
return jitterUp(waitBetween, jitterFraction)
521+
c.lg.Debug(
522+
"backoff",
523+
zap.Uint("attempt", attempt),
524+
zap.Uint("quorum", quorum),
525+
zap.Duration("waitBetween", waitBetween),
526+
zap.Float64("jitterFraction", jitterFraction),
527+
zap.Float64("backoffExponent", backoffExponent),
528+
zap.Duration("maxWaitBetween", maxWaitBetween),
529+
)
530+
return jitterUp(expBackoff(attempt, backoffExponent, waitBetween, maxWaitBetween), jitterFraction)
513531
}
514532
c.lg.Debug("backoff skipped", zap.Uint("attempt", attempt), zap.Uint("quorum", quorum))
515533
return 0

client/v3/client_test.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,46 @@ func TestBackoffJitterFraction(t *testing.T) {
194194
require.InDelta(t, backoffJitterFraction, c.cfg.BackoffJitterFraction, 0.01)
195195
}
196196

197+
func TestBackoffExponent(t *testing.T) {
198+
backoffExponent := float64(2.0)
199+
cfg := Config{
200+
Endpoints: []string{"127.0.0.1:12345"},
201+
BackoffExponent: backoffExponent,
202+
}
203+
c, err := NewClient(t, cfg)
204+
require.NoError(t, err)
205+
require.NotNil(t, c)
206+
defer c.Close()
207+
208+
require.InDelta(t, backoffExponent, c.cfg.BackoffExponent, 0.01)
209+
210+
backoffExponent = float64(1.0)
211+
cfg = Config{
212+
Endpoints: []string{"127.0.0.1:12345"},
213+
BackoffExponent: backoffExponent,
214+
}
215+
c, err = NewClient(t, cfg)
216+
require.NoError(t, err)
217+
require.NotNil(t, c)
218+
defer c.Close()
219+
220+
require.InDelta(t, backoffExponent, c.cfg.BackoffExponent, 0.01)
221+
}
222+
223+
func TestMaxBackoff(t *testing.T) {
224+
backoffMaxWaitBetween := 100 * time.Millisecond
225+
cfg := Config{
226+
Endpoints: []string{"127.0.0.1:12345"},
227+
BackoffMaxWaitBetween: backoffMaxWaitBetween,
228+
}
229+
c, err := NewClient(t, cfg)
230+
require.NoError(t, err)
231+
require.NotNil(t, c)
232+
defer c.Close()
233+
234+
require.Equal(t, backoffMaxWaitBetween, c.cfg.BackoffMaxWaitBetween)
235+
}
236+
197237
func TestIsHaltErr(t *testing.T) {
198238
assert.Truef(t,
199239
isHaltErr(t.Context(), errors.New("etcdserver: some etcdserver error")),

client/v3/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ type Config struct {
102102
// BackoffJitterFraction is the jitter fraction to randomize backoff wait time.
103103
BackoffJitterFraction float64 `json:"backoff-jitter-fraction"`
104104

105+
// BackoffExponent is the exponential backoff factor for retries.
106+
BackoffExponent float64 `json:"backoff-exponent"`
107+
108+
// BackoffMaxWaitBetween is the max wait time before retrying an RPC after exponential backoff.
109+
BackoffMaxWaitBetween time.Duration `json:"backoff-max-wait-between"`
110+
105111
// TODO: support custom balancer picker
106112
}
107113

client/v3/options.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ var (
5454

5555
// client-side retry backoff default jitter fraction.
5656
defaultBackoffJitterFraction = 0.10
57+
58+
// client-side retry backoff exponential factor. The default of 1.0 disables exponential backoff.
59+
defaultBackoffExponent = 1.0
60+
61+
// client-side retry backoff exponential max wait between requests.
62+
defaultBackoffMaxWaitBetween = 5 * time.Second
5763
)
5864

5965
// defaultCallOpts defines a list of default "gRPC.CallOption".

client/v3/utils.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
package clientv3
1616

1717
import (
18+
"math"
1819
"math/rand"
1920
"time"
2021
)
@@ -29,3 +30,13 @@ func jitterUp(duration time.Duration, jitter float64) time.Duration {
2930
multiplier := jitter * (rand.Float64()*2 - 1)
3031
return time.Duration(float64(duration) * (1 + multiplier))
3132
}
33+
34+
// expBackoff returns the exponential backoff duration for a retry generation.
//
// The raw delay is minDelay * exponent^generation, clamped to
// [minDelay, maxDelay]. For example, an exponent of 2.0 doubles the delay on
// every subsequent generation, while an exponent of 1.0 always yields
// minDelay. A generation of 0 returns minDelay.
//
// The result never drops below minDelay:
//   - an exponent below 1.0 would shrink the raw delay on each generation, so
//     it is clamped up to minDelay;
//   - if maxDelay is smaller than minDelay, minDelay is used as the upper
//     bound as well, so the base wait is never shortened by the cap.
func expBackoff(generation uint, exponent float64, minDelay, maxDelay time.Duration) time.Duration {
	if maxDelay < minDelay {
		maxDelay = minDelay
	}

	delay := math.Pow(exponent, float64(generation)) * float64(minDelay)

	// math.Pow overflows to +Inf for large generations; multiplied by a zero
	// minDelay that becomes NaN, which must not be converted to an integer
	// duration. Treat it like any other value at or below the floor.
	if math.IsNaN(delay) || delay <= float64(minDelay) {
		return minDelay
	}
	// Compare in floating point before converting so that +Inf and values
	// beyond the int64 range never reach the time.Duration conversion.
	if delay >= float64(maxDelay) {
		return maxDelay
	}
	return time.Duration(delay)
}

client/v3/utils_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// Copyright 2025 The etcd Authors
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package clientv3
16+
17+
import (
18+
"fmt"
19+
"math"
20+
"testing"
21+
"time"
22+
23+
"github.com/stretchr/testify/require"
24+
)
25+
26+
func TestExpBackoff(t *testing.T) {
27+
testCases := []struct {
28+
generation uint
29+
exponent float64
30+
minDelay time.Duration
31+
maxDelay time.Duration
32+
expectedBackoff time.Duration
33+
}{
34+
// exponential backoff with 2.0 exponent
35+
{generation: 0, exponent: 2.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
36+
{generation: 1, exponent: 2.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 200 * time.Millisecond},
37+
{generation: 2, exponent: 2.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 400 * time.Millisecond},
38+
{generation: 3, exponent: 2.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 500 * time.Millisecond},
39+
{generation: math.MaxUint, exponent: 2.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 500 * time.Millisecond},
40+
41+
// exponential backoff with 1.0 exponent
42+
{generation: 0, exponent: 1.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
43+
{generation: 1, exponent: 1.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
44+
{generation: 2, exponent: 1.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
45+
{generation: 3, exponent: 1.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
46+
{generation: math.MaxUint, exponent: 1.0, minDelay: 100 * time.Millisecond, maxDelay: 500 * time.Millisecond, expectedBackoff: 100 * time.Millisecond},
47+
}
48+
49+
for _, testCase := range testCases {
50+
testName := fmt.Sprintf("%+v", testCase)
51+
t.Run(testName, func(t *testing.T) {
52+
backoff := expBackoff(testCase.generation, testCase.exponent, testCase.minDelay, testCase.maxDelay)
53+
require.InDelta(t, backoff, testCase.expectedBackoff, 0.01)
54+
})
55+
}
56+
}

0 commit comments

Comments
 (0)