Skip to content

Commit 1f49d66

Browse files
authored
feat(histograms): static config for controlling timer/histogram emission (#7298)
Now that the basics are working: time to allow turning off timers. Expected use is: - leave things at default - add a histogram version of a timer metric (i.e. deploy that code) - verify new histogram metrics, migrate alerts and dashboards - the timer data is now unused, so: - set the timer's key to "histogram" to turn off its emission to save on computation - eventually it will be deleted from the source code (~months later) - any keys explicitly set to "timer" or "both" will eventually error when the timer is removed, to prevent missing alerts during the final migration - this is not yet implemented, but seems pretty easy. just check contents against metrics definitions, metrics-defs tests show how. - "histogram" or empty should not error, we can remove it later (and force config files to remove the config then too) And this can also disable histograms if e.g. cardinality is too high and it causes problems: just set the key to "timer". Hopefully this is not needed, but if jumping from "no histograms" to "all timers migrated" in one shot it might be useful for gradual migration / to reduce large blasts of new metrics. --- I'd originally considered doing this in dynamic config, but I doubt we'll need to make quick changes, and static config is significantly less of a pain, both internally for Reasons™ and in code due to the key-ID verbosity. Unrelatedly: this PR is a near-ideal example of why dependency injection frameworks are nice 😢 --------- Signed-off-by: Steven L <[email protected]>
1 parent 69c3939 commit 1f49d66

File tree

68 files changed

+517
-131
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

68 files changed

+517
-131
lines changed

client/wrappers/metered/metered_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ func TestWrappers(t *testing.T) {
4444
clientMock := frontend.NewMockClient(ctrl)
4545

4646
testScope := tally.NewTestScope("", nil)
47-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
47+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
4848

4949
clientMock.EXPECT().CountWorkflowExecutions(gomock.Any(), gomock.Any(), gomock.Any()).
5050
Return(nil, nil).Times(1)
@@ -63,7 +63,7 @@ func TestWrappers(t *testing.T) {
6363
clientMock := frontend.NewMockClient(ctrl)
6464

6565
testScope := tally.NewTestScope("", nil)
66-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
66+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
6767

6868
clientMock.EXPECT().CountWorkflowExecutions(gomock.Any(), gomock.Any(), gomock.Any()).
6969
Return(nil, errors.New("error"))
@@ -85,7 +85,7 @@ func TestMatching(t *testing.T) {
8585
clientMock := matching.NewMockClient(ctrl)
8686

8787
testScope := tally.NewTestScope("", nil)
88-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
88+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
8989

9090
clientMock.EXPECT().AddActivityTask(gomock.Any(), gomock.Any(), gomock.Any()).
9191
Return(&types.AddActivityTaskResponse{}, nil).Times(1)
@@ -106,7 +106,7 @@ func TestMatching(t *testing.T) {
106106
clientMock := matching.NewMockClient(ctrl)
107107

108108
testScope := tally.NewTestScope("", nil)
109-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
109+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
110110

111111
clientMock.EXPECT().PollForDecisionTask(gomock.Any(), gomock.Any(), gomock.Any()).
112112
Return(&types.MatchingPollForDecisionTaskResponse{}, nil).Times(1)
@@ -129,7 +129,7 @@ func TestMatching(t *testing.T) {
129129
clientMock := matching.NewMockClient(ctrl)
130130

131131
testScope := tally.NewTestScope("", nil)
132-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
132+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
133133

134134
clientMock.EXPECT().AddActivityTask(gomock.Any(), gomock.Any(), gomock.Any()).
135135
Return(&types.AddActivityTaskResponse{}, nil).Times(1)
@@ -150,7 +150,7 @@ func TestMatching(t *testing.T) {
150150
clientMock := matching.NewMockClient(ctrl)
151151

152152
testScope := tally.NewTestScope("", nil)
153-
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0))
153+
metricsClient := metrics.NewClient(testScope, metrics.ServiceIdx(0), metrics.HistogramMigration{})
154154

155155
clientMock.EXPECT().AddActivityTask(gomock.Any(), gomock.Any(), gomock.Any()).
156156
Return(&types.AddActivityTaskResponse{}, nil).Times(1)

common/archiver/gcloud/historyArchiver_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ func (h *historyArchiverSuite) SetupTest() {
6363
h.Assertions = require.New(h.T())
6464
h.container = &archiver.HistoryBootstrapContainer{
6565
Logger: testlogger.New(h.T()),
66-
MetricsClient: metrics.NewClient(tally.NoopScope, metrics.History),
66+
MetricsClient: metrics.NewClient(tally.NoopScope, metrics.History, metrics.HistogramMigration{}),
6767
}
6868
h.testArchivalURI, _ = archiver.NewURI("gs://my-bucket-cad/cadence_archival/development")
6969
}

common/archiver/gcloud/visibilityArchiver_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func (s *visibilityArchiverSuite) SetupTest() {
4949
s.Assertions = require.New(s.T())
5050
s.container = &archiver.VisibilityBootstrapContainer{
5151
Logger: testlogger.New(s.T()),
52-
MetricsClient: metrics.NewClient(tally.NoopScope, metrics.History),
52+
MetricsClient: metrics.NewClient(tally.NoopScope, metrics.History, metrics.HistogramMigration{}),
5353
}
5454
s.expectedVisibilityRecords = []*visibilityRecord{
5555
{

common/archiver/s3store/historyArchiver_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func (s *historyArchiverSuite) SetupTest() {
9999
s.Assertions = require.New(s.T())
100100
s.container = &archiver.HistoryBootstrapContainer{
101101
Logger: testlogger.New(s.T()),
102-
MetricsClient: metrics.NewClient(scope, metrics.History),
102+
MetricsClient: metrics.NewClient(scope, metrics.History, metrics.HistogramMigration{}),
103103
}
104104
}
105105

common/archiver/s3store/visibilityArchiver_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ func (s *visibilityArchiverSuite) SetupSuite() {
124124

125125
s.container = &archiver.VisibilityBootstrapContainer{
126126
Logger: testlogger.New(s.T()),
127-
MetricsClient: metrics.NewClient(scope, metrics.History),
127+
MetricsClient: metrics.NewClient(scope, metrics.History, metrics.HistogramMigration{}),
128128
}
129129
s.setupVisibilityDirectory()
130130
}

common/cache/domainCache_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ func (s *domainCacheSuite) SetupTest() {
7676
s.logger = testlogger.New(s.Suite.T())
7777

7878
s.metadataMgr = &mocks.MetadataManager{}
79-
metricsClient := metrics.NewClient(tally.NoopScope, metrics.History)
79+
metricsClient := metrics.NewClient(tally.NoopScope, metrics.History, metrics.HistogramMigration{})
8080
s.domainCache = NewDomainCache(s.metadataMgr, cluster.GetTestClusterMetadata(true), metricsClient, s.logger)
8181

8282
s.domainCache.timeSource = clock.NewMockedTimeSource()
@@ -1184,7 +1184,7 @@ func Test_WithTimeSource(t *testing.T) {
11841184
metadataMgr := &mocks.MetadataManager{}
11851185

11861186
timeSource := clock.NewRealTimeSource()
1187-
domainCache := NewDomainCache(metadataMgr, cluster.GetTestClusterMetadata(true), metrics.NewClient(tally.NoopScope, metrics.History), log.NewNoop(), WithTimeSource(timeSource))
1187+
domainCache := NewDomainCache(metadataMgr, cluster.GetTestClusterMetadata(true), metrics.NewClient(tally.NoopScope, metrics.History, metrics.HistogramMigration{}), log.NewNoop(), WithTimeSource(timeSource))
11881188

11891189
assert.Equal(t, timeSource, domainCache.timeSource)
11901190
}

common/cache/metricsScopeCache_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func TestDomainMetricsCacheSuite(t *testing.T) {
4949

5050
func (s *domainMetricsCacheSuite) SetupTest() {
5151
s.Assertions = require.New(s.T())
52-
s.metricsClient = metrics.NewClient(tally.NoopScope, metrics.Frontend)
52+
s.metricsClient = metrics.NewClient(tally.NoopScope, metrics.Frontend, metrics.HistogramMigration{})
5353

5454
metricsCache := NewDomainMetricsScopeCache().(*domainMetricsScopeCache)
5555
metricsCache.flushDuration = 100 * time.Millisecond

common/config/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"github.com/uber/cadence/common/dynamicconfig"
3636
c "github.com/uber/cadence/common/dynamicconfig/configstore/config"
3737
"github.com/uber/cadence/common/dynamicconfig/dynamicproperties"
38+
"github.com/uber/cadence/common/metrics"
3839
"github.com/uber/cadence/common/peerprovider/ringpopprovider"
3940
"github.com/uber/cadence/common/service"
4041
)
@@ -92,6 +93,11 @@ type (
9293

9394
// ShardDistribution is a config for the shard distributor leader election component that allows to run a single process per region and manage shard namespaces.
9495
ShardDistribution ShardDistribution `yaml:"shardDistribution"`
96+
97+
// Histograms controls timer vs histogram metric emission while they are being migrated.
98+
//
99+
// Timers will eventually be dropped, and this config will be validation-only (e.g. to error if any explicitly request timers).
100+
Histograms metrics.HistogramMigration `yaml:"histograms"`
95101
}
96102

97103
// Membership holds peer provider configuration.

common/config/config_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,16 @@ package config
2222

2323
import (
2424
"net/url"
25+
"strings"
2526
"testing"
2627

2728
"github.com/stretchr/testify/assert"
2829
"github.com/stretchr/testify/require"
30+
uconfig "go.uber.org/config"
31+
"gopkg.in/validator.v2"
2932

3033
"github.com/uber/cadence/common/constants"
34+
"github.com/uber/cadence/common/metrics"
3135
"github.com/uber/cadence/common/service"
3236
)
3337

@@ -387,3 +391,87 @@ func TestInvalidShardedNoSQLConfig_TasklistShardingRefersToUnknownConnection(t *
387391
err := cfg.ValidateAndFillDefaults()
388392
require.ErrorContains(t, err, "Unknown tasklist shard name")
389393
}
394+
395+
func TestHistogramMigrationConfig(t *testing.T) {
396+
t.Run("valid", func(t *testing.T) {
397+
orig := metrics.HistogramMigrationMetrics
398+
t.Cleanup(func() {
399+
metrics.HistogramMigrationMetrics = orig
400+
})
401+
metrics.HistogramMigrationMetrics = map[string]struct{}{
402+
"key1": {},
403+
"key2": {},
404+
"key3": {},
405+
"key4": {},
406+
}
407+
408+
// intentionally avoiding the full config struct, as it has other required config
409+
yaml, err := uconfig.NewYAML(uconfig.RawSource(strings.NewReader(`
410+
default: histogram
411+
names:
412+
key1: true
413+
key2: false
414+
key3:
415+
`)))
416+
require.NoError(t, err)
417+
418+
var cfg metrics.HistogramMigration
419+
err = yaml.Get(uconfig.Root).Populate(&cfg)
420+
require.NoError(t, err)
421+
422+
err = validator.Validate(cfg)
423+
require.NoError(t, err)
424+
425+
check := func(key string, timer, histogram bool) {
426+
assert.Equalf(t, timer, cfg.EmitTimer(key), "wrong value for EmitTimer(%q)", key)
427+
assert.Equalf(t, histogram, cfg.EmitHistogram(key), "wrong value for EmitHistogram(%q)", key)
428+
}
429+
check("key1", true, true)
430+
check("key2", false, false)
431+
check("key3", false, false) // the type's default mode == false. not truly intended behavior, but it's weird config so it's fine.
432+
check("key4", false, true) // configured default == histogram
433+
check("key5", true, true) // not migrating = always emitted
434+
if t.Failed() {
435+
t.Logf("config: %#v", cfg)
436+
}
437+
})
438+
t.Run("invalid default", func(t *testing.T) {
439+
yaml, err := uconfig.NewYAML(uconfig.RawSource(strings.NewReader(`
440+
default: xyz
441+
`)))
442+
require.NoError(t, err)
443+
444+
var cfg metrics.HistogramMigration
445+
err = yaml.Get(uconfig.Root).Populate(&cfg)
446+
assert.ErrorContains(t, err, `unsupported histogram migration mode "xyz", must be "timer", "histogram", or "both"`)
447+
})
448+
t.Run("invalid key", func(t *testing.T) {
449+
orig := metrics.HistogramMigrationMetrics
450+
t.Cleanup(func() {
451+
metrics.HistogramMigrationMetrics = orig
452+
})
453+
metrics.HistogramMigrationMetrics = map[string]struct{}{
454+
"key1": {},
455+
}
456+
yaml, err := uconfig.NewYAML(uconfig.RawSource(strings.NewReader(`
457+
names:
458+
key1: xyz
459+
`)))
460+
require.NoError(t, err)
461+
462+
var cfg metrics.HistogramMigration
463+
err = yaml.Get(uconfig.Root).Populate(&cfg)
464+
assert.ErrorContains(t, err, "cannot unmarshal !!str `xyz` into bool")
465+
})
466+
t.Run("nonexistent key", func(t *testing.T) {
467+
yaml, err := uconfig.NewYAML(uconfig.RawSource(strings.NewReader(`
468+
names:
469+
definitely_does_not_exist: true
470+
`)))
471+
require.NoError(t, err)
472+
473+
var cfg metrics.HistogramMigration
474+
err = yaml.Get(uconfig.Root).Populate(&cfg)
475+
assert.ErrorContains(t, err, `unknown histogram-migration metric name "definitely_does_not_exist"`)
476+
})
477+
}

common/domain/failover_watcher_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ func (s *failoverWatcherSuite) SetupTest() {
8686

8787
logger := testlogger.New(s.T())
8888
scope := tally.NewTestScope("failover_test", nil)
89-
metricsClient := metrics.NewClient(scope, metrics.Frontend)
89+
metricsClient := metrics.NewClient(scope, metrics.Frontend, metrics.HistogramMigration{})
9090
s.watcher = NewFailoverWatcher(
9191
s.mockDomainCache,
9292
s.mockMetadataMgr,

0 commit comments

Comments (0)