feat(shard-distributor): implement WatchNamespaceState streaming RPC #7432
@@ -142,5 +142,78 @@ func (h *handlerImpl) assignEphemeralShard(ctx context.Context, namespace string
```go
}

func (h *handlerImpl) WatchNamespaceState(request *types.WatchNamespaceStateRequest, server WatchNamespaceStateServer) error {
    h.startWG.Wait()

    // Subscribe to state changes from storage
    assignmentChangesChan, unSubscribe, err := h.storage.SubscribeToAssignmentChanges(server.Context(), request.Namespace)
    if err != nil {
        return fmt.Errorf("subscribe to namespace state: %w", err)
    }
    defer unSubscribe()

    // Send the initial state immediately so the client doesn't have to wait for the first update
    state, err := h.storage.GetState(server.Context(), request.Namespace)
    if err != nil {
        return fmt.Errorf("get initial state: %w", err)
    }
    response := toWatchNamespaceStateResponse(state)
    if err := server.Send(response); err != nil {
        return fmt.Errorf("send initial state: %w", err)
    }

    // Stream subsequent updates
    for {
        select {
        case <-server.Context().Done():
```
**Contributor:** What if we stop the shard distributor? Is it implicitly handled with the server context?

**Member (author):** Good question - I assume so - it's the only context available at least.

**Contributor:** I think we can test this: shut down the shard distributor and check that the canaries are not hanging but connect to a new stream :)
```go
            return server.Context().Err()
        case assignmentChanges, ok := <-assignmentChangesChan:
            if !ok {
                return fmt.Errorf("unexpected close of updates channel")
            }
            response := &types.WatchNamespaceStateResponse{
                Executors: make([]*types.ExecutorShardAssignment, 0, len(assignmentChanges)),
            }
            for executor, shardIDs := range assignmentChanges {
                response.Executors = append(response.Executors, &types.ExecutorShardAssignment{
                    ExecutorID:     executor.ExecutorID,
                    AssignedShards: WrapShards(shardIDs),
                    Metadata:       executor.Metadata,
                })
            }

            err = server.Send(response)
            if err != nil {
                return fmt.Errorf("send response: %w", err)
            }
        }
    }
}

func toWatchNamespaceStateResponse(state *store.NamespaceState) *types.WatchNamespaceStateResponse {
    response := &types.WatchNamespaceStateResponse{
        Executors: make([]*types.ExecutorShardAssignment, 0, len(state.ShardAssignments)),
    }

    for executorID, assignment := range state.ShardAssignments {
        // Extract shard IDs from the assigned shards map
        shardIDs := make([]string, 0, len(assignment.AssignedShards))
        for shardID := range assignment.AssignedShards {
            shardIDs = append(shardIDs, shardID)
        }

        response.Executors = append(response.Executors, &types.ExecutorShardAssignment{
            ExecutorID:     executorID,
            AssignedShards: WrapShards(shardIDs),
            Metadata:       state.Executors[executorID].Metadata,
        })
    }
    return response
}

func WrapShards(shardIDs []string) []*types.Shard {
    shards := make([]*types.Shard, 0, len(shardIDs))
    for _, shardID := range shardIDs {
        shards = append(shards, &types.Shard{ShardKey: shardID})
    }
    return shards
}
```
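Regarding the shutdown question in the review thread above: a consumer of this stream would typically treat a server-side close like any other stream error and re-establish the watch. Below is a minimal client-side sketch, assuming a hypothetical `Client`/`Stream` pair standing in for the generated shard-distributor client (the real client API, and the `types` import path, may differ from what is shown):

```go
package example

import (
    "context"
    "time"

    "github.com/uber/cadence/common/types" // import path assumed
)

// Client and Stream are hypothetical stand-ins for the generated
// shard-distributor client and its server-streaming response.
type Client interface {
    WatchNamespaceState(ctx context.Context, req *types.WatchNamespaceStateRequest) (Stream, error)
}

type Stream interface {
    Recv() (*types.WatchNamespaceStateResponse, error)
}

// watchLoop keeps a watch stream open and re-dials after any stream error,
// e.g. when the shard distributor shuts down and the server cancels the stream context.
func watchLoop(ctx context.Context, client Client, namespace string, apply func(*types.WatchNamespaceStateResponse)) {
    for ctx.Err() == nil {
        stream, err := client.WatchNamespaceState(ctx, &types.WatchNamespaceStateRequest{Namespace: namespace})
        if err != nil {
            time.Sleep(time.Second) // simple fixed backoff before reconnecting
            continue
        }
        for {
            resp, err := stream.Recv()
            if err != nil {
                break // stream closed by the server or the network; reconnect
            }
            apply(resp) // hand the latest assignment snapshot to the caller
        }
    }
}
```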
@@ -0,0 +1,65 @@
```go
package shardcache

import (
    "context"
    "sync"

    "github.com/google/uuid"

    "github.com/uber/cadence/common/log"
    "github.com/uber/cadence/common/log/tag"
    "github.com/uber/cadence/service/sharddistributor/store"
)

// executorStatePubSub manages subscriptions to executor state changes
type executorStatePubSub struct {
    mu          sync.RWMutex
    subscribers map[string]chan<- map[*store.ShardOwner][]string
    logger      log.Logger
    namespace   string
}

func newExecutorStatePubSub(logger log.Logger, namespace string) *executorStatePubSub {
    return &executorStatePubSub{
        subscribers: make(map[string]chan<- map[*store.ShardOwner][]string),
        logger:      logger,
        namespace:   namespace,
    }
}

// subscribe returns a channel that receives executor state updates,
// together with a function that removes the subscription.
func (p *executorStatePubSub) subscribe(ctx context.Context) (<-chan map[*store.ShardOwner][]string, func()) {
    ch := make(chan map[*store.ShardOwner][]string)
    uniqueID := uuid.New().String()
```
**Contributor:** Thinking out loud, should we return the subscription ID for debug purposes?

**Member (author):** I don't see the value, but maybe if you elaborate a bit?

**Contributor:** In case of issues with a subscription we only have the subscription ID stored on the SD side, and we don't know which instance is not receiving updates. We can tell which namespace is impacted, but maybe that is too wide. I am thinking whether we should prepend the caller instance to this UID, for example.

**Member (author):** I'll do a follow-up PR to add a spectator ID so we can make this connection.
```go
    p.mu.Lock()
    defer p.mu.Unlock()
    p.subscribers[uniqueID] = ch

    unSub := func() {
        p.unSubscribe(uniqueID)
    }

    return ch, unSub
}

func (p *executorStatePubSub) unSubscribe(uniqueID string) {
    p.mu.Lock()
    defer p.mu.Unlock()
    delete(p.subscribers, uniqueID)
}
```
```go
// publish sends the state to all subscribers (non-blocking)
func (p *executorStatePubSub) publish(state map[*store.ShardOwner][]string) {
    p.mu.RLock()
    defer p.mu.RUnlock()

    for _, sub := range p.subscribers {
        select {
        case sub <- state:
        default:
            // Subscriber is not reading fast enough, skip this update
```
**Contributor:** Should we retry? We call refresh and then publish only in case of changes. Say no changes happen: some subscribers will have stale info until the next change. Are we fine with that?

**Member (author):** I think that is a good point - maybe we can send a reconciliation message every 1s?

**Contributor:** Yes, this is a good idea.

**Member (author):** Will do a follow-up PR.
```go
            p.logger.Warn("Subscriber not keeping up with state updates, dropping update", tag.ShardNamespace(p.namespace))
        }
    }
}
```
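On the stale-subscriber discussion above: one possible shape for the reconciliation follow-up is a periodic full-state resend inside the `WatchNamespaceState` loop shown earlier (it could equally live on the publisher side). This is a sketch of an assumed follow-up, not part of this PR; the 1s interval is taken from the thread, and the placement is an assumption:

```go
    // Inside WatchNamespaceState, before the streaming loop (requires the "time" import):
    reconcileTicker := time.NewTicker(time.Second)
    defer reconcileTicker.Stop()

    for {
        select {
        case <-server.Context().Done():
            return server.Context().Err()
        case <-reconcileTicker.C:
            // Resend the full state so a subscriber whose update was dropped
            // by publish() converges within roughly one tick.
            state, err := h.storage.GetState(server.Context(), request.Namespace)
            if err != nil {
                return fmt.Errorf("get state for reconciliation: %w", err)
            }
            if err := server.Send(toWatchNamespaceStateResponse(state)); err != nil {
                return fmt.Errorf("send reconciliation state: %w", err)
            }
        case assignmentChanges, ok := <-assignmentChangesChan:
            if !ok {
                return fmt.Errorf("unexpected close of updates channel")
            }
            _ = assignmentChanges // incremental updates handled exactly as in the diff above
        }
    }
```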
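And on the subscription-ID thread earlier in this file: a possible shape for the spectator-ID follow-up the author mentions, where the caller's identity is prepended to the generated UUID so SD-side log lines can be tied to a concrete instance. The `subscribeWithID` name, the `spectatorID` parameter, and the key format are assumptions about a future PR, not part of this change:

```go
// subscribeWithID is a hypothetical variant of subscribe that ties the
// subscription key to the calling spectator for easier debugging.
func (p *executorStatePubSub) subscribeWithID(ctx context.Context, spectatorID string) (<-chan map[*store.ShardOwner][]string, string, func()) {
    ch := make(chan map[*store.ShardOwner][]string)
    // Prepend the caller's identity so logs can name the instance that is
    // not receiving updates, rather than only the impacted namespace.
    subscriptionID := spectatorID + "/" + uuid.New().String()

    p.mu.Lock()
    p.subscribers[subscriptionID] = ch
    p.mu.Unlock()

    return ch, subscriptionID, func() { p.unSubscribe(subscriptionID) }
}
```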