Skip to content

Commit e0833b7

Browse files
authored
Minor auto_refresh cleanup (#5813)
* Minor auto refresh cleanup Signed-off-by: Jason Parraga <sovietaced@gmail.com> * lint-fix Signed-off-by: Jason Parraga <sovietaced@gmail.com> --------- Signed-off-by: Jason Parraga <sovietaced@gmail.com>
1 parent f5e016f commit e0833b7

File tree

4 files changed

+470
-385
lines changed

4 files changed

+470
-385
lines changed

flytestdlib/cache/auto_refresh.go

Lines changed: 0 additions & 365 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,8 @@ package cache
22

33
import (
44
"context"
5-
"fmt"
6-
"runtime/debug"
7-
"sync"
8-
"time"
95

10-
lru "github.com/hashicorp/golang-lru"
11-
"github.com/prometheus/client_golang/prometheus"
12-
"k8s.io/client-go/util/workqueue"
13-
"k8s.io/utils/clock"
14-
15-
"github.com/flyteorg/flyte/flytestdlib/contextutils"
166
"github.com/flyteorg/flyte/flytestdlib/errors"
17-
"github.com/flyteorg/flyte/flytestdlib/logger"
18-
"github.com/flyteorg/flyte/flytestdlib/promutils"
197
)
208

219
type ItemID = string
@@ -45,16 +33,6 @@ type AutoRefresh interface {
4533
DeleteDelayed(id ItemID) error
4634
}
4735

48-
type metrics struct {
49-
SyncErrors prometheus.Counter
50-
Evictions prometheus.Counter
51-
SyncLatency promutils.StopWatch
52-
CacheHit prometheus.Counter
53-
CacheMiss prometheus.Counter
54-
Size prometheus.Gauge
55-
scope promutils.Scope
56-
}
57-
5836
type Item interface {
5937
IsTerminal() bool
6038
}
@@ -91,346 +69,3 @@ type SyncFunc func(ctx context.Context, batch Batch) (
9169
// CreateBatchesFunc is a func type. Your implementation of this function for your cache instance is responsible for
9270
// subdividing the list of cache items into batches.
9371
type CreateBatchesFunc func(ctx context.Context, snapshot []ItemWrapper) (batches []Batch, err error)
94-
95-
type itemWrapper struct {
96-
id ItemID
97-
item Item
98-
}
99-
100-
func (i itemWrapper) GetID() ItemID {
101-
return i.id
102-
}
103-
104-
func (i itemWrapper) GetItem() Item {
105-
return i.item
106-
}
107-
108-
// Thread-safe general purpose auto-refresh cache that watches for updates asynchronously for the keys after they are added to
// the cache. An item can be inserted only once.
//
// Get reads from sync.map while refresh is invoked on a snapshot of keys. Cache eventually catches up on deleted items.
//
// Sync is run as a fixed-interval-scheduled-task, and is skipped if sync from previous cycle is still running.
type autoRefresh struct {
	name            string            // label used to name the worker and enqueue goroutines
	metrics         metrics           // prometheus metrics for this cache instance
	syncCb          SyncFunc          // user callback that refreshes one batch of items
	createBatchesCb CreateBatchesFunc // user callback that partitions a snapshot into batches
	lruMap          *lru.Cache        // backing storage; evictions bump metrics.Evictions
	// Items that are currently being processed are in the processing set.
	// It will prevent the same item from being processed multiple times by different workers.
	processing  *sync.Map
	toDelete    *syncSet                        // ids queued by DeleteDelayed, removed on the next sync cycle
	syncPeriod  time.Duration                   // interval between enqueue cycles
	workqueue   workqueue.RateLimitingInterface // rate-limited queue of *Batch consumed by sync workers
	parallelism uint                            // number of concurrent sync worker goroutines
	lock        sync.RWMutex                    // guards Update/Delete against each other
	clock       clock.Clock                     // injectable clock (real clock in production, fake in tests)
}
130-
131-
func getEvictionFunction(counter prometheus.Counter) func(key interface{}, value interface{}) {
132-
return func(_ interface{}, _ interface{}) {
133-
counter.Inc()
134-
}
135-
}
136-
137-
func SingleItemBatches(_ context.Context, snapshot []ItemWrapper) (batches []Batch, err error) {
138-
res := make([]Batch, 0, len(snapshot))
139-
for _, item := range snapshot {
140-
res = append(res, Batch{item})
141-
}
142-
143-
return res, nil
144-
}
145-
146-
// newMetrics constructs the prometheus metrics for one cache instance, all
// registered under the given scope (which is also retained for later use).
func newMetrics(scope promutils.Scope) metrics {
	return metrics{
		SyncErrors:  scope.MustNewCounter("sync_errors", "Counter for sync errors."),
		Evictions:   scope.MustNewCounter("lru_evictions", "Counter for evictions from LRU."),
		SyncLatency: scope.MustNewStopWatch("latency", "Latency for sync operations.", time.Millisecond),
		CacheHit:    scope.MustNewCounter("cache_hit", "Counter for cache hits."),
		CacheMiss:   scope.MustNewCounter("cache_miss", "Counter for cache misses."),
		Size:        scope.MustNewGauge("size", "Current size of the cache"),
		scope:       scope,
	}
}
157-
158-
func (w *autoRefresh) Start(ctx context.Context) error {
159-
for i := uint(0); i < w.parallelism; i++ {
160-
go func(ctx context.Context) {
161-
err := w.sync(ctx)
162-
if err != nil {
163-
logger.Errorf(ctx, "Failed to sync. Error: %v", err)
164-
}
165-
}(contextutils.WithGoroutineLabel(ctx, fmt.Sprintf("%v-worker-%v", w.name, i)))
166-
}
167-
168-
enqueueCtx := contextutils.WithGoroutineLabel(ctx, fmt.Sprintf("%v-enqueue", w.name))
169-
go w.enqueueLoop(enqueueCtx)
170-
171-
return nil
172-
}
173-
174-
func (w *autoRefresh) enqueueLoop(ctx context.Context) {
175-
timer := w.clock.NewTimer(w.syncPeriod)
176-
defer timer.Stop()
177-
178-
for {
179-
select {
180-
case <-ctx.Done():
181-
return
182-
case <-timer.C():
183-
err := w.enqueueBatches(ctx)
184-
if err != nil {
185-
logger.Errorf(ctx, "Failed to enqueue. Error: %v", err)
186-
}
187-
timer.Reset(w.syncPeriod)
188-
}
189-
}
190-
}
191-
192-
// Update updates the item only if it exists in the cache, return true if we updated the item.
193-
func (w *autoRefresh) Update(id ItemID, item Item) (ok bool) {
194-
w.lock.Lock()
195-
defer w.lock.Unlock()
196-
ok = w.lruMap.Contains(id)
197-
if ok {
198-
w.lruMap.Add(id, item)
199-
}
200-
return ok
201-
}
202-
203-
// Delete deletes the item from the cache if it exists.
// It also clears any pending delayed deletion for the key, so the id no
// longer appears queued for removal in the next sync cycle.
func (w *autoRefresh) Delete(key interface{}) {
	w.lock.Lock()
	defer w.lock.Unlock()
	w.toDelete.Remove(key)
	w.lruMap.Remove(key)
}
210-
211-
func (w *autoRefresh) Get(id ItemID) (Item, error) {
212-
if val, ok := w.lruMap.Get(id); ok {
213-
w.metrics.CacheHit.Inc()
214-
return val.(Item), nil
215-
}
216-
217-
w.metrics.CacheMiss.Inc()
218-
return nil, errors.Errorf(ErrNotFound, "Item with id [%v] not found.", id)
219-
}
220-
221-
// Return the item if exists else create it.
222-
// Create should be invoked only once. recreating the object is not supported.
223-
func (w *autoRefresh) GetOrCreate(id ItemID, item Item) (Item, error) {
224-
if val, ok := w.lruMap.Get(id); ok {
225-
w.metrics.CacheHit.Inc()
226-
return val.(Item), nil
227-
}
228-
229-
w.lruMap.Add(id, item)
230-
w.metrics.CacheMiss.Inc()
231-
232-
// It fixes cold start issue in the AutoRefreshCache by adding the item to the workqueue when it is created.
233-
// This way, the item will be processed without waiting for the next sync cycle (30s by default).
234-
batch := make([]ItemWrapper, 0, 1)
235-
batch = append(batch, itemWrapper{id: id, item: item})
236-
w.workqueue.AddRateLimited(&batch)
237-
w.processing.Store(id, w.clock.Now())
238-
return item, nil
239-
}
240-
241-
// DeleteDelayed queues an item for deletion. It Will get deleted as part of the next Sync cycle. Until the next sync
// cycle runs, Get and GetOrCreate will continue to return the Item in its previous state.
func (w *autoRefresh) DeleteDelayed(id ItemID) error {
	w.toDelete.Insert(id)
	// Always succeeds; the error return exists to satisfy the AutoRefresh interface.
	return nil
}
247-
248-
// This function is called internally by its own timer. Roughly, it will list keys, create batches of keys based on
249-
// createBatchesCb and, enqueue all the batches into the workqueue.
250-
func (w *autoRefresh) enqueueBatches(ctx context.Context) error {
251-
keys := w.lruMap.Keys()
252-
w.metrics.Size.Set(float64(len(keys)))
253-
254-
snapshot := make([]ItemWrapper, 0, len(keys))
255-
for _, k := range keys {
256-
if w.toDelete.Contains(k) {
257-
w.Delete(k)
258-
continue
259-
}
260-
// If not ok, it means evicted between the item was evicted between getting the keys and this update loop
261-
// which is fine, we can just ignore.
262-
if value, ok := w.lruMap.Peek(k); ok {
263-
if item, ok := value.(Item); !ok || (ok && !item.IsTerminal() && !w.inProcessing(k)) {
264-
snapshot = append(snapshot, itemWrapper{
265-
id: k.(ItemID),
266-
item: value.(Item),
267-
})
268-
}
269-
}
270-
}
271-
272-
batches, err := w.createBatchesCb(ctx, snapshot)
273-
if err != nil {
274-
return err
275-
}
276-
277-
for _, batch := range batches {
278-
b := batch
279-
w.workqueue.AddRateLimited(&b)
280-
for i := 1; i < len(b); i++ {
281-
w.processing.Store(b[i].GetID(), w.clock.Now())
282-
}
283-
}
284-
285-
return nil
286-
}
287-
288-
// There are w.parallelism instances of this function running all the time, each one will:
// - Retrieve an item from the workqueue
// - For each batch of the keys, call syncCb, which tells us if the items have been updated
// -- If any has, then overwrite the item in the cache.
//
// What happens when the number of things that a user is trying to keep track of exceeds the size
// of the cache? Trivial case where the cache is size 1 and we're trying to keep track of two things.
// * Plugin asks for update on item 1 - cache evicts item 2, stores 1 and returns it unchanged
// * Plugin asks for update on item 2 - cache evicts item 1, stores 2 and returns it unchanged
// * Sync loop updates item 2, repeat
func (w *autoRefresh) sync(ctx context.Context) (err error) {
	// Convert a worker panic into a logged error on the named return instead
	// of crashing the process. recover only works directly inside a deferred
	// function, in the same goroutine.
	defer func() {
		var isErr bool
		rVal := recover()
		if rVal == nil {
			return
		}

		if err, isErr = rVal.(error); isErr {
			err = fmt.Errorf("worker panic'd and is shutting down. Error: %w with Stack: %v", err, string(debug.Stack()))
		} else {
			err = fmt.Errorf("worker panic'd and is shutting down. Panic value: %v with Stack: %v", rVal, string(debug.Stack()))
		}

		logger.Error(ctx, err)
	}()

	for {
		select {
		case <-ctx.Done():
			return nil
		default:
			// Get blocks until a batch is available or the queue shuts down.
			batch, shutdown := w.workqueue.Get()
			if shutdown {
				logger.Debugf(ctx, "Shutting down worker")
				return nil
			}
			// Since we create batches every time we sync, we will just remove the item from the queue here
			// regardless of whether it succeeded the sync or not.
			w.workqueue.Forget(batch)
			w.workqueue.Done(batch)

			// Filter the batch down to items that still exist in the cache and
			// are not terminal, clearing each item's in-flight marker as we go.
			newBatch := make(Batch, 0, len(*batch.(*Batch)))
			for _, b := range *batch.(*Batch) {
				itemID := b.GetID()
				w.processing.Delete(itemID)
				item, ok := w.lruMap.Get(itemID)
				if !ok {
					logger.Debugf(ctx, "item with id [%v] not found in cache", itemID)
					continue
				}
				if item.(Item).IsTerminal() {
					logger.Debugf(ctx, "item with id [%v] is terminal", itemID)
					continue
				}
				newBatch = append(newBatch, b)
			}
			if len(newBatch) == 0 {
				continue
			}

			t := w.metrics.SyncLatency.Start()
			updatedBatch, err := w.syncCb(ctx, newBatch)

			// A failed sync is logged and counted, then the worker moves on;
			// the next enqueue cycle will retry the items.
			if err != nil {
				w.metrics.SyncErrors.Inc()
				logger.Errorf(ctx, "failed to get latest copy of a batch. Error: %v", err)
				t.Stop()
				continue
			}

			for _, item := range updatedBatch {
				if item.Action == Update {
					// Updates an existing item.
					w.Update(item.ID, item.Item)
				}
			}

			// Apply any deletions queued via DeleteDelayed.
			w.toDelete.Range(func(key interface{}) bool {
				w.Delete(key)
				return true
			})

			t.Stop()
		}
	}
}
375-
376-
// Checks if the item is currently being processed and returns false if the item has been in processing for too long
377-
func (w *autoRefresh) inProcessing(key interface{}) bool {
378-
item, found := w.processing.Load(key)
379-
if found {
380-
// handle potential race conditions where the item is in processing but not in the workqueue
381-
if timeItem, ok := item.(time.Time); ok && w.clock.Since(timeItem) > (w.syncPeriod*5) {
382-
w.processing.Delete(key)
383-
return false
384-
}
385-
return true
386-
}
387-
return false
388-
}
389-
390-
// Instantiates a new AutoRefresh Cache that syncs items in batches.
391-
func NewAutoRefreshBatchedCache(name string, createBatches CreateBatchesFunc, syncCb SyncFunc, syncRateLimiter workqueue.RateLimiter,
392-
resyncPeriod time.Duration, parallelism, size uint, scope promutils.Scope) (AutoRefresh, error) {
393-
return newAutoRefreshBatchedCacheWithClock(name, createBatches, syncCb, syncRateLimiter, resyncPeriod, parallelism, size, scope, clock.RealClock{})
394-
}
395-
396-
func newAutoRefreshBatchedCacheWithClock(name string, createBatches CreateBatchesFunc, syncCb SyncFunc, syncRateLimiter workqueue.RateLimiter,
397-
resyncPeriod time.Duration, parallelism, size uint, scope promutils.Scope, clock clock.WithTicker) (AutoRefresh, error) {
398-
399-
metrics := newMetrics(scope)
400-
// #nosec G115
401-
lruCache, err := lru.NewWithEvict(int(size), getEvictionFunction(metrics.Evictions))
402-
if err != nil {
403-
return nil, err
404-
}
405-
406-
cache := &autoRefresh{
407-
name: name,
408-
metrics: metrics,
409-
parallelism: parallelism,
410-
createBatchesCb: createBatches,
411-
syncCb: syncCb,
412-
lruMap: lruCache,
413-
processing: &sync.Map{},
414-
toDelete: newSyncSet(),
415-
syncPeriod: resyncPeriod,
416-
workqueue: workqueue.NewRateLimitingQueueWithConfig(syncRateLimiter, workqueue.RateLimitingQueueConfig{
417-
Name: scope.CurrentScope(),
418-
Clock: clock,
419-
}),
420-
clock: clock,
421-
}
422-
423-
return cache, nil
424-
}
425-
426-
// Instantiates a new AutoRefresh Cache that syncs items periodically.
427-
func NewAutoRefreshCache(name string, syncCb SyncFunc, syncRateLimiter workqueue.RateLimiter, resyncPeriod time.Duration,
428-
parallelism, size uint, scope promutils.Scope) (AutoRefresh, error) {
429-
430-
return NewAutoRefreshBatchedCache(name, SingleItemBatches, syncCb, syncRateLimiter, resyncPeriod, parallelism, size, scope)
431-
}
432-
433-
func newAutoRefreshCacheWithClock(name string, syncCb SyncFunc, syncRateLimiter workqueue.RateLimiter, resyncPeriod time.Duration,
434-
parallelism, size uint, scope promutils.Scope, clock clock.WithTicker) (AutoRefresh, error) {
435-
return newAutoRefreshBatchedCacheWithClock(name, SingleItemBatches, syncCb, syncRateLimiter, resyncPeriod, parallelism, size, scope, clock)
436-
}

flytestdlib/cache/auto_refresh_example_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
type ExampleItemStatus string
1616

1717
const (
18-
ExampleStatusNotStarted ExampleItemStatus = "Not-started"
18+
ExampleStatusNotStarted ExampleItemStatus = "Not-enqueueLoopRunning"
1919
ExampleStatusStarted ExampleItemStatus = "Started"
2020
ExampleStatusSucceeded ExampleItemStatus = "Completed"
2121
)

0 commit comments

Comments
 (0)