From 1583fd4bcc315cf526df522860ec0bd9c0ad79eb Mon Sep 17 00:00:00 2001
From: samikshya-chand_data <samikshya.chand@databricks.com>
Date: Wed, 5 Nov 2025 21:30:16 +0000
Subject: [PATCH] [PECOBLR-1146] Implement feature flag cache with reference
 counting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements per-host feature flag caching with reference counting for telemetry control.

Key features:
- Per-host singleton cache to prevent rate limiting
- Reference counting tied to connection lifecycle
- 15-minute TTL with automatic refresh
- Thread-safe concurrent access using RWMutex
- HTTP integration with Databricks feature flag API
- Fallback to cached value on fetch errors

Comprehensive test coverage includes:
- Reference counting increment/decrement
- Cache expiration and refresh logic
- Concurrent access safety
- HTTP fetch success/failure scenarios
- Multiple host management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 telemetry/DESIGN.md           |   2 +-
 telemetry/featureflag.go      | 151 +++++++++++
 telemetry/featureflag_test.go | 491 ++++++++++++++++++++++++++++++++++
 3 files changed, 643 insertions(+), 1 deletion(-)
 create mode 100644 telemetry/featureflag.go
 create mode 100644 telemetry/featureflag_test.go

diff --git a/telemetry/DESIGN.md b/telemetry/DESIGN.md
index c239ea0..931ba17 100644
--- a/telemetry/DESIGN.md
+++ b/telemetry/DESIGN.md
@@ -1743,7 +1743,7 @@ func BenchmarkInterceptor_Disabled(b *testing.B) {
 - [x] Add unit tests for configuration and tags
 
 ### Phase 2: Per-Host Management
-- [ ] Implement `featureflag.go` with caching and reference counting
+- [x] Implement `featureflag.go` with caching and reference counting ✅ COMPLETED (PECOBLR-1146)
 - [ ] Implement `manager.go` for client management
 - [ ] Implement `circuitbreaker.go` with state machine
 - [ ] Add unit tests for all components
diff --git a/telemetry/featureflag.go b/telemetry/featureflag.go
new file mode 100644
index 0000000..f1c6f55
--- /dev/null
+++ b/telemetry/featureflag.go
@@ -0,0 +1,151 @@
+package telemetry
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+)
+
+// featureFlagCache manages feature flag state per host with reference counting.
+// This prevents rate limiting by caching feature flag responses.
+type featureFlagCache struct {
+	mu       sync.RWMutex
+	contexts map[string]*featureFlagContext
+}
+
+// featureFlagContext holds feature flag state and reference count for a host.
+type featureFlagContext struct {
+	enabled       *bool
+	lastFetched   time.Time
+	refCount      int
+	cacheDuration time.Duration
+}
+
+var (
+	flagCacheOnce     sync.Once
+	flagCacheInstance *featureFlagCache
+)
+
+// getFeatureFlagCache returns the singleton instance.
+func getFeatureFlagCache() *featureFlagCache {
+	flagCacheOnce.Do(func() {
+		flagCacheInstance = &featureFlagCache{
+			contexts: make(map[string]*featureFlagContext),
+		}
+	})
+	return flagCacheInstance
+}
+
+// getOrCreateContext gets or creates a feature flag context for the host.
+// Increments reference count.
+func (c *featureFlagCache) getOrCreateContext(host string) *featureFlagContext {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	ctx, exists := c.contexts[host]
+	if !exists {
+		ctx = &featureFlagContext{
+			cacheDuration: 15 * time.Minute,
+		}
+		c.contexts[host] = ctx
+	}
+	ctx.refCount++
+	return ctx
+}
+
+// releaseContext decrements reference count for the host.
+// Removes context when ref count reaches zero.
+func (c *featureFlagCache) releaseContext(host string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if ctx, exists := c.contexts[host]; exists {
+		ctx.refCount--
+		if ctx.refCount <= 0 {
+			delete(c.contexts, host)
+		}
+	}
+}
+
+// isTelemetryEnabled checks if telemetry is enabled for the host.
+// Uses cached value if available and not expired.
+func (c *featureFlagCache) isTelemetryEnabled(ctx context.Context, host string, httpClient *http.Client) (bool, error) {
+	c.mu.RLock()
+	flagCtx, exists := c.contexts[host]
+	c.mu.RUnlock()
+
+	if !exists {
+		return false, nil
+	}
+
+	// Check if cache is valid
+	if flagCtx.enabled != nil && time.Since(flagCtx.lastFetched) < flagCtx.cacheDuration {
+		return *flagCtx.enabled, nil
+	}
+
+	// Fetch fresh value
+	enabled, err := fetchFeatureFlag(ctx, host, httpClient)
+	if err != nil {
+		// Return cached value on error, or false if no cache
+		if flagCtx.enabled != nil {
+			return *flagCtx.enabled, nil
+		}
+		return false, err
+	}
+
+	// Update cache
+	c.mu.Lock()
+	flagCtx.enabled = &enabled
+	flagCtx.lastFetched = time.Now()
+	c.mu.Unlock()
+
+	return enabled, nil
+}
+
+// isExpired returns true if the cache has expired.
+func (c *featureFlagContext) isExpired() bool {
+	return c.enabled == nil || time.Since(c.lastFetched) > c.cacheDuration
+}
+
+// fetchFeatureFlag fetches the feature flag value from the Databricks API.
+func fetchFeatureFlag(ctx context.Context, host string, httpClient *http.Client) (bool, error) {
+	// Build endpoint URL, adding https:// if no scheme present
+	endpoint := host
+	if !strings.HasPrefix(host, "http://") && !strings.HasPrefix(host, "https://") {
+		endpoint = "https://" + host
+	}
+	endpoint = endpoint + "/api/2.0/feature-flags"
+
+	req, err := http.NewRequestWithContext(ctx, "GET", endpoint, nil)
+	if err != nil {
+		return false, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	// Add query parameters
+	q := req.URL.Query()
+	q.Add("flags", "databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc")
+	req.URL.RawQuery = q.Encode()
+
+	resp, err := httpClient.Do(req)
+	if err != nil {
+		return false, fmt.Errorf("failed to fetch feature flag: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		return false, fmt.Errorf("feature flag check failed with status: %d", resp.StatusCode)
+	}
+
+	var result struct {
+		Flags map[string]bool `json:"flags"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
+		return false, fmt.Errorf("failed to decode response: %w", err)
+	}
+
+	return result.Flags["databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc"], nil
+}
diff --git a/telemetry/featureflag_test.go b/telemetry/featureflag_test.go
new file mode 100644
index 0000000..8acab0a
--- /dev/null
+++ b/telemetry/featureflag_test.go
@@ -0,0 +1,491 @@
+package telemetry
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestGetFeatureFlagCache_Singleton(t *testing.T) {
+	cache1 := getFeatureFlagCache()
+	cache2 := getFeatureFlagCache()
+
+	if cache1 != cache2 {
+		t.Error("getFeatureFlagCache should return the same instance")
+	}
+}
+
+func TestGetOrCreateContext_CreatesNew(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	ctx := cache.getOrCreateContext("host1.databricks.com")
+
+	if ctx == nil {
+		t.Fatal("getOrCreateContext should return a context")
+	}
+
+	if ctx.refCount != 1 {
+		t.Errorf("expected refCount=1, got %d", ctx.refCount)
+	}
+
+	if ctx.cacheDuration != 15*time.Minute {
+		t.Errorf("expected cacheDuration=15m, got %v", ctx.cacheDuration)
+	}
+}
+
+func TestGetOrCreateContext_IncrementsRefCount(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	ctx1 := cache.getOrCreateContext("host1.databricks.com")
+	ctx2 := cache.getOrCreateContext("host1.databricks.com")
+
+	if ctx1 != ctx2 {
+		t.Error("should return same context for same host")
+	}
+
+	if ctx1.refCount != 2 {
+		t.Errorf("expected refCount=2, got %d", ctx1.refCount)
+	}
+}
+
+func TestReleaseContext_DecrementsRefCount(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	cache.getOrCreateContext("host1.databricks.com")
+	cache.getOrCreateContext("host1.databricks.com")
+
+	cache.releaseContext("host1.databricks.com")
+
+	ctx := cache.contexts["host1.databricks.com"]
+	if ctx == nil {
+		t.Fatal("context should still exist")
+	}
+
+	if ctx.refCount != 1 {
+		t.Errorf("expected refCount=1, got %d", ctx.refCount)
+	}
+}
+
+func TestReleaseContext_RemovesContextAtZero(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	cache.getOrCreateContext("host1.databricks.com")
+	cache.releaseContext("host1.databricks.com")
+
+	if _, exists := cache.contexts["host1.databricks.com"]; exists {
+		t.Error("context should be removed when refCount reaches zero")
+	}
+}
+
+func TestReleaseContext_NonExistentHost(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	// Should not panic
+	cache.releaseContext("nonexistent.host")
+}
+
+func TestIsTelemetryEnabled_NoContext(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	enabled, err := cache.isTelemetryEnabled(context.Background(), "host1.databricks.com", http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	if enabled {
+		t.Error("expected false when no context exists")
+	}
+}
+
+func TestIsTelemetryEnabled_CacheHit(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	// Create context with cached value
+	enabled := true
+	cache.contexts["host1.databricks.com"] = &featureFlagContext{
+		enabled:       &enabled,
+		lastFetched:   time.Now(),
+		refCount:      1,
+		cacheDuration: 15 * time.Minute,
+	}
+
+	result, err := cache.isTelemetryEnabled(context.Background(), "host1.databricks.com", http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	if !result {
+		t.Error("expected true from cache")
+	}
+}
+
+func TestIsTelemetryEnabled_CacheExpired(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		response := map[string]interface{}{
+			"flags": map[string]bool{
+				"databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc": true,
+			},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	// Create context with expired cache
+	enabled := false
+	cache.contexts[server.URL] = &featureFlagContext{
+		enabled:       &enabled,
+		lastFetched:   time.Now().Add(-20 * time.Minute), // Expired
+		refCount:      1,
+		cacheDuration: 15 * time.Minute,
+	}
+
+	result, err := cache.isTelemetryEnabled(context.Background(), server.URL, http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	if !result {
+		t.Error("expected true from fresh fetch")
+	}
+
+	// Verify cache was updated
+	ctx := cache.contexts[server.URL]
+	if ctx.enabled == nil || !*ctx.enabled {
+		t.Error("cache should be updated with fresh value")
+	}
+}
+
+func TestIsTelemetryEnabled_FetchError_FallbackToCache(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer server.Close()
+
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	// Create context with expired cache
+	enabled := true
+	cache.contexts[server.URL] = &featureFlagContext{
+		enabled:       &enabled,
+		lastFetched:   time.Now().Add(-20 * time.Minute), // Expired
+		refCount:      1,
+		cacheDuration: 15 * time.Minute,
+	}
+
+	result, err := cache.isTelemetryEnabled(context.Background(), server.URL, http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("should not return error when fallback to cache: %v", err)
+	}
+
+	if !result {
+		t.Error("expected true from cached fallback value")
+	}
+}
+
+func TestIsTelemetryEnabled_FetchError_NoCache(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer server.Close()
+
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	// Create context without cached value
+	cache.contexts[server.URL] = &featureFlagContext{
+		enabled:       nil,
+		refCount:      1,
+		cacheDuration: 15 * time.Minute,
+	}
+
+	result, err := cache.isTelemetryEnabled(context.Background(), server.URL, http.DefaultClient)
+
+	if err == nil {
+		t.Error("expected error when fetch fails and no cache")
+	}
+
+	if result {
+		t.Error("expected false when fetch fails and no cache")
+	}
+}
+
+func TestIsExpired_NotFetched(t *testing.T) {
+	ctx := &featureFlagContext{
+		enabled:       nil,
+		cacheDuration: 15 * time.Minute,
+	}
+
+	if !ctx.isExpired() {
+		t.Error("should be expired when not fetched")
+	}
+}
+
+func TestIsExpired_Fresh(t *testing.T) {
+	enabled := true
+	ctx := &featureFlagContext{
+		enabled:       &enabled,
+		lastFetched:   time.Now(),
+		cacheDuration: 15 * time.Minute,
+	}
+
+	if ctx.isExpired() {
+		t.Error("should not be expired when fresh")
+	}
+}
+
+func TestIsExpired_Expired(t *testing.T) {
+	enabled := true
+	ctx := &featureFlagContext{
+		enabled:       &enabled,
+		lastFetched:   time.Now().Add(-20 * time.Minute),
+		cacheDuration: 15 * time.Minute,
+	}
+
+	if !ctx.isExpired() {
+		t.Error("should be expired after TTL")
+	}
+}
+
+func TestFetchFeatureFlag_Success(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// Verify request
+		if r.Method != "GET" {
+			t.Errorf("expected GET, got %s", r.Method)
+		}
+
+		flags := r.URL.Query().Get("flags")
+		expectedFlag := "databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc"
+		if flags != expectedFlag {
+			t.Errorf("expected flag=%s, got %s", expectedFlag, flags)
+		}
+
+		// Send response
+		response := map[string]interface{}{
+			"flags": map[string]bool{
+				"databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc": true,
+			},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	enabled, err := fetchFeatureFlag(context.Background(), server.URL, http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	if !enabled {
+		t.Error("expected true from API response")
+	}
+}
+
+func TestFetchFeatureFlag_FalseValue(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		response := map[string]interface{}{
+			"flags": map[string]bool{
+				"databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc": false,
+			},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	enabled, err := fetchFeatureFlag(context.Background(), server.URL, http.DefaultClient)
+
+	if err != nil {
+		t.Errorf("unexpected error: %v", err)
+	}
+
+	if enabled {
+		t.Error("expected false from API response")
+	}
+}
+
+func TestFetchFeatureFlag_HTTPError(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+	}))
+	defer server.Close()
+
+	_, err := fetchFeatureFlag(context.Background(), server.URL, http.DefaultClient)
+
+	if err == nil {
+		t.Error("expected error on HTTP failure")
+	}
+}
+
+func TestFetchFeatureFlag_InvalidJSON(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Write([]byte("invalid json"))
+	}))
+	defer server.Close()
+
+	_, err := fetchFeatureFlag(context.Background(), server.URL, http.DefaultClient)
+
+	if err == nil {
+		t.Error("expected error on invalid JSON")
+	}
+}
+
+func TestConcurrentAccess(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	var wg sync.WaitGroup
+	numGoroutines := 100
+	host := "host1.databricks.com"
+
+	// Concurrent getOrCreateContext
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			cache.getOrCreateContext(host)
+		}()
+	}
+
+	wg.Wait()
+
+	ctx := cache.contexts[host]
+	if ctx.refCount != numGoroutines {
+		t.Errorf("expected refCount=%d, got %d", numGoroutines, ctx.refCount)
+	}
+
+	// Concurrent releaseContext
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			cache.releaseContext(host)
+		}()
+	}
+
+	wg.Wait()
+
+	if _, exists := cache.contexts[host]; exists {
+		t.Error("context should be removed after all releases")
+	}
+}
+
+func TestConcurrentFetch(t *testing.T) {
+	callCount := 0
+	var mu sync.Mutex
+
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		mu.Lock()
+		callCount++
+		mu.Unlock()
+
+		response := map[string]interface{}{
+			"flags": map[string]bool{
+				"databricks.partnerplatform.clientConfigsFeatureFlags.enableTelemetryForAdbc": true,
+			},
+		}
+		json.NewEncoder(w).Encode(response)
+	}))
+	defer server.Close()
+
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	cache.getOrCreateContext(server.URL)
+
+	var wg sync.WaitGroup
+	numGoroutines := 50
+
+	// Concurrent isTelemetryEnabled calls
+	for i := 0; i < numGoroutines; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			cache.isTelemetryEnabled(context.Background(), server.URL, http.DefaultClient)
+		}()
+	}
+
+	wg.Wait()
+
+	// Should significantly reduce HTTP calls due to caching
+	// Note: Initial concurrent calls may race before cache is populated,
+	// but subsequent calls should all use cache
+	mu.Lock()
+	defer mu.Unlock()
+	if callCount >= numGoroutines {
+		t.Errorf("no caching detected: %d HTTP calls for %d goroutines", callCount, numGoroutines)
+	}
+	// Expect significant reduction (< 50% of goroutines)
+	if callCount > numGoroutines/2 {
+		t.Logf("warning: more HTTP calls than expected: %d (caching could be more effective)", callCount)
+	}
+}
+
+func TestMultipleHosts(t *testing.T) {
+	cache := &featureFlagCache{
+		contexts: make(map[string]*featureFlagContext),
+	}
+
+	hosts := []string{"host1.databricks.com", "host2.databricks.com", "host3.databricks.com"}
+
+	// Create contexts for multiple hosts
+	for _, host := range hosts {
+		cache.getOrCreateContext(host)
+		cache.getOrCreateContext(host) // Increment to 2
+	}
+
+	// Verify all contexts exist
+	for _, host := range hosts {
+		ctx, exists := cache.contexts[host]
+		if !exists {
+			t.Errorf("context should exist for %s", host)
+		}
+		if ctx.refCount != 2 {
+			t.Errorf("expected refCount=2 for %s, got %d", host, ctx.refCount)
+		}
+	}
+
+	// Release one host completely
+	cache.releaseContext(hosts[0])
+	cache.releaseContext(hosts[0])
+
+	// Verify only first host is removed
+	if _, exists := cache.contexts[hosts[0]]; exists {
+		t.Errorf("context should be removed for %s", hosts[0])
+	}
+
+	for _, host := range hosts[1:] {
+		if _, exists := cache.contexts[host]; !exists {
+			t.Errorf("context should still exist for %s", host)
+		}
+	}
+}