Skip to content

Commit 7775a41

Browse files
feat(tools): Querytee Goldfish (#17959)
Signed-off-by: Jordan Rushing <[email protected]>
1 parent ecdd613 commit 7775a41

23 files changed

+3725
-23
lines changed

cmd/querytee/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,13 +67,13 @@ func lokiReadRoutes(cfg Config) []querytee.Route {
6767
})
6868

6969
return []querytee.Route{
70-
{Path: "/loki/api/v1/query_range", RouteName: "api_v1_query_range", Methods: []string{"GET"}, ResponseComparator: samplesComparator},
71-
{Path: "/loki/api/v1/query", RouteName: "api_v1_query", Methods: []string{"GET"}, ResponseComparator: samplesComparator},
70+
{Path: "/loki/api/v1/query_range", RouteName: "api_v1_query_range", Methods: []string{"GET", "POST"}, ResponseComparator: samplesComparator},
71+
{Path: "/loki/api/v1/query", RouteName: "api_v1_query", Methods: []string{"GET", "POST"}, ResponseComparator: samplesComparator},
7272
{Path: "/loki/api/v1/label", RouteName: "api_v1_label", Methods: []string{"GET"}, ResponseComparator: nil},
7373
{Path: "/loki/api/v1/labels", RouteName: "api_v1_labels", Methods: []string{"GET"}, ResponseComparator: nil},
7474
{Path: "/loki/api/v1/label/{name}/values", RouteName: "api_v1_label_name_values", Methods: []string{"GET"}, ResponseComparator: nil},
7575
{Path: "/loki/api/v1/series", RouteName: "api_v1_series", Methods: []string{"GET"}, ResponseComparator: nil},
76-
{Path: "/api/prom/query", RouteName: "api_prom_query", Methods: []string{"GET"}, ResponseComparator: samplesComparator},
76+
{Path: "/api/prom/query", RouteName: "api_prom_query", Methods: []string{"GET", "POST"}, ResponseComparator: samplesComparator},
7777
{Path: "/api/prom/label", RouteName: "api_prom_label", Methods: []string{"GET"}, ResponseComparator: nil},
7878
{Path: "/api/prom/label/{name}/values", RouteName: "api_prom_label_name_values", Methods: []string{"GET"}, ResponseComparator: nil},
7979
{Path: "/api/prom/series", RouteName: "api_prom_series", Methods: []string{"GET"}, ResponseComparator: nil},

tools/querytee/goldfish/README.md

Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
# Goldfish - Query Comparison for Loki QueryTee
2+
3+
**⚠️ EXPERIMENTAL**: Goldfish is an experimental feature and its API/configuration may change in future releases.
4+
5+
Goldfish is a feature within QueryTee that enables sampling and comparison of query responses between multiple Loki cells. It helps identify discrepancies and performance differences between cells during migrations or when running multiple Loki deployments.
6+
7+
## Features
8+
9+
- **Tenant-based Sampling**: Configure sampling rates per tenant or use a default rate
10+
- **Privacy-Compliant Comparison**: Hash-based comparison without storing sensitive data:
11+
- Response integrity verification using fnv32 content hashes
12+
- Performance statistics comparison and analysis
13+
- **Performance Analysis**: Rich performance metrics tracking:
14+
- Execution time, queue time, processing rates
15+
- Bytes/lines processed comparison
16+
- Query complexity metrics (splits, shards)
17+
- Performance variance detection and reporting
18+
- **Query Engine Version Tracking**: Tracks which queries used the new experimental query engine vs the old engine
19+
- **Persistent Storage**: MySQL storage via Google Cloud SQL Proxy or Amazon RDS for storing query samples and comparison results
20+
21+
## Configuration
22+
23+
Goldfish is configured through command-line flags:
24+
25+
```bash
26+
# Enable Goldfish
27+
-goldfish.enabled=true
28+
29+
# Sampling configuration
30+
-goldfish.sampling.default-rate=0.1 # Sample 10% of queries by default
31+
-goldfish.sampling.tenant-rules="tenant1:0.5,tenant2:1.0" # Tenant-specific rates
32+
33+
# Storage configuration (optional - defaults to no-op if not specified)
34+
35+
# Option 1: CloudSQL (MySQL) configuration
36+
-goldfish.storage.type=cloudsql
37+
-goldfish.storage.cloudsql.host=cloudsql-proxy # CloudSQL proxy host
38+
-goldfish.storage.cloudsql.port=3306 # MySQL port (default: 3306)
39+
-goldfish.storage.cloudsql.database=goldfish
40+
-goldfish.storage.cloudsql.user=goldfish-user
41+
42+
# Option 2: RDS (MySQL) configuration
43+
-goldfish.storage.type=rds
44+
-goldfish.storage.rds.endpoint=mydb.123456789012.us-east-1.rds.amazonaws.com:3306
45+
-goldfish.storage.rds.database=goldfish
46+
-goldfish.storage.rds.user=goldfish-user
47+
48+
# The password must be provided via the GOLDFISH_DB_PASSWORD environment variable (for both CloudSQL and RDS)
49+
export GOLDFISH_DB_PASSWORD=your-password
50+
51+
# Connection pool settings (apply to both CloudSQL and RDS)
52+
-goldfish.storage.max-connections=10 # Maximum database connections
53+
-goldfish.storage.max-idle-time=300 # Connection idle timeout (seconds)
54+
55+
# Performance comparison settings
56+
-goldfish.performance-tolerance=0.1 # 10% tolerance for execution time variance
57+
58+
# Or run without storage (sampling and comparison only, no persistence)
59+
# Simply omit the storage configuration
60+
```
61+
62+
## Architecture
63+
64+
```
65+
┌─────────────┐ ┌─────────────┐
66+
│ Client │────▶│ QueryTee │ ◄── Existing functionality unchanged
67+
└─────────────┘ └──────┬──────┘
68+
69+
┌──────┴──────┐ ◄── Optional Goldfish integration
70+
│ Goldfish │ (only when enabled)
71+
│ Manager │
72+
└──────┬──────┘
73+
74+
┌────────────┼────────────┐
75+
▼ ▼ ▼
76+
┌──────────┐ ┌──────────┐ ┌──────────┐
77+
│ Sampler │ │Hash-based│ │ Storage │
78+
│ │ │Comparator│ │ │
79+
└──────────┘ └──────────┘ └──────────┘
80+
```
81+
82+
## Database Schema
83+
84+
Goldfish uses two main tables:
85+
86+
### sampled_queries
87+
88+
Stores query metadata and performance statistics (no sensitive data):
89+
90+
- correlation_id (PRIMARY KEY)
91+
- tenant_id, query, query_type
92+
- start_time, end_time, step_duration
93+
- Performance statistics for both cells:
94+
- exec_time_ms, queue_time_ms
95+
- bytes_processed, lines_processed
96+
- bytes_per_second, lines_per_second
97+
- entries_returned, splits, shards
98+
- Response metadata without content:
99+
- response_hash (for integrity verification)
100+
- response_size, status_code
101+
- Query engine version tracking:
102+
- cell_a_used_new_engine (BOOLEAN)
103+
- cell_b_used_new_engine (BOOLEAN)
104+
- sampled_at
105+
106+
### comparison_outcomes
107+
108+
Stores the results of comparing responses:
109+
110+
- correlation_id (PRIMARY KEY, FOREIGN KEY)
111+
- comparison_status (match, mismatch, error, partial)
112+
- difference_details (JSON)
113+
- performance_metrics (JSON)
114+
- compared_at
115+
116+
## Comparison Logic
117+
118+
Goldfish uses a simplified, privacy-focused comparison approach:
119+
120+
1. **Content Integrity**: Response content is hashed (fnv32) for integrity verification
121+
- Matching hashes = identical content = **MATCH**
122+
- Different hashes = different content = **MISMATCH**
123+
124+
2. **Performance Analysis**: Execution statistics are compared for optimization insights:
125+
- Execution time variance (with configurable tolerance for normal variation)
126+
- Bytes/lines processed (expected to be identical for the same query)
127+
- Query complexity differences (splits, shards)
128+
129+
3. **Status Code Comparison**:
130+
- Different status codes = **MISMATCH**
131+
- Identical non-200 status codes from both cells = **MATCH** (both failed in the same way)
132+
133+
4. **Query Engine Version Detection**:
134+
- Goldfish detects when queries use the new experimental query engine by parsing warnings in the response
135+
- When Loki includes the warning "Query was executed using the new experimental query engine and dataobj storage.", Goldfish tracks this in the database
136+
- This helps identify which queries are using the new vs old engine during migration
137+
138+
**Important**: Performance differences do NOT affect match status. If content hashes match, queries are considered equivalent regardless of execution time differences.
139+
140+
## Usage
141+
142+
Once configured, Goldfish automatically:
143+
144+
1. Samples queries based on tenant configuration
145+
2. Captures responses from both Loki cells
146+
3. Extracts performance statistics and computes content hashes
147+
4. Compares hashes and performance metrics
148+
5. Stores results in the configured database
149+
150+
Query the database to analyze differences:
151+
152+
```sql
153+
-- Find all mismatches for a tenant
154+
SELECT * FROM comparison_outcomes co
155+
JOIN sampled_queries sq ON co.correlation_id = sq.correlation_id
156+
WHERE sq.tenant_id = 'tenant1'
157+
AND co.comparison_status = 'mismatch';
158+
159+
-- Performance comparison
160+
SELECT
161+
sq.tenant_id,
162+
AVG(CAST(co.performance_metrics->>'$.QueryTimeRatio' AS DECIMAL(20, 6))) as avg_time_ratio,
163+
COUNT(*) as query_count
164+
FROM comparison_outcomes co
165+
JOIN sampled_queries sq ON co.correlation_id = sq.correlation_id
166+
GROUP BY sq.tenant_id;
167+
168+
-- Query engine version analysis
169+
SELECT
170+
sq.tenant_id,
171+
SUM(CASE WHEN sq.cell_a_used_new_engine THEN 1 ELSE 0 END) as cell_a_new_engine_count,
172+
SUM(CASE WHEN sq.cell_b_used_new_engine THEN 1 ELSE 0 END) as cell_b_new_engine_count,
173+
COUNT(*) as total_queries
174+
FROM sampled_queries sq
175+
GROUP BY sq.tenant_id;
176+
177+
-- Find queries where cells used different engines
178+
SELECT * FROM sampled_queries
179+
WHERE cell_a_used_new_engine != cell_b_used_new_engine;
180+
```
181+
182+
## Storage Configuration
183+
184+
Goldfish supports MySQL storage via Google Cloud SQL Proxy or Amazon RDS. The storage is optional - if not configured, Goldfish will perform sampling and comparison but won't persist results.
185+
186+
### Setting up CloudSQL
187+
188+
1. Ensure your CloudSQL proxy is running and accessible
189+
2. Create a MySQL database for Goldfish
190+
3. Configure the connection parameters via flags
191+
4. Set the database password using the `GOLDFISH_DB_PASSWORD` environment variable
192+
193+
### Setting up RDS
194+
195+
1. Create an RDS MySQL instance
196+
2. Ensure QueryTee (the process that runs Goldfish) can reach the RDS endpoint
197+
3. Create a MySQL database for Goldfish
198+
4. Configure the connection parameters via flags
199+
5. Set the database password using the `GOLDFISH_DB_PASSWORD` environment variable
200+
201+
The schema will be automatically created on first run for both CloudSQL and RDS.
202+
203+
## Testing
204+
205+
The Goldfish implementation includes comprehensive tests that verify:
206+
207+
- Hash-based comparison logic
208+
- Performance statistics analysis
209+
- Status code handling
210+
- SQL injection protection with parameterized queries
211+
- Configurable performance tolerance
212+
- Backward compatibility with existing QueryTee functionality
213+
214+
Run tests with:
215+
216+
```bash
217+
# Test only Goldfish functionality
218+
go test ./tools/querytee/goldfish/...
219+
220+
# Test QueryTee including Goldfish integration
221+
go test ./tools/querytee/...
222+
223+
# Build the QueryTee binary with Goldfish support
224+
make loki-querytee
225+
```
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
package goldfish
2+
3+
import (
4+
"net/http"
5+
"time"
6+
)
7+
8+
// CompareResponses compares performance statistics and hashes from QuerySample
9+
func CompareResponses(sample *QuerySample, performanceTolerance float64) ComparisonResult {
10+
result := ComparisonResult{
11+
CorrelationID: sample.CorrelationID,
12+
DifferenceDetails: make(map[string]any),
13+
PerformanceMetrics: PerformanceMetrics{
14+
CellAQueryTime: time.Duration(sample.CellAStats.ExecTimeMs) * time.Millisecond,
15+
CellBQueryTime: time.Duration(sample.CellBStats.ExecTimeMs) * time.Millisecond,
16+
CellABytesTotal: sample.CellAResponseSize,
17+
CellBBytesTotal: sample.CellBResponseSize,
18+
},
19+
ComparedAt: time.Now(),
20+
}
21+
22+
// Calculate ratios
23+
if sample.CellAStats.ExecTimeMs > 0 {
24+
result.PerformanceMetrics.QueryTimeRatio = float64(sample.CellBStats.ExecTimeMs) / float64(sample.CellAStats.ExecTimeMs)
25+
}
26+
if sample.CellAResponseSize > 0 {
27+
result.PerformanceMetrics.BytesRatio = float64(sample.CellBResponseSize) / float64(sample.CellAResponseSize)
28+
}
29+
30+
// Compare responses using clear matching rules
31+
switch {
32+
case sample.CellAStatusCode != sample.CellBStatusCode:
33+
// Different status codes always indicate a mismatch
34+
result.ComparisonStatus = ComparisonStatusMismatch
35+
result.DifferenceDetails["status_code"] = map[string]any{
36+
"cell_a": sample.CellAStatusCode,
37+
"cell_b": sample.CellBStatusCode,
38+
}
39+
return result
40+
41+
case sample.CellAStatusCode == sample.CellBStatusCode && sample.CellAStatusCode != http.StatusOK:
42+
// Same non-200 status codes indicate matching error behavior
43+
// Both services are failing in the same way (e.g., both returning 404 for not found)
44+
result.ComparisonStatus = ComparisonStatusMatch
45+
return result
46+
47+
case sample.CellAResponseHash != sample.CellBResponseHash:
48+
// Both returned 200 but with different content
49+
result.ComparisonStatus = ComparisonStatusMismatch
50+
result.DifferenceDetails["content_hash"] = map[string]any{
51+
"cell_a": sample.CellAResponseHash,
52+
"cell_b": sample.CellBResponseHash,
53+
}
54+
return result
55+
56+
default:
57+
// Both returned 200 with identical content
58+
result.ComparisonStatus = ComparisonStatusMatch
59+
}
60+
61+
// Still compare performance statistics for analysis, but don't change match status
62+
compareQueryStats(sample.CellAStats, sample.CellBStats, &result, performanceTolerance)
63+
64+
return result
65+
}
66+
67+
// compareQueryStats compares performance statistics between two queries
68+
func compareQueryStats(statsA, statsB QueryStats, result *ComparisonResult, tolerance float64) {
69+
70+
// Compare execution times (record variance for analysis)
71+
if statsA.ExecTimeMs > 0 && statsB.ExecTimeMs > 0 {
72+
ratio := float64(statsB.ExecTimeMs) / float64(statsA.ExecTimeMs)
73+
if ratio > (1+tolerance) || ratio < (1-tolerance) {
74+
result.DifferenceDetails["exec_time_variance"] = map[string]any{
75+
"cell_a_ms": statsA.ExecTimeMs,
76+
"cell_b_ms": statsB.ExecTimeMs,
77+
"ratio": ratio,
78+
}
79+
}
80+
}
81+
82+
// Compare bytes processed (should be exactly the same for same query)
83+
if statsA.BytesProcessed != statsB.BytesProcessed {
84+
result.DifferenceDetails["bytes_processed"] = map[string]any{
85+
"cell_a": statsA.BytesProcessed,
86+
"cell_b": statsB.BytesProcessed,
87+
}
88+
}
89+
90+
// Compare lines processed (should be exactly the same for same query)
91+
if statsA.LinesProcessed != statsB.LinesProcessed {
92+
result.DifferenceDetails["lines_processed"] = map[string]any{
93+
"cell_a": statsA.LinesProcessed,
94+
"cell_b": statsB.LinesProcessed,
95+
}
96+
}
97+
98+
// Compare total entries returned (should be exactly the same for same query)
99+
if statsA.TotalEntriesReturned != statsB.TotalEntriesReturned {
100+
result.DifferenceDetails["entries_returned"] = map[string]any{
101+
"cell_a": statsA.TotalEntriesReturned,
102+
"cell_b": statsB.TotalEntriesReturned,
103+
}
104+
}
105+
}

0 commit comments

Comments
 (0)