Skip to content

Commit 1e6b9e2

Browse files
authored
fix(uptime): Emit arroyo.consumer.latency when using thread-queue-parallel mode (#95919)
We need to emit this standard metric since we have our own custom offset committing strategy. Without this, a bunch of datadog alerts fire. <!-- Describe your PR here. -->
1 parent 01bf898 commit 1e6b9e2

File tree

6 files changed

+108
-63
lines changed

6 files changed

+108
-63
lines changed

src/sentry/conf/types/kafka_definition.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ class ConsumerDefinition(TypedDict, total=False):
126126
# Hardcoded additional kwargs for strategy_factory
127127
static_args: Mapping[str, Any]
128128

129+
# Pass the consumer group ID to the strategy factory as 'consumer_group' kwarg
130+
pass_consumer_group: bool
131+
129132
require_synchronization: bool
130133
synchronize_commit_group_default: str
131134
synchronize_commit_log_topic_default: str

src/sentry/consumers/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ def ingest_transactions_options() -> list[click.Option]:
271271
"topic": Topic.UPTIME_RESULTS,
272272
"strategy_factory": "sentry.uptime.consumers.results_consumer.UptimeResultsStrategyFactory",
273273
"click_options": uptime_options(),
274+
"pass_consumer_group": True,
274275
},
275276
"billing-metrics-consumer": {
276277
"topic": Topic.SNUBA_GENERIC_METRICS,
@@ -511,8 +512,14 @@ def get_stream_processor(
511512
name=consumer_name, params=list(consumer_definition.get("click_options") or ())
512513
)
513514
cmd_context = cmd.make_context(consumer_name, list(consumer_args))
515+
extra_kwargs = {}
516+
if consumer_definition.get("pass_consumer_group", False):
517+
extra_kwargs["consumer_group"] = group_id
514518
strategy_factory = cmd_context.invoke(
515-
strategy_factory_cls, **cmd_context.params, **consumer_definition.get("static_args") or {}
519+
strategy_factory_cls,
520+
**cmd_context.params,
521+
**consumer_definition.get("static_args") or {},
522+
**extra_kwargs,
516523
)
517524

518525
def build_consumer_config(group_id: str):

src/sentry/remote_subscriptions/consumers/queue_consumer.py

Lines changed: 55 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from collections import defaultdict
88
from collections.abc import Callable
99
from dataclasses import dataclass
10+
from datetime import datetime
1011
from typing import Any, Generic, TypeVar
1112

1213
import sentry_sdk
@@ -30,25 +31,25 @@ class UnassignedPartitionError(Exception):
3031

3132
@dataclass
3233
class WorkItem(Generic[T]):
33-
"""Work item that includes the original message for offset tracking."""
34+
"""Work item for processing with offset tracking information."""
3435

3536
partition: Partition
3637
offset: int
38+
timestamp: datetime
3739
result: T
38-
message: Message[KafkaPayload | FilteredPayload]
3940

4041

4142
class OffsetTracker:
4243
"""
4344
Tracks outstanding offsets and determines which offsets are safe to commit.
4445
45-
- Tracks offsets per partition
46+
- Tracks offsets per partition with their timestamps
4647
- Only commits offsets when all prior offsets are processed
4748
- Thread-safe for concurrent access with per-partition locks
4849
"""
4950

5051
def __init__(self) -> None:
51-
self.all_offsets: dict[Partition, set[int]] = defaultdict(set)
52+
self.all_offsets: dict[Partition, dict[int, datetime]] = defaultdict(dict)
5253
self.outstanding: dict[Partition, set[int]] = defaultdict(set)
5354
self.last_committed: dict[Partition, int] = {}
5455
self.partition_locks: dict[Partition, threading.Lock] = {}
@@ -57,15 +58,15 @@ def _get_partition_lock(self, partition: Partition) -> threading.Lock:
5758
"""Get the lock for a partition."""
5859
return self.partition_locks[partition]
5960

60-
def add_offset(self, partition: Partition, offset: int) -> None:
61+
def add_offset(self, partition: Partition, offset: int, timestamp: datetime) -> None:
6162
"""Record that we've started processing an offset."""
6263
if partition not in self.partition_locks:
6364
raise UnassignedPartitionError(
6465
f"Partition {partition} is not assigned to this consumer"
6566
)
6667

6768
with self._get_partition_lock(partition):
68-
self.all_offsets[partition].add(offset)
69+
self.all_offsets[partition][offset] = timestamp
6970
self.outstanding[partition].add(offset)
7071

7172
def complete_offset(self, partition: Partition, offset: int) -> None:
@@ -76,11 +77,12 @@ def complete_offset(self, partition: Partition, offset: int) -> None:
7677
with self._get_partition_lock(partition):
7778
self.outstanding[partition].discard(offset)
7879

79-
def get_committable_offsets(self) -> dict[Partition, int]:
80+
def get_committable_offsets(self) -> dict[Partition, tuple[int, datetime]]:
8081
"""
8182
Get the highest offset per partition that can be safely committed.
8283
83-
For each partition, finds the highest contiguous offset that has been processed.
84+
For each partition, finds the highest contiguous offset that has been processed,
85+
and returns the timestamp of the oldest offset in the contiguous set.
8486
"""
8587
committable = {}
8688
for partition in list(self.all_offsets.keys()):
@@ -92,20 +94,26 @@ def get_committable_offsets(self) -> dict[Partition, int]:
9294
outstanding = self.outstanding[partition]
9395
last_committed = self.last_committed.get(partition, -1)
9496

95-
min_offset = min(all_offsets)
96-
max_offset = max(all_offsets)
97+
offset_keys = set(all_offsets.keys())
98+
min_offset = min(offset_keys)
99+
max_offset = max(offset_keys)
97100

98101
start = max(last_committed + 1, min_offset)
99102

100103
highest_committable = last_committed
104+
oldest_timestamp = None
105+
101106
for offset in range(start, max_offset + 1):
102107
if offset in all_offsets and offset not in outstanding:
103108
highest_committable = offset
109+
timestamp = all_offsets[offset]
110+
if oldest_timestamp is None or timestamp < oldest_timestamp:
111+
oldest_timestamp = timestamp
104112
else:
105113
break
106114

107-
if highest_committable > last_committed:
108-
committable[partition] = highest_committable
115+
if highest_committable > last_committed and oldest_timestamp:
116+
committable[partition] = (highest_committable, oldest_timestamp)
109117

110118
return committable
111119

@@ -114,7 +122,9 @@ def mark_committed(self, partition: Partition, offset: int) -> None:
114122
with self._get_partition_lock(partition):
115123
self.last_committed[partition] = offset
116124
# Remove all offsets <= committed offset
117-
self.all_offsets[partition] = {o for o in self.all_offsets[partition] if o > offset}
125+
self.all_offsets[partition] = {
126+
k: v for k, v in self.all_offsets[partition].items() if k > offset
127+
}
118128

119129
def clear(self) -> None:
120130
"""Clear all offset tracking state."""
@@ -193,13 +203,15 @@ def __init__(
193203
self,
194204
result_processor: Callable[[str, T], None],
195205
identifier: str,
206+
consumer_group: str,
196207
num_queues: int = 20,
197208
commit_interval: float = 1.0,
198209
) -> None:
199210
self.result_processor = result_processor
200211
self.identifier = identifier
201212
self.num_queues = num_queues
202213
self.commit_interval = commit_interval
214+
self.consumer_group = consumer_group
203215
self.offset_tracker = OffsetTracker()
204216
self.queues: list[queue.Queue[WorkItem[T]]] = []
205217
self.workers: list[OrderedQueueWorker[T]] = []
@@ -239,9 +251,21 @@ def _commit_loop(self) -> None:
239251
len(committable),
240252
tags={"identifier": self.identifier},
241253
)
242-
243-
self.commit_function(committable)
244-
for partition, offset in committable.items():
254+
for partition, (offset, oldest_timestamp) in committable.items():
255+
metrics.timing(
256+
"arroyo.consumer.latency",
257+
time.time() - oldest_timestamp.timestamp(),
258+
tags={
259+
"partition": partition.index,
260+
"kafka_topic": partition.topic.name,
261+
"consumer_group": self.consumer_group,
262+
},
263+
)
264+
265+
self.commit_function(
266+
{partition: offset for partition, (offset, _) in committable.items()}
267+
)
268+
for partition, (offset, _) in committable.items():
245269
self.offset_tracker.mark_committed(partition, offset)
246270
except Exception:
247271
logger.exception("Error in commit loop")
@@ -257,7 +281,9 @@ def submit(self, group_key: str, work_item: WorkItem[T]) -> None:
257281
Submit a work item to the appropriate queue.
258282
"""
259283
try:
260-
self.offset_tracker.add_offset(work_item.partition, work_item.offset)
284+
self.offset_tracker.add_offset(
285+
work_item.partition, work_item.offset, work_item.timestamp
286+
)
261287
except UnassignedPartitionError:
262288
logger.exception(
263289
"Received message for unassigned partition, skipping",
@@ -400,10 +426,11 @@ def submit(self, message: Message[KafkaPayload | FilteredPayload]) -> None:
400426
assert isinstance(message.value, BrokerValue)
401427
partition = message.value.partition
402428
offset = message.value.offset
429+
timestamp = message.value.timestamp
403430

404431
if result is None:
405432
try:
406-
self.queue_pool.offset_tracker.add_offset(partition, offset)
433+
self.queue_pool.offset_tracker.add_offset(partition, offset, timestamp)
407434
self.queue_pool.offset_tracker.complete_offset(partition, offset)
408435
except UnassignedPartitionError:
409436
pass
@@ -414,21 +441,24 @@ def submit(self, message: Message[KafkaPayload | FilteredPayload]) -> None:
414441
work_item = WorkItem(
415442
partition=partition,
416443
offset=offset,
444+
timestamp=timestamp,
417445
result=result,
418-
message=message,
419446
)
420447

421448
self.queue_pool.submit(group_key, work_item)
422449

423450
except Exception:
424451
logger.exception("Error submitting message to queue")
425452
if isinstance(message.value, BrokerValue):
426-
self.queue_pool.offset_tracker.add_offset(
427-
message.value.partition, message.value.offset
428-
)
429-
self.queue_pool.offset_tracker.complete_offset(
430-
message.value.partition, message.value.offset
431-
)
453+
try:
454+
self.queue_pool.offset_tracker.add_offset(
455+
message.value.partition, message.value.offset, message.value.timestamp
456+
)
457+
self.queue_pool.offset_tracker.complete_offset(
458+
message.value.partition, message.value.offset
459+
)
460+
except UnassignedPartitionError:
461+
pass
432462

433463
def poll(self) -> None:
434464
stats = self.queue_pool.get_stats()

src/sentry/remote_subscriptions/consumers/result_consumer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ class ResultsStrategyFactory(ProcessingStrategyFactory[KafkaPayload], Generic[T,
105105

106106
def __init__(
107107
self,
108+
consumer_group: str,
108109
mode: Literal["batched-parallel", "parallel", "serial", "thread-queue-parallel"] = "serial",
109110
max_batch_size: int | None = None,
110111
max_batch_time: int | None = None,
@@ -115,6 +116,7 @@ def __init__(
115116
commit_interval: float | None = None,
116117
) -> None:
117118
self.mode = mode
119+
self.consumer_group = consumer_group
118120
metric_tags = {"identifier": self.identifier, "mode": self.mode}
119121
self.result_processor = self.result_processor_cls()
120122
if mode == "batched-parallel":
@@ -134,6 +136,7 @@ def __init__(
134136
self.queue_pool = FixedQueuePool(
135137
result_processor=self.result_processor,
136138
identifier=self.identifier,
139+
consumer_group=consumer_group,
137140
num_queues=max_workers or 20,
138141
commit_interval=commit_interval or 1.0,
139142
)

0 commit comments

Comments
 (0)