-
Notifications
You must be signed in to change notification settings - Fork 1.5k
kafka_consumer: bound and refine estimated_consumer_lag #24167
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
5331ba9
a8d4125
04836ec
fefa2a2
ac84e55
17b30ca
70abe3b
686a373
6559b08
b9fde56
83c37e4
9f76e78
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Improve the accuracy of ``estimated_consumer_lag`` for consumers that are far behind: cap interpolation for offsets older than the cached broker history, use the low watermark as a floor for the lag offset when cluster monitoring is enabled, and retain a longer broker-timestamp history by compacting the cache (Visvalingam-Whyatt) and pruning samples below the lowest readable offset (the low watermark, or the earliest consumer offset when cluster monitoring is disabled) instead of evicting the oldest one. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| # (C) Datadog, Inc. 2019-present | ||
| # All rights reserved | ||
| # Licensed under Simplified BSD License (see LICENSE) | ||
| import heapq | ||
| import json | ||
| from collections import defaultdict | ||
| from time import time | ||
|
|
@@ -17,6 +18,8 @@ | |
|
|
||
| MAX_TIMESTAMPS = 1000 | ||
|
|
||
| LAG_EXTRAPOLATION_LIMIT_SECONDS = 600 | ||
|
|
||
|
|
||
| class KafkaCheck(AgentCheck): | ||
| __NAMESPACE__ = 'kafka' | ||
|
|
@@ -63,6 +66,8 @@ def check(self, _): | |
| # Fetch the broker highwater offsets | ||
| highwater_offsets = {} | ||
| broker_timestamps = defaultdict(dict) | ||
| low_watermark_offsets = {} | ||
| topic_partitions = {} | ||
| cluster_id = "" | ||
| persistent_cache_key = "broker_timestamps_" | ||
| consumer_contexts_count = self.count_consumer_contexts(consumer_offsets) | ||
|
|
@@ -82,9 +87,17 @@ def check(self, _): | |
| partitions.add((topic, partition)) | ||
| # Expected format: ({(topic, partition): offset}, cluster_id) | ||
| highwater_offsets, cluster_id = self.get_highwater_offsets(partitions) | ||
| if self.config._cluster_monitoring_enabled: | ||
| topic_partitions = self.client.get_topic_partitions() | ||
| low_watermark_offsets = self.metadata_collector.fetch_earliest_offsets(topic_partitions) | ||
| if self._data_streams_enabled: | ||
| broker_timestamps = self._load_broker_timestamps(persistent_cache_key) | ||
| self._add_broker_timestamps(broker_timestamps, highwater_offsets) | ||
| if low_watermark_offsets: | ||
| prune_floors = low_watermark_offsets | ||
| else: | ||
| self.log.debug("No low watermarks available; pruning cache by earliest consumer offset") | ||
| prune_floors = self._earliest_consumer_offsets(consumer_offsets) | ||
| self._add_broker_timestamps(broker_timestamps, highwater_offsets, prune_floors) | ||
| self._save_broker_timestamps(broker_timestamps, persistent_cache_key) | ||
| else: | ||
| self.warning("Context limit reached. Skipping highwater offset collection.") | ||
|
|
@@ -125,13 +138,14 @@ def check(self, _): | |
| reporting_limit - len(highwater_offsets), | ||
| broker_timestamps, | ||
| cluster_id, | ||
| low_watermark_offsets, | ||
| ) | ||
|
|
||
| # Collect cluster metadata if enabled | ||
| if self.config._cluster_monitoring_enabled: | ||
| self._send_cluster_monitoring_heartbeat(total_contexts, cluster_id) | ||
| try: | ||
| self.metadata_collector.collect_all_metadata(highwater_offsets) | ||
| self.metadata_collector.collect_all_metadata(highwater_offsets, low_watermark_offsets, topic_partitions) | ||
| except Exception as e: | ||
| self.log.error("Error collecting cluster metadata: %s", e) | ||
|
|
||
|
|
@@ -254,7 +268,17 @@ def _load_broker_timestamps(self, persistent_cache_key): | |
| self.log.warning('Could not read broker timestamps from cache: %s', str(e)) | ||
| return broker_timestamps | ||
|
|
||
| def _add_broker_timestamps(self, broker_timestamps, highwater_offsets): | ||
| def _earliest_consumer_offsets(self, consumer_offsets): | ||
| """Return the lowest committed offset per (topic, partition) across all consumer groups.""" | ||
| earliest = {} | ||
| for offsets in consumer_offsets.values(): | ||
| for topic_partition, offset in offsets.items(): | ||
| if topic_partition not in earliest or offset < earliest[topic_partition]: | ||
| earliest[topic_partition] = offset | ||
| return earliest | ||
|
|
||
| def _add_broker_timestamps(self, broker_timestamps, highwater_offsets, prune_floors=None): | ||
| prune_floors = prune_floors or {} | ||
| for (topic, partition), highwater_offset in highwater_offsets.items(): | ||
| timestamps = broker_timestamps["{}_{}".format(topic, partition)] | ||
| # If the highwater offset went backwards (topic recreated, | ||
|
|
@@ -265,11 +289,11 @@ def _add_broker_timestamps(self, broker_timestamps, highwater_offsets): | |
| for o in stale: | ||
| del timestamps[o] | ||
| timestamps[highwater_offset] = time() | ||
| # If there's too many timestamps, we delete the oldest one (by | ||
| # timestamp, not by offset — evicting by min offset would discard | ||
| # the fresh post-reset entries and keep poisonous stale ones). | ||
| if len(timestamps) > self._max_timestamps: | ||
| del timestamps[min(timestamps, key=timestamps.get)] | ||
| if len(timestamps) >= self._max_timestamps: | ||
| prune_floor = prune_floors.get((topic, partition)) | ||
| if prune_floor is not None: | ||
| _prune_below_anchor(timestamps, prune_floor) | ||
| _visvalingam_whyatt(timestamps, max(2, self._max_timestamps // 2)) | ||
|
|
||
| def _save_broker_timestamps(self, broker_timestamps, persistent_cache_key): | ||
| """Saves broker timestamps to persistent cache.""" | ||
|
|
@@ -292,9 +316,16 @@ def report_highwater_offsets(self, highwater_offsets, contexts_limit, cluster_id | |
| self.log.debug('%s highwater offsets reported', reported_contexts) | ||
|
|
||
| def report_consumer_offsets_and_lag( | ||
| self, consumer_offsets, highwater_offsets, contexts_limit, broker_timestamps, cluster_id | ||
| self, | ||
| consumer_offsets, | ||
| highwater_offsets, | ||
| contexts_limit, | ||
| broker_timestamps, | ||
| cluster_id, | ||
| low_watermark_offsets=None, | ||
| ): | ||
| """Report the consumer offsets and consumer lag.""" | ||
| low_watermark_offsets = low_watermark_offsets or {} | ||
| reported_contexts = 0 | ||
| self.log.debug("Reporting consumer offsets and lag metrics") | ||
| for consumer_group, offsets in consumer_offsets.items(): | ||
|
|
@@ -368,7 +399,9 @@ def report_consumer_offsets_and_lag( | |
| timestamps = broker_timestamps["{}_{}".format(topic, partition)] | ||
| # The producer timestamp can be not set if there was an error fetching broker offsets. | ||
| producer_timestamp = timestamps.get(producer_offset, None) | ||
| consumer_timestamp = _get_interpolated_timestamp(timestamps, consumer_offset) | ||
| low_watermark = low_watermark_offsets.get((topic, partition)) | ||
| effective_offset = consumer_offset if low_watermark is None else max(consumer_offset, low_watermark) | ||
| consumer_timestamp = _get_interpolated_timestamp(timestamps, effective_offset) | ||
| if consumer_timestamp is None or producer_timestamp is None: | ||
| continue | ||
| lag = producer_timestamp - consumer_timestamp | ||
|
|
@@ -482,4 +515,58 @@ def _get_interpolated_timestamp(timestamps, offset): | |
| timestamp_after = timestamps[offset_after] | ||
| slope = (timestamp_after - timestamp_before) / float(offset_after - offset_before) | ||
| timestamp = slope * (offset - offset_after) + timestamp_after | ||
|
|
||
| if offset < offset_before: | ||
| # Cap how far past the oldest cached sample we extrapolate, so estimated lag stays bounded. | ||
| timestamp = max(timestamp, timestamp_before - LAG_EXTRAPOLATION_LIMIT_SECONDS) | ||
| return timestamp | ||
|
|
||
|
|
||
| def _prune_below_anchor(timestamps, floor): | ||
| below = [o for o in timestamps if o < floor] | ||
| if len(below) <= 1: | ||
| return | ||
| anchor = max(below) | ||
| for o in below: | ||
| if o != anchor: | ||
| del timestamps[o] | ||
|
|
||
|
|
||
| def _visvalingam_whyatt(timestamps, target_count): | ||
| if len(timestamps) <= target_count: | ||
| return timestamps | ||
|
|
||
| offsets = sorted(timestamps) | ||
| prev = {o: (offsets[i - 1] if i > 0 else None) for i, o in enumerate(offsets)} | ||
| nxt = {o: (offsets[i + 1] if i < len(offsets) - 1 else None) for i, o in enumerate(offsets)} | ||
| alive = set(offsets) | ||
|
|
||
| current = {} | ||
| heap = [] | ||
| for o in offsets: | ||
| if prev[o] is not None and nxt[o] is not None: | ||
| current[o] = _interpolation_error(o, prev, nxt, timestamps) | ||
| heap.append((current[o], o)) | ||
|
Comment on lines
+546
to
+549
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When a topic is recreated or reset and the new highwater is still above an old cached offset, the stale purge above only removes cached offsets greater than the new highwater, so lower offsets from the previous topic generation can remain. This compactor never enqueues the first/last offset, so the smallest cached offset is preserved forever instead of eventually being evicted by age as before; that stale timestamp can then keep poisoning Useful? React with 👍 / 👎. |
||
| heapq.heapify(heap) | ||
|
|
||
| remaining = len(offsets) | ||
| while remaining > target_count and heap: | ||
| error, o = heapq.heappop(heap) | ||
| if o not in alive or error != current.get(o): | ||
| continue | ||
| before, after = prev[o], nxt[o] | ||
| alive.discard(o) | ||
| del timestamps[o] | ||
| remaining -= 1 | ||
| nxt[before], prev[after] = after, before | ||
| for neighbor in (before, after): | ||
| if prev[neighbor] is not None and nxt[neighbor] is not None: | ||
| current[neighbor] = _interpolation_error(neighbor, prev, nxt, timestamps) | ||
| heapq.heappush(heap, (current[neighbor], neighbor)) | ||
| return timestamps | ||
|
|
||
|
|
||
| def _interpolation_error(o, prev, nxt, timestamps): | ||
| before, after = prev[o], nxt[o] | ||
| predicted = timestamps[before] + (timestamps[after] - timestamps[before]) * (o - before) / (after - before) | ||
| return abs(timestamps[o] - predicted) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could be a good bit slower than the previous eviction behavior. Do you think it will take too long on clusters with a lot of partitions?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it's O(nlog(m)), so it will still be very fast. And it only runs 1/500 of the time.
The previous one was O(nlog(n)) and was running every time.
I don't expect that to be an issue.