|
35 | 35 | from airflow.exceptions import AirflowException |
36 | 36 | from airflow.listeners.listener import get_listener_manager |
37 | 37 | from airflow.models.base import ID_LEN, Base |
38 | | -from airflow.observability.trace import DebugTrace, add_debug_span |
39 | 38 | from airflow.utils.helpers import convert_camel_to_snake |
40 | 39 | from airflow.utils.log.logging_mixin import LoggingMixin |
41 | 40 | from airflow.utils.net import get_hostname |
@@ -210,68 +209,61 @@ def heartbeat( |
210 | 209 | :param session to use for saving the job |
211 | 210 | """ |
212 | 211 | previous_heartbeat = self.latest_heartbeat |
213 | | - with DebugTrace.start_span(span_name="heartbeat", component="Job") as span: |
214 | | - try: |
215 | | - span.set_attribute("heartbeat", str(self.latest_heartbeat)) |
216 | | - # This will cause it to load from the db |
| 212 | + try: |
| 213 | + # This will cause it to load from the db |
| 214 | + session.merge(self) |
| 215 | + previous_heartbeat = self.latest_heartbeat |
| 216 | + |
| 217 | + if self.state == JobState.RESTARTING: |
| 218 | + self.kill() |
| 219 | + |
| 220 | + # Figure out how long to sleep for |
| 221 | + sleep_for: float = 0 |
| 222 | + if self.latest_heartbeat: |
| 223 | + seconds_remaining = ( |
| 224 | + self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds() |
| 225 | + ) |
| 226 | + sleep_for = max(0, seconds_remaining) |
| 227 | + sleep(sleep_for) |
| 228 | + # Update last heartbeat time |
| 229 | + with create_session() as session: |
| 230 | + # Make the session aware of this object |
217 | 231 | session.merge(self) |
| 232 | + self.latest_heartbeat = timezone.utcnow() |
| 233 | + session.commit() |
| 234 | + time_since_last_heartbeat: float = ( |
| 235 | + 0 |
| 236 | + if previous_heartbeat is None |
| 237 | + else (timezone.utcnow() - previous_heartbeat).total_seconds() |
| 238 | + ) |
| 239 | + health_check_threshold_value = health_check_threshold(self.job_type, self.heartrate) |
| 240 | + if time_since_last_heartbeat > health_check_threshold_value: |
| 241 | + self.log.info("Heartbeat recovered after %.2f seconds", time_since_last_heartbeat) |
| 242 | + # At this point, the DB has updated. |
218 | 243 | previous_heartbeat = self.latest_heartbeat |
219 | | - |
220 | | - if self.state == JobState.RESTARTING: |
221 | | - self.kill() |
222 | | - |
223 | | - # Figure out how long to sleep for |
224 | | - sleep_for: float = 0 |
225 | | - if self.latest_heartbeat: |
226 | | - seconds_remaining = ( |
227 | | - self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds() |
228 | | - ) |
229 | | - sleep_for = max(0, seconds_remaining) |
230 | | - if span.is_recording(): |
231 | | - span.add_event(name="sleep", attributes={"sleep_for": sleep_for}) |
232 | | - sleep(sleep_for) |
233 | | - # Update last heartbeat time |
234 | | - with create_session() as session: |
235 | | - # Make the session aware of this object |
236 | | - session.merge(self) |
237 | | - self.latest_heartbeat = timezone.utcnow() |
238 | | - session.commit() |
239 | | - time_since_last_heartbeat: float = ( |
240 | | - 0 |
241 | | - if previous_heartbeat is None |
242 | | - else (timezone.utcnow() - previous_heartbeat).total_seconds() |
243 | | - ) |
244 | | - health_check_threshold_value = health_check_threshold(self.job_type, self.heartrate) |
245 | | - if time_since_last_heartbeat > health_check_threshold_value: |
246 | | - self.log.info("Heartbeat recovered after %.2f seconds", time_since_last_heartbeat) |
247 | | - # At this point, the DB has updated. |
248 | | - previous_heartbeat = self.latest_heartbeat |
249 | | - heartbeat_callback(session) |
250 | | - self.log.debug("[heartbeat]") |
251 | | - self.heartbeat_failed = False |
252 | | - except OperationalError: |
253 | | - Stats.incr(convert_camel_to_snake(self.__class__.__name__) + "_heartbeat_failure", 1, 1) |
254 | | - if not self.heartbeat_failed: |
255 | | - self.log.exception("%s heartbeat failed with error", self.__class__.__name__) |
256 | | - self.heartbeat_failed = True |
257 | | - msg = f"{self.__class__.__name__} heartbeat got an exception" |
258 | | - if span.is_recording(): |
259 | | - span.add_event(name="error", attributes={"message": msg}) |
260 | | - if self.is_alive(): |
261 | | - self.log.error( |
262 | | - "%s heartbeat failed with error. Scheduler may go into unhealthy state", |
263 | | - self.__class__.__name__, |
264 | | - ) |
265 | | - msg = f"{self.__class__.__name__} heartbeat failed with error. Scheduler may go into unhealthy state" |
266 | | - if span.is_recording(): |
267 | | - span.add_event(name="error", attributes={"message": msg}) |
268 | | - else: |
269 | | - msg = f"{self.__class__.__name__} heartbeat failed with error. Scheduler is in unhealthy state" |
270 | | - self.log.error(msg) |
271 | | - if span.is_recording(): |
272 | | - span.add_event(name="error", attributes={"message": msg}) |
273 | | - # We didn't manage to heartbeat, so make sure that the timestamp isn't updated |
274 | | - self.latest_heartbeat = previous_heartbeat |
| 244 | + heartbeat_callback(session) |
| 245 | + self.log.debug("[heartbeat]") |
| 246 | + self.heartbeat_failed = False |
| 247 | + except OperationalError: |
| 248 | + Stats.incr(convert_camel_to_snake(self.__class__.__name__) + "_heartbeat_failure", 1, 1) |
| 249 | + if not self.heartbeat_failed: |
| 250 | + self.log.exception("%s heartbeat failed with error", self.__class__.__name__) |
| 251 | + self.heartbeat_failed = True |
| 252 | + msg = f"{self.__class__.__name__} heartbeat got an exception" |
| 253 | + self.log.error(msg) |
| 254 | + if self.is_alive(): |
| 255 | + self.log.error( |
| 256 | + "%s heartbeat failed with error. Scheduler may go into unhealthy state", |
| 257 | + self.__class__.__name__, |
| 258 | + ) |
| 259 | + msg = f"{self.__class__.__name__} heartbeat failed with error. Scheduler may go into unhealthy state" |
| 260 | + else: |
| 261 | + msg = ( |
| 262 | + f"{self.__class__.__name__} heartbeat failed with error. Scheduler is in unhealthy state" |
| 263 | + ) |
| 264 | + self.log.error(msg) |
| 265 | + # We didn't manage to heartbeat, so make sure that the timestamp isn't updated |
| 266 | + self.latest_heartbeat = previous_heartbeat |
275 | 267 |
|
276 | 268 | @provide_session |
277 | 269 | def prepare_for_execution(self, session: Session = NEW_SESSION): |
@@ -401,7 +393,6 @@ def execute_job(job: Job, execute_callable: Callable[[], int | None]) -> int | N |
401 | 393 | return ret |
402 | 394 |
|
403 | 395 |
|
404 | | -@add_debug_span |
405 | 396 | def perform_heartbeat( |
406 | 397 | job: Job, heartbeat_callback: Callable[[Session], None], only_if_necessary: bool |
407 | 398 | ) -> None: |
|
0 commit comments