awdavidson commented on code in PR #37541: URL: https://github.com/apache/airflow/pull/37541#discussion_r1497904760
########## airflow/jobs/job.py: ########## @@ -183,30 +184,32 @@ def heartbeat( previous_heartbeat = self.latest_heartbeat try: - # This will cause it to load from the db - self._merge_from(Job._fetch_from_db(self, session)) - previous_heartbeat = self.latest_heartbeat - - if self.state == JobState.RESTARTING: - self.kill() - - # Figure out how long to sleep for - sleep_for = 0 - if self.latest_heartbeat: - seconds_remaining = ( - self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds() - ) - sleep_for = max(0, seconds_remaining) - sleep(sleep_for) - - job = Job._update_heartbeat(job=self, session=session) - self._merge_from(job) - - # At this point, the DB has updated. - previous_heartbeat = self.latest_heartbeat - - heartbeat_callback(session) - self.log.debug("[heartbeat]") + for attempt in run_with_db_retries(logger=self.log): + with attempt: + # This will cause it to load from the db + self._merge_from(Job._fetch_from_db(self, session)) + previous_heartbeat = self.latest_heartbeat + + if self.state == JobState.RESTARTING: + self.kill() + + # Figure out how long to sleep for + sleep_for = 0 + if self.latest_heartbeat: + seconds_remaining = ( + self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds() + ) + sleep_for = max(0, seconds_remaining) + sleep(sleep_for) + + job = Job._update_heartbeat(job=self, session=session) + self._merge_from(job) + + # At this point, the DB has updated. 
+ previous_heartbeat = self.latest_heartbeat + + heartbeat_callback(session) + self.log.debug("[heartbeat]") except OperationalError: Review Comment: Yes, the heartbeat failure due to a transient issue is not a direct cause, but we have seen the heartbeat failure lead to the job being killed immediately afterwards, even though the user logic of the job was running without issue. I think the retry within the heartbeat adds additional tolerance and prevents situations where `is_alive` returns `False` due to transient DB errors. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@airflow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org