Nataneljpwd commented on code in PR #61778:
URL: https://github.com/apache/airflow/pull/61778#discussion_r2824273277


##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/utils/pod_manager.py:
##########
@@ -188,19 +188,47 @@ def detect_pod_terminate_early_issues(pod: V1Pod) -> str | None:
     """
     Identify issues that justify terminating the pod early.
 
+    This method distinguishes between permanent failures (e.g., invalid image names)
+    and transient errors (e.g., rate limits) that should be retried by Kubernetes.
+
     :param pod: The pod object to check.
     :return: An error message if an issue is detected; otherwise, None.
     """
+    # Indicators in error messages that suggest transient issues
+    TRANSIENT_ERROR_PATTERNS = [
+        "pull qps exceeded",
+        "rate limit",
+        "too many requests",
+        "quota exceeded",
+        "temporarily unavailable",
+        "timeout",
+        "account limit",
+    ]
+
+    FATAL_STATES = ["InvalidImageName", "ErrImageNeverPull"]
+    TRANSIENT_STATES = ["ErrImagePull", "ImagePullBackOff"]
+
     pod_status = pod.status
     if pod_status.container_statuses:
         for container_status in pod_status.container_statuses:
             container_state: V1ContainerState = container_status.state
             container_waiting: V1ContainerStateWaiting | None = 
container_state.waiting
-            if container_waiting:
-                if container_waiting.reason in ["ErrImagePull", 
"ImagePullBackOff", "InvalidImageName"]:
+            if not container_waiting:
+                continue
+
+            if container_waiting.reason in FATAL_STATES:
+                return (
+                    f"Image cannot be pulled, unable to start: 
{container_waiting.reason}"
+                    f"\n{container_waiting.message or ''}"
+                )
+
+            if container_waiting.reason in TRANSIENT_STATES:
+                message_lower = (container_waiting.message or "").lower()
+                is_transient = any(pattern in message_lower for pattern in 
TRANSIENT_ERROR_PATTERNS)
+                if not is_transient:
                     return (
-                        f"Pod docker image cannot be pulled, unable to start: 
{container_waiting.reason}"
-                        f"\n{container_waiting.message}"
+                        f"Image cannot be pulled, unable to start: 
{container_waiting.reason}"
+                        f"\n{container_waiting.message or ''}"

Review Comment:
   Why not reuse `message_lower` here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to