hterik commented on code in PR #54115:
URL: https://github.com/apache/airflow/pull/54115#discussion_r2269205716
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py:
##########
@@ -302,6 +353,114 @@ def process_status(
)
+def collect_pod_failure_details(pod: k8s.V1Pod) -> FailureDetails | None:
+ """
+ Collect detailed failure information from a failed pod.
+
+ Analyzes both init containers and main containers to determine the root cause
+ of pod failure, prioritizing terminated containers with non-zero exit codes.
+
+ Args:
+ pod: The Kubernetes V1Pod object to analyze
+
+ Returns:
+ FailureDetails dict with failure information, or None if no failure details found
+ """
+ if not pod.status or pod.status.phase != "Failed":
+ return None
+
+ try:
+ # Basic pod-level information
+ failure_details: FailureDetails = {
+ "pod_status": getattr(pod.status, "phase", None),
+ "pod_reason": getattr(pod.status, "reason", None),
+ "pod_message": getattr(pod.status, "message", None),
+ }
+
+ # Check init containers first (they run before main containers)
+ container_failure = _analyze_init_containers(pod.status)
+
+ # If no init container failure found, check main containers
+ if not container_failure:
+ container_failure = _analyze_main_containers(pod.status)
+
+ # Merge container failure details
+ if container_failure:
+ failure_details.update(container_failure)
+
+ return failure_details
+
+ except Exception:
+ # Log unexpected exception for debugging
+ import logging
+
+ logging.getLogger(__name__).exception(
+ "Unexpected error while collecting pod failure details for pod %s",
+ getattr(pod.metadata, "name", "unknown"),
+ )
+ # Return basic pod info if container analysis fails
+ return {
+ "pod_status": getattr(pod.status, "phase", None),
+ "pod_reason": getattr(pod.status, "reason", None),
+ "pod_message": getattr(pod.status, "message", None),
+ }
+
+
+def _analyze_containers(container_statuses: list | None, container_type: str) -> FailureDetails | None:
Review Comment:
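Please use a parameterized type instead of a bare `list` here: `list[k8s.V1Container]` (or `list[k8s.V1ContainerStatus]`, given that the parameter is named `container_statuses`).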
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor_types.py:
##########
@@ -16,9 +16,25 @@
# under the License.
from __future__ import annotations
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, TypedDict
ADOPTED = "adopted"
+
+
+class FailureDetails(TypedDict, total=False):
+ """Detailed information about pod/container failure."""
+
+ pod_status: str | None
+ pod_reason: str | None
+ pod_message: str | None
+ container_state: str | None
+ container_reason: str | None
+ container_message: str | None
+ exit_code: int | None
+ container_type: str | None # "init" or "main"
Review Comment:
`Literal["init","main"] | None`
##########
providers/cncf/kubernetes/src/airflow/providers/cncf/kubernetes/executors/kubernetes_executor_utils.py:
##########
@@ -302,6 +353,139 @@ def process_status(
)
+def collect_pod_failure_details(pod: k8s.V1Pod) -> FailureDetails | None:
+ """
+ Collect detailed failure information from a failed pod.
+
+ Analyzes both init containers and main containers to determine the root cause
+ of pod failure, prioritizing terminated containers with non-zero exit codes.
+
+ Args:
+ pod: The Kubernetes V1Pod object to analyze
+
+ Returns:
+ FailureDetails dict with failure information, or None if no failure details found
+ """
+ if not pod.status or pod.status.phase != "Failed":
+ return None
+
+ try:
+ # Basic pod-level information
+ failure_details: FailureDetails = {
+ "pod_status": getattr(pod.status, "phase", None),
+ "pod_reason": getattr(pod.status, "reason", None),
+ "pod_message": getattr(pod.status, "message", None),
+ }
+
+ # Check init containers first (they run before main containers)
+ container_failure = _analyze_init_containers(pod.status)
+
+ # If no init container failure found, check main containers
+ if not container_failure:
+ container_failure = _analyze_main_containers(pod.status)
+
+ # Merge container failure details
+ if container_failure:
+ failure_details.update(container_failure)
+
+ return failure_details
+
+ except Exception:
+ # Return basic pod info if container analysis fails
+ return {
Review Comment:
Done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]