pierrejeambrun commented on code in PR #64523:
URL: https://github.com/apache/airflow/pull/64523#discussion_r3033319957


##########
airflow-core/src/airflow/api_fastapi/common/http_access_log.py:
##########
@@ -23,13 +23,87 @@
 from typing import TYPE_CHECKING
 
 import structlog
+from starlette.routing import Match
+
+from airflow._shared.observability.metrics.stats import Stats
 
 if TYPE_CHECKING:
     from starlette.types import ASGIApp, Message, Receive, Scope, Send
 
 logger = structlog.get_logger(logger_name="http.access")
 
 _HEALTH_PATHS = frozenset(["/api/v2/monitor/health"])
+_API_PATH_PREFIX_TO_SURFACE = (
+    ("/api/v2", "public"),
+    ("/ui", "ui"),
+)
+
+
+def _get_api_surface(path: str) -> str | None:
+    for prefix, surface in _API_PATH_PREFIX_TO_SURFACE:
+        if path == prefix or path.startswith(f"{prefix}/"):
+            return surface
+    return None
+
+
+def _get_status_family(status_code: int) -> str:
+    if status_code < 100:
+        return "unknown"
+    return f"{status_code // 100}xx"

Review Comment:
   HTTP status codes are always 100-599. A status code < 100 can only happen if 
response was None and status fell through as 0. The middleware already handles 
this: status = response["status"] if response is not None else 0. Consider 
handling the 0 case explicitly (e.g., "0xx" or "no_response") rather than 
"unknown", which is ambiguous.
   



##########
airflow-core/src/airflow/api_fastapi/common/http_access_log.py:
##########
@@ -23,13 +23,87 @@
 from typing import TYPE_CHECKING
 
 import structlog
+from starlette.routing import Match
+
+from airflow._shared.observability.metrics.stats import Stats
 
 if TYPE_CHECKING:
     from starlette.types import ASGIApp, Message, Receive, Scope, Send
 
 logger = structlog.get_logger(logger_name="http.access")
 
 _HEALTH_PATHS = frozenset(["/api/v2/monitor/health"])
+_API_PATH_PREFIX_TO_SURFACE = (
+    ("/api/v2", "public"),
+    ("/ui", "ui"),
+)
+
+
+def _get_api_surface(path: str) -> str | None:
+    for prefix, surface in _API_PATH_PREFIX_TO_SURFACE:
+        if path == prefix or path.startswith(f"{prefix}/"):
+            return surface
+    return None
+
+
+def _get_status_family(status_code: int) -> str:
+    if status_code < 100:
+        return "unknown"
+    return f"{status_code // 100}xx"
+
+
+def _get_route_tag(scope: Scope) -> str:
+    router = scope.get("router")
+    routes = getattr(router, "routes", None)
+    partial_route_path: str | None = None
+    if routes:
+        for route in routes:
+            match, _ = route.matches(scope)
+            route_path = getattr(route, "path", None)
+            if not isinstance(route_path, str) or not route_path:
+                continue
+            if match == Match.FULL:
+                return route_path
+            if match == Match.PARTIAL and partial_route_path is None:
+                partial_route_path = route_path
+
+    if partial_route_path:
+        return partial_route_path
+
+    endpoint = scope.get("endpoint")
+    endpoint_name = getattr(endpoint, "__name__", None)
+    if isinstance(endpoint_name, str) and endpoint_name:
+        return endpoint_name
+
+    return "unmatched"
+
+
+def _emit_api_metrics(
+    *,
+    scope: Scope,
+    path: str,
+    method: str,
+    status_code: int,
+    duration_us: int,
+) -> None:
+    api_surface = _get_api_surface(path)
+    if api_surface is None:
+        return
+
+    # Keep tags bounded so API metrics remain usable across supported backends.
+    tags = {
+        "api_surface": api_surface,
+        "method": method or "UNKNOWN",
+        "route": _get_route_tag(scope),
+        "status_code": str(status_code) if status_code else "unknown",
+        "status_family": _get_status_family(status_code),
+    }
+    duration_ms = duration_us / 1000.0
+
+    Stats.incr("api.requests", tags=tags)
+    Stats.timing("api.request.duration", duration_ms, tags=tags)
+    if status_code >= 400:
+        Stats.incr("api.request.errors", tags=tags)

Review Comment:
   We should probably split client vs server error. 
   
   Client spamming invalid entries shouldn't inflate the number of 'errors', 
those are not really errors.



##########
airflow-core/src/airflow/api_fastapi/common/http_access_log.py:
##########
@@ -23,13 +23,87 @@
 from typing import TYPE_CHECKING
 
 import structlog
+from starlette.routing import Match
+
+from airflow._shared.observability.metrics.stats import Stats
 
 if TYPE_CHECKING:
     from starlette.types import ASGIApp, Message, Receive, Scope, Send
 
 logger = structlog.get_logger(logger_name="http.access")
 
 _HEALTH_PATHS = frozenset(["/api/v2/monitor/health"])
+_API_PATH_PREFIX_TO_SURFACE = (
+    ("/api/v2", "public"),
+    ("/ui", "ui"),
+)
+
+
+def _get_api_surface(path: str) -> str | None:
+    for prefix, surface in _API_PATH_PREFIX_TO_SURFACE:
+        if path == prefix or path.startswith(f"{prefix}/"):
+            return surface
+    return None
+
+
+def _get_status_family(status_code: int) -> str:
+    if status_code < 100:
+        return "unknown"
+    return f"{status_code // 100}xx"
+
+
+def _get_route_tag(scope: Scope) -> str:
+    router = scope.get("router")
+    routes = getattr(router, "routes", None)
+    partial_route_path: str | None = None
+    if routes:
+        for route in routes:
+            match, _ = route.matches(scope)
+            route_path = getattr(route, "path", None)
+            if not isinstance(route_path, str) or not route_path:
+                continue
+            if match == Match.FULL:
+                return route_path
+            if match == Match.PARTIAL and partial_route_path is None:
+                partial_route_path = route_path

Review Comment:
   Yep, we should rework this. Looping through all routes probably isn't 
necessary, and with the number of routes in the API we should avoid it.



##########
airflow-core/src/airflow/api_fastapi/common/http_access_log.py:
##########
@@ -95,6 +169,16 @@ async def capture_send(message: Message) -> None:
                     client = scope.get("client")
                     client_addr = f"{client[0]}:{client[1]}" if client else 
None
 
+                    # Observability failures must never affect serving the 
request.
+                    with contextlib.suppress(Exception):
+                        _emit_api_metrics(
+                            scope=scope,
+                            path=path,
+                            method=method,
+                            status_code=status,
+                            duration_us=duration_us,
+                        )

Review Comment:
   I would still log the metric code error, instead of silently failing. 
Otherwise it's impossible to 'debug' why metrics emission is failing.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to