ephraimbuddy commented on code in PR #63871:
URL: https://github.com/apache/airflow/pull/63871#discussion_r3220723394
##########
airflow-core/src/airflow/serialization/helpers.py:
##########
@@ -19,87 +19,89 @@
from __future__ import annotations
import contextlib
+import inspect
from typing import TYPE_CHECKING, Any
from airflow._shared.module_loading import qualname
from airflow._shared.secrets_masker import redact
from airflow._shared.template_rendering import truncate_rendered_value
from airflow.configuration import conf
-from airflow.settings import json
if TYPE_CHECKING:
from airflow.partition_mappers.base import PartitionMapper
from airflow.timetables.base import Timetable as CoreTimetable
-def serialize_template_field(template_field: Any, name: str) -> str | dict |
list | int | float:
+def serialize_template_field(template_field: Any, name: str) -> str | dict |
list | int | float | bool:
"""
Return a serializable representation of the templated field.
- If ``templated_field`` is provided via a callable then
- return the following serialized value: ``<callable full_qualified_name>``
+ The walk has two responsibilities:
- If ``templated_field`` contains a class or instance that requires recursive
- templating, store them as strings. Otherwise simply return the field as-is.
+ 1. **Make the template_field JSON-encodable** — every container is rebuilt
+ with primitive leaves (str/int/float/bool/None), tuples and sets are
+ flattened to lists, and unsupported objects fall through to ``str()``
+ so ``json.dumps`` never raises on the result.
+ 2. **Keep the output deterministic across parses** — callables are replaced
+ with their qualified name (never the default ``<function ... at 0x...>``
+ repr), dicts are key-sorted, and (frozen)sets are sorted by element so
+ the same input always produces the same string.
"""
- def is_jsonable(x):
- try:
- json.dumps(x)
- except (TypeError, OverflowError):
- return False
- else:
- return True
-
- def translate_tuples_to_lists(obj: Any):
- """Recursively convert tuples to lists."""
- if isinstance(obj, tuple):
- return [translate_tuples_to_lists(item) for item in obj]
- if isinstance(obj, list):
- return [translate_tuples_to_lists(item) for item in obj]
- if isinstance(obj, dict):
- return {key: translate_tuples_to_lists(value) for key, value in
obj.items()}
- return obj
+ def is_jsonable_primitive_type(k):
+ """Whether ``k`` is one of the primitive types json.dumps accepts as a
dict key or value."""
+ return k is None or isinstance(k, (str, int, float, bool))
+
+ def serialize_dict_key(key):
+ """Normalize a dict key to a JSON-encodable primitive."""
+ serialized_key = serialize_object(key)
+ return serialized_key if is_jsonable_primitive_type(serialized_key)
else str(serialized_key)
+
+ def serialize_object(obj):
+ """Recursively rewrite ``obj`` into a JSON-encodable, hash-stable
structure."""
+ if is_jsonable_primitive_type(obj):
+ return obj
- def sort_dict_recursively(obj: Any) -> Any:
- """Recursively sort dictionaries to ensure consistent ordering."""
if isinstance(obj, dict):
- return {k: sort_dict_recursively(v) for k, v in
sorted(obj.items())}
- if isinstance(obj, list):
- return [sort_dict_recursively(item) for item in obj]
- if isinstance(obj, tuple):
- return tuple(sort_dict_recursively(item) for item in obj)
- return obj
+ # Sort dictionaries recursively to ensure consistent string
representation
+ # This prevents hash inconsistencies when dict ordering varies
+ return {
+ serialize_dict_key(k): serialize_object(v)
+ for k, v in sorted(obj.items(), key=lambda kv:
(type(kv[0]).__name__, repr(kv[0])))
Review Comment:
The sort uses `(type(kv[0]).__name__, repr(kv[0]))`, but repr on a callable
embeds its memory address `(<function f at 0x...>)`. Concrete repro using the
new code:
```
v1 = {(lambda x: x): "a", (lambda y: y): "b"}
v2 = {(lambda y: y): "b", (lambda x: x): "a"}
serialize_template_field(v1, "k") != serialize_template_field(v2, "k") #
True
```
Both lambdas collapse to the same output key `<callable __main__.<lambda>>`,
and which value survives depends on the address-based sort. Same issue for any
two objects whose repr defaults to `<… at 0x…> (no custom __repr__)`. Fix: sort
on the serialized form of the key `((type_name, str(serialize_dict_key(k))))`
so callables sort by qualname, not address. Worth a regression test too.
##########
airflow-core/src/airflow/serialization/helpers.py:
##########
@@ -19,87 +19,89 @@
from __future__ import annotations
import contextlib
+import inspect
from typing import TYPE_CHECKING, Any
from airflow._shared.module_loading import qualname
from airflow._shared.secrets_masker import redact
from airflow._shared.template_rendering import truncate_rendered_value
from airflow.configuration import conf
-from airflow.settings import json
if TYPE_CHECKING:
from airflow.partition_mappers.base import PartitionMapper
from airflow.timetables.base import Timetable as CoreTimetable
-def serialize_template_field(template_field: Any, name: str) -> str | dict |
list | int | float:
+def serialize_template_field(template_field: Any, name: str) -> str | dict |
list | int | float | bool:
"""
Return a serializable representation of the templated field.
- If ``templated_field`` is provided via a callable then
- return the following serialized value: ``<callable full_qualified_name>``
+ The walk has two responsibilities:
- If ``templated_field`` contains a class or instance that requires recursive
- templating, store them as strings. Otherwise simply return the field as-is.
+ 1. **Make the template_field JSON-encodable** — every container is rebuilt
+ with primitive leaves (str/int/float/bool/None), tuples and sets are
+ flattened to lists, and unsupported objects fall through to ``str()``
+ so ``json.dumps`` never raises on the result.
+ 2. **Keep the output deterministic across parses** — callables are replaced
+ with their qualified name (never the default ``<function ... at 0x...>``
+ repr), dicts are key-sorted, and (frozen)sets are sorted by element so
+ the same input always produces the same string.
"""
- def is_jsonable(x):
- try:
- json.dumps(x)
- except (TypeError, OverflowError):
- return False
- else:
- return True
-
- def translate_tuples_to_lists(obj: Any):
- """Recursively convert tuples to lists."""
- if isinstance(obj, tuple):
- return [translate_tuples_to_lists(item) for item in obj]
- if isinstance(obj, list):
- return [translate_tuples_to_lists(item) for item in obj]
- if isinstance(obj, dict):
- return {key: translate_tuples_to_lists(value) for key, value in
obj.items()}
- return obj
+ def is_jsonable_primitive_type(k):
+ """Whether ``k`` is one of the primitive types json.dumps accepts as a
dict key or value."""
+ return k is None or isinstance(k, (str, int, float, bool))
+
+ def serialize_dict_key(key):
+ """Normalize a dict key to a JSON-encodable primitive."""
+ serialized_key = serialize_object(key)
+ return serialized_key if is_jsonable_primitive_type(serialized_key)
else str(serialized_key)
Review Comment:
This still leaves primitive non-string dict keys as `int/bool/None`. That
makes mixed-key template fields pass this helper, but
`SerializedDagModel.hash()` later does `sorted(serialized_dag.items())` and
`json.dumps(..., sort_keys=True)` in serialized_dag.py (line 369), which can
still raise for `{1: "a", "b": ...}.` The new mixed-key tests only cover the
helper, not full Dag serialization or hashing. Normalize dict keys to strings
before returning, or fix the full hashing path too.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]