This is an automated email from the ASF dual-hosted git repository. rusackas pushed a commit to branch fix-25125-json-data-type-preservation in repository https://gitbox.apache.org/repos/asf/superset.git
commit ffc31f03fd7674c88753fb80f18396b5e9e917f9 Author: Evan Rusackas <[email protected]> AuthorDate: Sun Feb 22 15:17:20 2026 -0800 fix(result_set): preserve JSON/JSONB data as objects instead of strings This fix ensures that JSON and JSONB data from databases (like PostgreSQL) is preserved as Python objects (dicts/lists) when converting result sets to pandas DataFrames. Previously, nested data types were being stringified, which broke features like Handlebars templates that need to access JSON data as objects rather than strings. The fix works by: 1. Tracking columns with nested/JSON data before stringification 2. Restoring the original Python objects when converting to pandas Fixes #25125 Co-Authored-By: Claude <[email protected]> --- superset/result_set.py | 18 ++++++-- tests/unit_tests/result_set_test.py | 88 +++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/superset/result_set.py b/superset/result_set.py index ff65453aff1..a6dd18705d3 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -113,6 +113,8 @@ class SupersetResultSet: deduped_cursor_desc: list[tuple[Any, ...]] = [] numpy_dtype: list[tuple[str, ...]] = [] stringified_arr: NDArray[Any] + # Track columns with nested/JSON data to preserve them as objects + self._nested_columns: dict[str, list[Any]] = {} if cursor_description: # get deduped list of column names @@ -154,9 +156,11 @@ class SupersetResultSet: if pa_data: # pylint: disable=too-many-nested-blocks for i, column in enumerate(column_names): if pa.types.is_nested(pa_data[i].type): - # TODO: revisit nested column serialization once nested types - # are added as a natively supported column type in Superset - # (superset.utils.core.GenericDataType). + # Preserve nested/JSON data as Python objects for use in + # templates like Handlebars. Store original values before + # stringifying for PyArrow compatibility. 
+ # See: https://github.com/apache/superset/issues/25125 + self._nested_columns[column] = array[column].tolist() stringified_arr = stringify_values(array[column]) pa_data[i] = pa.array(stringified_arr.tolist()) @@ -247,7 +251,13 @@ class SupersetResultSet: return None def to_pandas_df(self) -> pd.DataFrame: - return self.convert_table_to_df(self.table) + df = self.convert_table_to_df(self.table) + # Restore nested/JSON columns as Python objects instead of strings + # This allows JSON data to be used directly in templates like Handlebars + for column, values in self._nested_columns.items(): + if column in df.columns: + df[column] = values + return df @property def pa_table(self) -> pa.Table: diff --git a/tests/unit_tests/result_set_test.py b/tests/unit_tests/result_set_test.py index da5dcdafabc..afd8896cebf 100644 --- a/tests/unit_tests/result_set_test.py +++ b/tests/unit_tests/result_set_test.py @@ -185,3 +185,91 @@ def test_get_column_description_from_empty_data_using_cursor_description( ) assert any(col.get("column_name") == "__time" for col in result_set.columns) logger.exception.assert_not_called() + + +def test_json_data_type_preserved_as_objects() -> None: + """ + Test that JSON/JSONB data is preserved as Python objects (dicts/lists) + instead of being converted to strings. + + This is important for Handlebars templates and other features that need + to access JSON data as objects rather than strings. 
+ + See: https://github.com/apache/superset/issues/25125 + """ + # Simulate data from PostgreSQL JSONB column - psycopg2 returns dicts + data = [ + (1, {"key": "value1", "nested": {"a": 1}}, "text1"), + (2, {"key": "value2", "items": [1, 2, 3]}, "text2"), + (3, None, "text3"), + (4, {"mixed": "string"}, "text4"), + ] + description = [ + ("id", 23, None, None, None, None, None), # INT + ("json_col", 3802, None, None, None, None, None), # JSONB + ("text_col", 1043, None, None, None, None, None), # VARCHAR + ] + result_set = SupersetResultSet(data, description, BaseEngineSpec) # type: ignore + df = result_set.to_pandas_df() + + # JSON column should be preserved as Python objects, not strings + assert df["json_col"].iloc[0] == {"key": "value1", "nested": {"a": 1}} + assert isinstance(df["json_col"].iloc[0], dict) + assert df["json_col"].iloc[1] == {"key": "value2", "items": [1, 2, 3]} + assert df["json_col"].iloc[2] is None + assert df["json_col"].iloc[3] == {"mixed": "string"} + + # Verify the data can be serialized to JSON (as it would be for API response) + from superset.utils import json as superset_json + + records = df.to_dict(orient="records") + json_output = superset_json.dumps(records) + parsed = superset_json.loads(json_output) + assert parsed[0]["json_col"]["key"] == "value1" + assert parsed[0]["json_col"]["nested"]["a"] == 1 + assert parsed[1]["json_col"]["items"] == [1, 2, 3] + + +def test_json_data_with_homogeneous_structure() -> None: + """ + Test that JSON data with consistent structure is also preserved as objects. 
+ """ + # All rows have the same JSON structure + data = [ + (1, {"name": "Alice", "age": 30}), + (2, {"name": "Bob", "age": 25}), + (3, {"name": "Charlie", "age": 35}), + ] + description = [ + ("id", 23, None, None, None, None, None), + ("data", 3802, None, None, None, None, None), + ] + result_set = SupersetResultSet(data, description, BaseEngineSpec) # type: ignore + df = result_set.to_pandas_df() + + # Should be preserved as dicts + assert isinstance(df["data"].iloc[0], dict) + assert df["data"].iloc[0]["name"] == "Alice" + assert df["data"].iloc[1]["age"] == 25 + + +def test_array_data_type_preserved() -> None: + """ + Test that array data is also preserved as Python lists. + """ + data = [ + (1, [1, 2, 3]), + (2, [4, 5, 6]), + (3, None), + ] + description = [ + ("id", 23, None, None, None, None, None), + ("arr", 1007, None, None, None, None, None), # INT ARRAY + ] + result_set = SupersetResultSet(data, description, BaseEngineSpec) # type: ignore + df = result_set.to_pandas_df() + + # Arrays should be preserved as lists + assert df["arr"].iloc[0] == [1, 2, 3] + assert isinstance(df["arr"].iloc[0], list) + assert df["arr"].iloc[2] is None