style: fix black formatting in result_set_tests.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
fix(result_set): detect nested types in ArrowInvalid except branch
2026-04-29 13:04:22 +00:00 · 2026-04-25 17:23:24 -07:00 · 2026-04-25 17:23:24 -07:00 · 2026-04-25 17:23:24 -07:00 · 2026-04-25 17:23:24 -07:00 · 2026-04-25 17:23:23 -07:00
4 changed files with 156 additions and 13 deletions
--- a/superset/dataframe.py
+++ b/superset/dataframe.py
@@ -37,6 +37,22 @@ def _convert_big_integers(val: Any) -> Any:
    return str(val) if isinstance(val, int) and abs(val) > JS_MAX_INTEGER else val


+def _is_na(val: Any) -> bool:
+    """
+    Return True only when ``val`` is a single NA/NaN/NaT scalar.
+
+    For list/array input pd.isna() returns an element-wise mask, not a bool;
+    such containers (e.g. decoded JSON) are never NA, even ``[None]``.
+    :param val: the value to check
+    :returns: True if the value is a scalar NA, False otherwise
+    """
+    try:
+        mask = pd.isna(val)
+    except ValueError:  # defensive: keep the never-raises-ValueError contract
+        return False
+    return getattr(mask, "ndim", 0) == 0 and bool(mask)
+
+
 def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
    """
    Convert a DataFrame to a set of records.
@@ -56,7 +72,7 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
    for record in records:
        for key in record:
            record[key] = (
-                None if pd.isna(record[key]) else _convert_big_integers(record[key])
+                None if _is_na(record[key]) else _convert_big_integers(record[key])
            )

    return records
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -145,6 +145,8 @@ class SupersetResultSet:
        deduped_cursor_desc: list[tuple[Any, ...]] = []
        numpy_dtype: list[tuple[str, ...]] = []
        stringified_arr: NDArray[Any]
+        # Track columns with nested/JSON data to preserve them as objects
+        self._nested_columns: dict[str, list[Any]] = {}

        if cursor_description:
            # get deduped list of column names
@@ -184,6 +186,17 @@ class SupersetResultSet:
                TypeError,  # this is super hackey,
                # https://issues.apache.org/jira/browse/ARROW-7855
            ):
+                # Check the original values for nested types (lists/dicts)
+                # before stringifying: once stringified, the column is no
+                # longer nested, so the pa.types.is_nested() check in the
+                # second loop below can never detect it.
+                original_values = array[column].tolist()
+                if any(
+                    isinstance(v, (list, dict))
+                    for v in original_values
+                    if v is not None
+                ):
+                    self._nested_columns[column] = original_values
                # attempt serialization of values as strings
                stringified_arr = stringify_values(array[column])
                pa_data.append(pa.array(stringified_arr.tolist()))
@@ -191,9 +204,11 @@ class SupersetResultSet:
        if pa_data:  # pylint: disable=too-many-nested-blocks
            for i, column in enumerate(column_names):
                if pa.types.is_nested(pa_data[i].type):
-                    # TODO: revisit nested column serialization once nested types
-                    #  are added as a natively supported column type in Superset
-                    #  (superset.utils.core.GenericDataType).
+                    # Preserve nested/JSON data as Python objects for use in
+                    # templates like Handlebars. Store original values before
+                    # stringifying for PyArrow compatibility.
+                    # See: https://github.com/apache/superset/issues/25125
+                    self._nested_columns[column] = array[column].tolist()
                    stringified_arr = stringify_values(array[column])
                    pa_data[i] = pa.array(stringified_arr.tolist())

@@ -284,7 +299,13 @@ class SupersetResultSet:
        return None

    def to_pandas_df(self) -> pd.DataFrame:
-        return self.convert_table_to_df(self.table)
+        df = self.convert_table_to_df(self.table)
+        # Restore nested/JSON columns as Python objects instead of strings
+        # This allows JSON data to be used directly in templates like Handlebars
+        for column, values in self._nested_columns.items():
+            if column in df.columns:
+                df[column] = values
+        return df

    @property
    def pa_table(self) -> pa.Table:
--- a/tests/integration_tests/result_set_tests.py
+++ b/tests/integration_tests/result_set_tests.py
@@ -226,18 +226,19 @@ class TestSupersetResultSet(SupersetTestCase):
        assert results.columns[3]["type"] == "STRING"
        assert results.columns[3]["type_generic"] == GenericDataType.STRING
        df = results.to_pandas_df()
+        # JSON/JSONB data is preserved as objects instead of being stringified
        assert df_to_records(df) == [
            {
                "id": 4,
-                "dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
-                "num_arr": "[1, 2, 3]",
-                "map_col": "{'chart_name': 'scatter'}",
+                "dict_arr": [{"table_name": "unicode_test", "database_id": 1}],
+                "num_arr": [1, 2, 3],
+                "map_col": {"chart_name": "scatter"},
            },
            {
                "id": 3,
-                "dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
-                "num_arr": "[4, 5, 6]",
-                "map_col": "{'chart_name': 'plot'}",
+                "dict_arr": [{"table_name": "birth_names", "database_id": 1}],
+                "num_arr": [4, 5, 6],
+                "map_col": {"chart_name": "plot"},
            },
        ]

@@ -267,9 +268,25 @@ class TestSupersetResultSet(SupersetTestCase):
        assert results.columns[0]["type"] == "STRING"
        assert results.columns[0]["type_generic"] == GenericDataType.STRING
        df = results.to_pandas_df()
+        # JSON/JSONB data is preserved as objects instead of being stringified
        assert df_to_records(df) == [
            {
-                "metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]'  # noqa: E501
+                "metadata": [
+                    "test",
+                    [
+                        [
+                            "foo",
+                            123456,
+                            [
+                                [["test"], 3432546, 7657658766],
+                                [["fake"], 656756765, 324324324324],
+                            ],
+                        ]
+                    ],
+                    ["test2", 43, 765765765],
+                    None,
+                    None,
+                ]
            }
        ]

@@ -280,7 +297,8 @@ class TestSupersetResultSet(SupersetTestCase):
        assert results.columns[0]["type"] == "STRING"
        assert results.columns[0]["type_generic"] == GenericDataType.STRING
        df = results.to_pandas_df()
-        assert df_to_records(df) == [{"metadata": '[{"TestKey": [123456, "foo"]}]'}]
+        # JSON/JSONB data is preserved as objects instead of being stringified
+        assert df_to_records(df) == [{"metadata": [{"TestKey": [123456, "foo"]}]}]

    def test_empty_datetime(self):
        data = [(None,)]
--- a/tests/unit_tests/result_set_test.py
+++ b/tests/unit_tests/result_set_test.py
@@ -244,3 +244,91 @@ def test_empty_column_names_do_not_rename_explicit_synthetic_names() -> None:
    df = result_set.to_pandas_df()
    assert list(df.columns) == ["_col_1", "_col_0"]
    assert df.iloc[0].tolist() == [10, 20]
+
+
+def test_json_data_type_preserved_as_objects() -> None:
+    """
+    Test that JSON/JSONB data is preserved as Python objects (dicts/lists)
+    instead of being converted to strings.
+
+    This is important for Handlebars templates and other features that need
+    to access JSON data as objects rather than strings.
+
+    See: https://github.com/apache/superset/issues/25125
+    """
+    # Simulate rows from a PostgreSQL JSONB column - psycopg2 returns dicts.
+    data = [
+        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
+        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
+        (3, None, "text3"),
+        (4, {"mixed": "string"}, "text4"),
+    ]
+    description = [
+        ("id", 23, None, None, None, None, None),  # INT
+        ("json_col", 3802, None, None, None, None, None),  # JSONB
+        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
+    ]
+    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
+    df = result_set.to_pandas_df()
+
+    # The JSON column must come back as dicts (or None for the NULL row), not str
+    assert df["json_col"].iloc[0] == {"key": "value1", "nested": {"a": 1}}
+    assert isinstance(df["json_col"].iloc[0], dict)
+    assert df["json_col"].iloc[1] == {"key": "value2", "items": [1, 2, 3]}
+    assert df["json_col"].iloc[2] is None
+    assert df["json_col"].iloc[3] == {"mixed": "string"}
+
+    # Round-trip through dumps/loads: nested keys must still be addressable
+    from superset.utils import json as superset_json
+
+    records = df.to_dict(orient="records")
+    json_output = superset_json.dumps(records)
+    parsed = superset_json.loads(json_output)
+    assert parsed[0]["json_col"]["key"] == "value1"
+    assert parsed[0]["json_col"]["nested"]["a"] == 1
+    assert parsed[1]["json_col"]["items"] == [1, 2, 3]
+
+
+def test_json_data_with_homogeneous_structure() -> None:
+    """
+    Test that dict rows sharing one structure are also preserved as objects.
+    """
+    # Every row has the same keys and value types (no None, no mixed shapes)
+    data = [
+        (1, {"name": "Alice", "age": 30}),
+        (2, {"name": "Bob", "age": 25}),
+        (3, {"name": "Charlie", "age": 35}),
+    ]
+    description = [
+        ("id", 23, None, None, None, None, None),  # INT
+        ("data", 3802, None, None, None, None, None),  # JSONB
+    ]
+    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
+    df = result_set.to_pandas_df()
+
+    # Values must be dicts whose keys can be indexed directly
+    assert isinstance(df["data"].iloc[0], dict)
+    assert df["data"].iloc[0]["name"] == "Alice"
+    assert df["data"].iloc[1]["age"] == 25
+
+
+def test_array_data_type_preserved() -> None:
+    """
+    Test that array columns come back as Python lists, with None kept as None.
+    """
+    data = [
+        (1, [1, 2, 3]),
+        (2, [4, 5, 6]),
+        (3, None),
+    ]
+    description = [
+        ("id", 23, None, None, None, None, None),  # INT
+        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
+    ]
+    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
+    df = result_set.to_pandas_df()
+
+    # Lists must survive as lists; the row whose value is None must stay None
+    assert df["arr"].iloc[0] == [1, 2, 3]
+    assert isinstance(df["arr"].iloc[0], list)
+    assert df["arr"].iloc[2] is None
Author	SHA1	Message	Date
Evan Rusackas	c45e40e939	style: fix black formatting in result_set_tests.py Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>	2026-04-25 17:23:24 -07:00
Evan Rusackas	e7763e7aa3	fix(result_set): detect nested types in ArrowInvalid except branch When heterogeneous data (e.g., [123456, "foo"]) causes PyArrow to throw ArrowInvalid, the except branch stringifies the data before the second loop can detect nested types via pa.types.is_nested(). This means columns with nested data (lists/dicts) never get added to _nested_columns and their JSON structure is lost. Fix by checking the original data for nested types (lists/dicts) in the except branch before stringifying, preserving them in _nested_columns so they are restored as Python objects in to_pandas_df(). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-04-25 17:23:24 -07:00
Evan Rusackas	219a0d5866	test(result_set): update nested type tests for JSON preservation Update test expectations to expect JSON data as preserved objects (dicts/lists) instead of stringified JSON, matching the new behavior. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>	2026-04-25 17:23:24 -07:00
Evan Rusackas	3b2e73592d	fix(dataframe): handle arrays in NA check for JSON data preservation pd.isna() raises ValueError when called on arrays (lists/dicts from JSON). Use a helper function that catches this exception and returns False for array values. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>	2026-04-25 17:23:24 -07:00
Evan Rusackas	a3be280fc2	fix(result_set): preserve JSON/JSONB data as objects instead of strings This fix ensures that JSON and JSONB data from databases (like PostgreSQL) is preserved as Python objects (dicts/lists) when converting result sets to pandas DataFrames. Previously, nested data types were being stringified, which broke features like Handlebars templates that need to access JSON data as objects rather than strings. The fix works by: 1. Tracking columns with nested/JSON data before stringification 2. Restoring the original Python objects when converting to pandas Fixes #25125 Co-Authored-By: Claude <noreply@anthropic.com>	2026-04-25 17:23:23 -07:00