Files
superset2/tests/unit_tests/result_set_test.py
Evan Rusackas 884681fa13 fix(result_set): preserve JSON/JSONB data as objects instead of strings
This fix ensures that JSON and JSONB data from databases (like PostgreSQL)
is preserved as Python objects (dicts/lists) when converting result sets
to pandas DataFrames. Previously, nested data types were being stringified,
which broke features like Handlebars templates that need to access JSON
data as objects rather than strings.

The fix works by:
1. Tracking columns with nested/JSON data before stringification
2. Restoring the original Python objects when converting to pandas

Fixes #25125

Co-Authored-By: Claude <noreply@anthropic.com>
2026-02-23 16:27:59 -08:00

276 lines
8.9 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=import-outside-toplevel, unused-argument
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from numpy.core.multiarray import array
from pytest_mock import MockerFixture
from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import stringify_values, SupersetResultSet
from superset.superset_typing import DbapiResult
def test_column_names_as_bytes() -> None:
    """
    Test that we can handle column names as bytes.
    """
    from superset.db_engine_specs.redshift import RedshiftEngineSpec
    from superset.result_set import SupersetResultSet

    rows = (
        [
            "2016-01-26",
            392.002014,
            397.765991,
            390.575012,
            392.153015,
            392.153015,
            58147000,
        ],
        [
            "2016-01-27",
            392.444,
            396.842987,
            391.782013,
            394.971985,
            394.971985,
            47424400,
        ],
    )
    # Byte-string column names with their DBAPI type codes, padded out to the
    # seven-tuple shape a cursor description uses.
    column_specs = [
        (b"date", 1043),
        (b"open", 701),
        (b"high", 701),
        (b"low", 701),
        (b"close", 701),
        (b"adj close", 701),
        (b"volume", 20),
    ]
    cursor_description = [
        (name, type_code, None, None, None, None, None)
        for name, type_code in column_specs
    ]
    result_set = SupersetResultSet(rows, cursor_description, RedshiftEngineSpec)  # type: ignore
    expected_markdown = """
| | date | open | high | low | close | adj close | volume |
|---:|:-----------|--------:|--------:|--------:|--------:|------------:|---------:|
| 0 | 2016-01-26 | 392.002 | 397.766 | 390.575 | 392.153 | 392.153 | 58147000 |
| 1 | 2016-01-27 | 392.444 | 396.843 | 391.782 | 394.972 | 394.972 | 47424400 |
""".strip()
    assert result_set.to_pandas_df().to_markdown() == expected_markdown
def test_stringify_with_null_integers():
    """
    Test that we can safely handle type errors when an integer column has a null value
    """
    records = [
        ("foo", "bar", pd.NA, None),
        ("foo", "bar", pd.NA, True),
        ("foo", "bar", pd.NA, None),
    ]
    field_names = ["id", "value", "num", "bool"]
    # Build a structured array where every field is a plain object column.
    structured = np.array(records, dtype=[(name, "object") for name in field_names])
    actual = np.array(
        [stringify_values(structured[name]) for name in field_names]
    )
    # Nulls (pd.NA / None) must pass through untouched; real values are stringified.
    expected = np.array(
        [
            np.array(["foo"] * 3, dtype=object),
            np.array(["bar"] * 3, dtype=object),
            np.array([None] * 3, dtype=object),
            np.array([None, "True", None], dtype=object),
        ]
    )
    assert np.array_equal(actual, expected)
def test_stringify_with_null_timestamps():
    """
    Test that we can safely handle type errors when a timestamp column has a null value
    """
    records = [
        ("foo", "bar", pd.NaT, None),
        ("foo", "bar", pd.NaT, True),
        ("foo", "bar", pd.NaT, None),
    ]
    field_names = ["id", "value", "num", "bool"]
    # Build a structured array where every field is a plain object column.
    structured = np.array(records, dtype=[(name, "object") for name in field_names])
    actual = np.array(
        [stringify_values(structured[name]) for name in field_names]
    )
    # Nulls (pd.NaT / None) must pass through untouched; real values are stringified.
    expected = np.array(
        [
            np.array(["foo"] * 3, dtype=object),
            np.array(["bar"] * 3, dtype=object),
            np.array([None] * 3, dtype=object),
            np.array([None, "True", None], dtype=object),
        ]
    )
    assert np.array_equal(actual, expected)
def test_timezone_series(mocker: MockerFixture) -> None:
    """
    Test that we can handle timezone-aware datetimes correctly.
    This covers a regression that happened when upgrading from Pandas 1.5.3 to 2.0.3.
    """
    mock_logger = mocker.patch("superset.result_set.logger")
    rows = [[datetime(2023, 1, 1, tzinfo=timezone.utc)]]
    cursor_description = [(b"__time", "datetime", None, None, None, None, False)]
    result_set = SupersetResultSet(
        rows,
        cursor_description,  # type: ignore
        BaseEngineSpec,
    )
    expected = [[pd.Timestamp("2023-01-01 00:00:00+0000", tz="UTC")]]
    assert result_set.to_pandas_df().values.tolist() == expected
    # No conversion errors should have been swallowed and logged.
    mock_logger.exception.assert_not_called()
def test_get_column_description_from_empty_data_using_cursor_description(
    mocker: MockerFixture,
) -> None:
    """
    Test that we can handle get_column_description from the cursor description
    when data is empty
    """
    mock_logger = mocker.patch("superset.result_set.logger")
    empty_rows: DbapiResult = []
    cursor_description = [(b"__time", "datetime", None, None, None, None, 1, 0, 255)]
    result_set = SupersetResultSet(
        empty_rows,
        cursor_description,  # type: ignore
        BaseEngineSpec,
    )
    # Column metadata must still be derived from the cursor description alone.
    column_names = [col.get("column_name") for col in result_set.columns]
    assert "__time" in column_names
    mock_logger.exception.assert_not_called()
def test_json_data_type_preserved_as_objects() -> None:
    """
    Test that JSON/JSONB data is preserved as Python objects (dicts/lists)
    instead of being converted to strings.
    This is important for Handlebars templates and other features that need
    to access JSON data as objects rather than strings.
    See: https://github.com/apache/superset/issues/25125
    """
    # Simulate data from PostgreSQL JSONB column - psycopg2 returns dicts
    rows = [
        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
        (3, None, "text3"),
        (4, {"mixed": "string"}, "text4"),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("json_col", 3802, None, None, None, None, None),  # JSONB
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]
    result_set = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # JSON column should be preserved as Python objects, not strings
    json_series = df["json_col"]
    assert isinstance(json_series.iloc[0], dict)
    assert json_series.iloc[0] == {"key": "value1", "nested": {"a": 1}}
    assert json_series.iloc[1] == {"key": "value2", "items": [1, 2, 3]}
    assert json_series.iloc[2] is None
    assert json_series.iloc[3] == {"mixed": "string"}

    # Verify the data can be serialized to JSON (as it would be for API response)
    from superset.utils import json as superset_json

    round_tripped = superset_json.loads(
        superset_json.dumps(df.to_dict(orient="records"))
    )
    assert round_tripped[0]["json_col"]["key"] == "value1"
    assert round_tripped[0]["json_col"]["nested"]["a"] == 1
    assert round_tripped[1]["json_col"]["items"] == [1, 2, 3]
def test_json_data_with_homogeneous_structure() -> None:
    """
    Test that JSON data with consistent structure is also preserved as objects.
    """
    # All rows have the same JSON structure
    rows = [
        (1, {"name": "Alice", "age": 30}),
        (2, {"name": "Bob", "age": 25}),
        (3, {"name": "Charlie", "age": 35}),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("data", 3802, None, None, None, None, None),
    ]
    result_set = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # Should be preserved as dicts
    first, second = df["data"].iloc[0], df["data"].iloc[1]
    assert isinstance(first, dict)
    assert first["name"] == "Alice"
    assert second["age"] == 25
def test_array_data_type_preserved() -> None:
    """
    Test that array data is also preserved as Python lists.
    """
    rows = [
        (1, [1, 2, 3]),
        (2, [4, 5, 6]),
        (3, None),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
    ]
    result_set = SupersetResultSet(rows, cursor_description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # Arrays should be preserved as lists
    arr_series = df["arr"]
    assert isinstance(arr_series.iloc[0], list)
    assert arr_series.iloc[0] == [1, 2, 3]
    assert arr_series.iloc[2] is None