This fix ensures that JSON and JSONB data from databases (like PostgreSQL) is preserved as Python objects (dicts/lists) when converting result sets to pandas DataFrames. Previously, nested data types were being stringified, which broke features like Handlebars templates that need to access JSON data as objects rather than strings.

The fix works by:

1. Tracking columns with nested/JSON data before stringification
2. Restoring the original Python objects when converting to pandas

Fixes #25125

Co-Authored-By: Claude <noreply@anthropic.com>
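A minimal sketch of the two-step approach described above (the helper name and the stringification logic are illustrative, not Superset's actual internals):

    import pandas as pd


    def to_pandas_preserving_nested(rows, column_names):
        """Hypothetical helper: stringify values, then restore nested objects."""
        # Step 1: before stringification, record which columns hold nested
        # Python objects (dicts/lists), e.g. values from JSON/JSONB columns.
        nested = {
            i
            for i in range(len(column_names))
            if any(isinstance(row[i], (dict, list)) for row in rows)
        }

        # Stringify everything, as the previous behavior did.
        df = pd.DataFrame(
            [[None if v is None else str(v) for v in row] for row in rows],
            columns=column_names,
        )

        # Step 2: when converting to pandas, write the original objects back
        # into the columns identified as nested.
        for i in nested:
            df[column_names[i]] = [row[i] for row in rows]
        return df

With a result set like the JSONB fixtures in the tests below, `df["json_col"].iloc[0]` then stays a dict instead of becoming its string representation.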
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=import-outside-toplevel, unused-argument

from datetime import datetime, timezone

import numpy as np
import pandas as pd
from pytest_mock import MockerFixture

from superset.db_engine_specs.base import BaseEngineSpec
from superset.result_set import stringify_values, SupersetResultSet
from superset.superset_typing import DbapiResult
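
# Note on `stringify_values` (assumed semantics, inferred from the tests in
# this module rather than from the implementation): it maps each element of an
# object array to its string representation, passing NULL-like values
# (None, pd.NA, pd.NaT) through as None instead of raising. For example:
#
#     stringify_values(np.array([True, pd.NA], dtype=object))
#     # -> np.array(["True", None], dtype=object)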


def test_column_names_as_bytes() -> None:
    """
    Test that we can handle column names as bytes.
    """
    from superset.db_engine_specs.redshift import RedshiftEngineSpec
    from superset.result_set import SupersetResultSet

    data = (
        [
            "2016-01-26",
            392.002014,
            397.765991,
            390.575012,
            392.153015,
            392.153015,
            58147000,
        ],
        [
            "2016-01-27",
            392.444,
            396.842987,
            391.782013,
            394.971985,
            394.971985,
            47424400,
        ],
    )
    description = [
        (b"date", 1043, None, None, None, None, None),
        (b"open", 701, None, None, None, None, None),
        (b"high", 701, None, None, None, None, None),
        (b"low", 701, None, None, None, None, None),
        (b"close", 701, None, None, None, None, None),
        (b"adj close", 701, None, None, None, None, None),
        (b"volume", 20, None, None, None, None, None),
    ]
    result_set = SupersetResultSet(data, description, RedshiftEngineSpec)  # type: ignore

    assert (
        result_set.to_pandas_df().to_markdown()
        == """
|    | date       |    open |    high |     low |   close |   adj close |   volume |
|---:|:-----------|--------:|--------:|--------:|--------:|------------:|---------:|
|  0 | 2016-01-26 | 392.002 | 397.766 | 390.575 | 392.153 |     392.153 | 58147000 |
|  1 | 2016-01-27 | 392.444 | 396.843 | 391.782 | 394.972 |     394.972 | 47424400 |
""".strip()
    )


def test_stringify_with_null_integers():
    """
    Test that we can safely handle type errors when an integer column has a null value
    """

    data = [
        ("foo", "bar", pd.NA, None),
        ("foo", "bar", pd.NA, True),
        ("foo", "bar", pd.NA, None),
    ]
    numpy_dtype = [
        ("id", "object"),
        ("value", "object"),
        ("num", "object"),
        ("bool", "object"),
    ]

    array2 = np.array(data, dtype=numpy_dtype)
    column_names = ["id", "value", "num", "bool"]

    result_set = np.array([stringify_values(array2[column]) for column in column_names])

    expected = np.array(
        [
            np.array(["foo", "foo", "foo"], dtype=object),
            np.array(["bar", "bar", "bar"], dtype=object),
            np.array([None, None, None], dtype=object),
            np.array([None, "True", None], dtype=object),
        ]
    )

    assert np.array_equal(result_set, expected)


def test_stringify_with_null_timestamps():
    """
    Test that we can safely handle type errors when a timestamp column has a null value
    """

    data = [
        ("foo", "bar", pd.NaT, None),
        ("foo", "bar", pd.NaT, True),
        ("foo", "bar", pd.NaT, None),
    ]
    numpy_dtype = [
        ("id", "object"),
        ("value", "object"),
        ("num", "object"),
        ("bool", "object"),
    ]

    array2 = np.array(data, dtype=numpy_dtype)
    column_names = ["id", "value", "num", "bool"]

    result_set = np.array([stringify_values(array2[column]) for column in column_names])

    expected = np.array(
        [
            np.array(["foo", "foo", "foo"], dtype=object),
            np.array(["bar", "bar", "bar"], dtype=object),
            np.array([None, None, None], dtype=object),
            np.array([None, "True", None], dtype=object),
        ]
    )

    assert np.array_equal(result_set, expected)


def test_timezone_series(mocker: MockerFixture) -> None:
    """
    Test that we can handle timezone-aware datetimes correctly.

    This covers a regression that happened when upgrading from Pandas 1.5.3 to 2.0.3.
    """
    logger = mocker.patch("superset.result_set.logger")

    data = [[datetime(2023, 1, 1, tzinfo=timezone.utc)]]
    description = [(b"__time", "datetime", None, None, None, None, False)]
    result_set = SupersetResultSet(
        data,
        description,  # type: ignore
        BaseEngineSpec,
    )
    assert result_set.to_pandas_df().values.tolist() == [
        [pd.Timestamp("2023-01-01 00:00:00+0000", tz="UTC")]
    ]
    logger.exception.assert_not_called()


def test_get_column_description_from_empty_data_using_cursor_description(
    mocker: MockerFixture,
) -> None:
    """
    Test that we can handle get_column_description from the cursor description
    when data is empty
    """
    logger = mocker.patch("superset.result_set.logger")

    data: DbapiResult = []
    description = [(b"__time", "datetime", None, None, None, None, 1, 0, 255)]
    result_set = SupersetResultSet(
        data,
        description,  # type: ignore
        BaseEngineSpec,
    )
    assert any(col.get("column_name") == "__time" for col in result_set.columns)
    logger.exception.assert_not_called()


def test_json_data_type_preserved_as_objects() -> None:
    """
    Test that JSON/JSONB data is preserved as Python objects (dicts/lists)
    instead of being converted to strings.

    This is important for Handlebars templates and other features that need
    to access JSON data as objects rather than strings.

    See: https://github.com/apache/superset/issues/25125
    """
    # Simulate data from a PostgreSQL JSONB column - psycopg2 returns dicts
    data = [
        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
        (3, None, "text3"),
        (4, {"mixed": "string"}, "text4"),
    ]
    description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("json_col", 3802, None, None, None, None, None),  # JSONB
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]
    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # The JSON column should be preserved as Python objects, not strings
    assert df["json_col"].iloc[0] == {"key": "value1", "nested": {"a": 1}}
    assert isinstance(df["json_col"].iloc[0], dict)
    assert df["json_col"].iloc[1] == {"key": "value2", "items": [1, 2, 3]}
    assert df["json_col"].iloc[2] is None
    assert df["json_col"].iloc[3] == {"mixed": "string"}

    # Verify the data can be serialized to JSON (as it would be for an API response)
    from superset.utils import json as superset_json

    records = df.to_dict(orient="records")
    json_output = superset_json.dumps(records)
    parsed = superset_json.loads(json_output)
    assert parsed[0]["json_col"]["key"] == "value1"
    assert parsed[0]["json_col"]["nested"]["a"] == 1
    assert parsed[1]["json_col"]["items"] == [1, 2, 3]


def test_json_data_with_homogeneous_structure() -> None:
    """
    Test that JSON data with a consistent structure is also preserved as objects.
    """
    # All rows have the same JSON structure
    data = [
        (1, {"name": "Alice", "age": 30}),
        (2, {"name": "Bob", "age": 25}),
        (3, {"name": "Charlie", "age": 35}),
    ]
    description = [
        ("id", 23, None, None, None, None, None),
        ("data", 3802, None, None, None, None, None),
    ]
    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # Should be preserved as dicts
    assert isinstance(df["data"].iloc[0], dict)
    assert df["data"].iloc[0]["name"] == "Alice"
    assert df["data"].iloc[1]["age"] == 25


def test_array_data_type_preserved() -> None:
    """
    Test that array data is also preserved as Python lists.
    """
    data = [
        (1, [1, 2, 3]),
        (2, [4, 5, 6]),
        (3, None),
    ]
    description = [
        ("id", 23, None, None, None, None, None),
        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
    ]
    result_set = SupersetResultSet(data, description, BaseEngineSpec)  # type: ignore
    df = result_set.to_pandas_df()

    # Arrays should be preserved as lists
    assert df["arr"].iloc[0] == [1, 2, 3]
    assert isinstance(df["arr"].iloc[0], list)
    assert df["arr"].iloc[2] is None