mirror of
https://github.com/apache/superset.git
synced 2026-04-29 13:04:22 +00:00
Compare commits
5 Commits
enxdev/fea
...
fix-25125-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c45e40e939 | ||
|
|
e7763e7aa3 | ||
|
|
219a0d5866 | ||
|
|
3b2e73592d | ||
|
|
a3be280fc2 |
@@ -37,6 +37,22 @@ def _convert_big_integers(val: Any) -> Any:
|
||||
return str(val) if isinstance(val, int) and abs(val) > JS_MAX_INTEGER else val
|
||||
|
||||
|
||||
def _is_na(val: Any) -> bool:
|
||||
"""
|
||||
Check if a value is NA/NaN for scalar values only.
|
||||
|
||||
pd.isna() raises ValueError for arrays/lists, so we catch that case.
|
||||
|
||||
:param val: the value to check
|
||||
:returns: True if the value is NA/NaN, False otherwise
|
||||
"""
|
||||
try:
|
||||
return bool(pd.isna(val))
|
||||
except ValueError:
|
||||
# pd.isna raises ValueError for arrays (e.g., lists, dicts from JSON)
|
||||
return False
|
||||
|
||||
|
||||
def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Convert a DataFrame to a set of records.
|
||||
@@ -56,7 +72,7 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
|
||||
for record in records:
|
||||
for key in record:
|
||||
record[key] = (
|
||||
None if pd.isna(record[key]) else _convert_big_integers(record[key])
|
||||
None if _is_na(record[key]) else _convert_big_integers(record[key])
|
||||
)
|
||||
|
||||
return records
|
||||
|
||||
@@ -145,6 +145,8 @@ class SupersetResultSet:
|
||||
deduped_cursor_desc: list[tuple[Any, ...]] = []
|
||||
numpy_dtype: list[tuple[str, ...]] = []
|
||||
stringified_arr: NDArray[Any]
|
||||
# Track columns with nested/JSON data to preserve them as objects
|
||||
self._nested_columns: dict[str, list[Any]] = {}
|
||||
|
||||
if cursor_description:
|
||||
# get deduped list of column names
|
||||
@@ -184,6 +186,17 @@ class SupersetResultSet:
|
||||
TypeError, # this is super hackey,
|
||||
# https://issues.apache.org/jira/browse/ARROW-7855
|
||||
):
|
||||
# Check if original data has nested types (lists/dicts)
|
||||
# before stringifying, since stringification removes
|
||||
# the nested structure that the second loop relies on
|
||||
# to detect via pa.types.is_nested().
|
||||
original_values = array[column].tolist()
|
||||
if any(
|
||||
isinstance(v, (list, dict))
|
||||
for v in original_values
|
||||
if v is not None
|
||||
):
|
||||
self._nested_columns[column] = original_values
|
||||
# attempt serialization of values as strings
|
||||
stringified_arr = stringify_values(array[column])
|
||||
pa_data.append(pa.array(stringified_arr.tolist()))
|
||||
@@ -191,9 +204,11 @@ class SupersetResultSet:
|
||||
if pa_data: # pylint: disable=too-many-nested-blocks
|
||||
for i, column in enumerate(column_names):
|
||||
if pa.types.is_nested(pa_data[i].type):
|
||||
# TODO: revisit nested column serialization once nested types
|
||||
# are added as a natively supported column type in Superset
|
||||
# (superset.utils.core.GenericDataType).
|
||||
# Preserve nested/JSON data as Python objects for use in
|
||||
# templates like Handlebars. Store original values before
|
||||
# stringifying for PyArrow compatibility.
|
||||
# See: https://github.com/apache/superset/issues/25125
|
||||
self._nested_columns[column] = array[column].tolist()
|
||||
stringified_arr = stringify_values(array[column])
|
||||
pa_data[i] = pa.array(stringified_arr.tolist())
|
||||
|
||||
@@ -284,7 +299,13 @@ class SupersetResultSet:
|
||||
return None
|
||||
|
||||
def to_pandas_df(self) -> pd.DataFrame:
|
||||
return self.convert_table_to_df(self.table)
|
||||
df = self.convert_table_to_df(self.table)
|
||||
# Restore nested/JSON columns as Python objects instead of strings
|
||||
# This allows JSON data to be used directly in templates like Handlebars
|
||||
for column, values in self._nested_columns.items():
|
||||
if column in df.columns:
|
||||
df[column] = values
|
||||
return df
|
||||
|
||||
@property
|
||||
def pa_table(self) -> pa.Table:
|
||||
|
||||
@@ -226,18 +226,19 @@ class TestSupersetResultSet(SupersetTestCase):
|
||||
assert results.columns[3]["type"] == "STRING"
|
||||
assert results.columns[3]["type_generic"] == GenericDataType.STRING
|
||||
df = results.to_pandas_df()
|
||||
# JSON/JSONB data is preserved as objects instead of being stringified
|
||||
assert df_to_records(df) == [
|
||||
{
|
||||
"id": 4,
|
||||
"dict_arr": '[{"table_name": "unicode_test", "database_id": 1}]',
|
||||
"num_arr": "[1, 2, 3]",
|
||||
"map_col": "{'chart_name': 'scatter'}",
|
||||
"dict_arr": [{"table_name": "unicode_test", "database_id": 1}],
|
||||
"num_arr": [1, 2, 3],
|
||||
"map_col": {"chart_name": "scatter"},
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"dict_arr": '[{"table_name": "birth_names", "database_id": 1}]',
|
||||
"num_arr": "[4, 5, 6]",
|
||||
"map_col": "{'chart_name': 'plot'}",
|
||||
"dict_arr": [{"table_name": "birth_names", "database_id": 1}],
|
||||
"num_arr": [4, 5, 6],
|
||||
"map_col": {"chart_name": "plot"},
|
||||
},
|
||||
]
|
||||
|
||||
@@ -267,9 +268,25 @@ class TestSupersetResultSet(SupersetTestCase):
|
||||
assert results.columns[0]["type"] == "STRING"
|
||||
assert results.columns[0]["type_generic"] == GenericDataType.STRING
|
||||
df = results.to_pandas_df()
|
||||
# JSON/JSONB data is preserved as objects instead of being stringified
|
||||
assert df_to_records(df) == [
|
||||
{
|
||||
"metadata": '["test", [["foo", 123456, [[["test"], 3432546, 7657658766], [["fake"], 656756765, 324324324324]]]], ["test2", 43, 765765765], null, null]' # noqa: E501
|
||||
"metadata": [
|
||||
"test",
|
||||
[
|
||||
[
|
||||
"foo",
|
||||
123456,
|
||||
[
|
||||
[["test"], 3432546, 7657658766],
|
||||
[["fake"], 656756765, 324324324324],
|
||||
],
|
||||
]
|
||||
],
|
||||
["test2", 43, 765765765],
|
||||
None,
|
||||
None,
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
@@ -280,7 +297,8 @@ class TestSupersetResultSet(SupersetTestCase):
|
||||
assert results.columns[0]["type"] == "STRING"
|
||||
assert results.columns[0]["type_generic"] == GenericDataType.STRING
|
||||
df = results.to_pandas_df()
|
||||
assert df_to_records(df) == [{"metadata": '[{"TestKey": [123456, "foo"]}]'}]
|
||||
# JSON/JSONB data is preserved as objects instead of being stringified
|
||||
assert df_to_records(df) == [{"metadata": [{"TestKey": [123456, "foo"]}]}]
|
||||
|
||||
def test_empty_datetime(self):
|
||||
data = [(None,)]
|
||||
|
||||
@@ -244,3 +244,91 @@ def test_empty_column_names_do_not_rename_explicit_synthetic_names() -> None:
|
||||
df = result_set.to_pandas_df()
|
||||
assert list(df.columns) == ["_col_1", "_col_0"]
|
||||
assert df.iloc[0].tolist() == [10, 20]
|
||||
|
||||
|
||||
def test_json_data_type_preserved_as_objects() -> None:
    """
    JSON/JSONB values must come back from the result set as Python objects
    (dicts/lists) rather than as their string representation.

    Handlebars templates and similar features need to traverse the JSON
    structure, which is impossible once it has been stringified.

    See: https://github.com/apache/superset/issues/25125
    """
    # Rows shaped like what psycopg2 yields for a PostgreSQL JSONB column:
    # already-decoded dicts, with one NULL row mixed in.
    rows = [
        (1, {"key": "value1", "nested": {"a": 1}}, "text1"),
        (2, {"key": "value2", "items": [1, 2, 3]}, "text2"),
        (3, None, "text3"),
        (4, {"mixed": "string"}, "text4"),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),  # INT
        ("json_col", 3802, None, None, None, None, None),  # JSONB
        ("text_col", 1043, None, None, None, None, None),  # VARCHAR
    ]
    df = SupersetResultSet(
        rows, cursor_description, BaseEngineSpec  # type: ignore
    ).to_pandas_df()

    # The JSON column holds the original objects, including the NULL row.
    json_values = df["json_col"]
    assert json_values.iloc[0] == {"key": "value1", "nested": {"a": 1}}
    assert isinstance(json_values.iloc[0], dict)
    assert json_values.iloc[1] == {"key": "value2", "items": [1, 2, 3]}
    assert json_values.iloc[2] is None
    assert json_values.iloc[3] == {"mixed": "string"}

    # Round-trip through the JSON serializer, as an API response would.
    from superset.utils import json as superset_json

    payload = superset_json.dumps(df.to_dict(orient="records"))
    decoded = superset_json.loads(payload)
    first_json, second_json = decoded[0]["json_col"], decoded[1]["json_col"]
    assert first_json["key"] == "value1"
    assert first_json["nested"]["a"] == 1
    assert second_json["items"] == [1, 2, 3]
|
||||
|
||||
|
||||
def test_json_data_with_homogeneous_structure() -> None:
    """
    JSON values that all share one structure are still returned as objects.
    """
    # Every row carries a dict with the same keys.
    rows = [
        (1, {"name": "Alice", "age": 30}),
        (2, {"name": "Bob", "age": 25}),
        (3, {"name": "Charlie", "age": 35}),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("data", 3802, None, None, None, None, None),
    ]
    df = SupersetResultSet(
        rows, cursor_description, BaseEngineSpec  # type: ignore
    ).to_pandas_df()

    # Values are dicts that can be indexed by key.
    payloads = df["data"]
    assert isinstance(payloads.iloc[0], dict)
    assert payloads.iloc[0]["name"] == "Alice"
    assert payloads.iloc[1]["age"] == 25
|
||||
|
||||
|
||||
def test_array_data_type_preserved() -> None:
    """
    Array columns are returned as Python lists, with NULL rows kept as None.
    """
    rows = [
        (1, [1, 2, 3]),
        (2, [4, 5, 6]),
        (3, None),
    ]
    cursor_description = [
        ("id", 23, None, None, None, None, None),
        ("arr", 1007, None, None, None, None, None),  # INT ARRAY
    ]
    df = SupersetResultSet(
        rows, cursor_description, BaseEngineSpec  # type: ignore
    ).to_pandas_df()

    # List values survive untouched; the NULL row stays None.
    arrays = df["arr"]
    assert arrays.iloc[0] == [1, 2, 3]
    assert isinstance(arrays.iloc[0], list)
    assert arrays.iloc[2] is None
|
||||
|
||||
Reference in New Issue
Block a user