diff --git a/superset/dataframe.py b/superset/dataframe.py
index 5f3c0dc7798..0e7cba0bc3c 100644
--- a/superset/dataframe.py
+++ b/superset/dataframe.py
@@ -41,6 +41,9 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
     """
     Convert a DataFrame to a set of records.
 
+    Missing scalar values (NaN, NaT, pd.NA) are converted to None for JSON
+    compatibility; non-scalar values (lists, dicts) are not treated as missing.
+
     :param dframe: the DataFrame to convert
     :returns: a list of dictionaries reflecting each single row of the DataFrame
     """
@@ -52,6 +55,9 @@ def df_to_records(dframe: pd.DataFrame) -> list[dict[str, Any]]:
 
     for record in records:
         for key in record:
-            record[key] = _convert_big_integers(record[key])
+            value = record[key]
+            # pd.isna() is element-wise on list-likes, so only null-check scalars
+            is_missing = pd.api.types.is_scalar(value) and pd.isna(value)
+            record[key] = None if is_missing else _convert_big_integers(value)
 
     return records
diff --git a/tests/unit_tests/dataframe_test.py b/tests/unit_tests/dataframe_test.py
index 0443bc1461c..934edea2047 100644
--- a/tests/unit_tests/dataframe_test.py
+++ b/tests/unit_tests/dataframe_test.py
@@ -17,18 +17,19 @@
 # pylint: disable=unused-argument, import-outside-toplevel
 from datetime import datetime
 
+import numpy as np
 import pytest
-from pandas import Timestamp
+from pandas import DataFrame, Timestamp
 from pandas._libs.tslibs import NaT
 
 from superset.dataframe import df_to_records
+from superset.db_engine_specs import BaseEngineSpec
+from superset.result_set import SupersetResultSet
 from superset.superset_typing import DbapiDescription
+from superset.utils import json as superset_json
 
 
 def test_df_to_records() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
     cursor_descr: DbapiDescription = [
         (column, "string", None, None, None, None, False) for column in ("a", "b", "c")
@@ -43,9 +44,6 @@ def test_df_to_records() -> None:
 
 
 def test_df_to_records_NaT_type() -> None:  # noqa: N802
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     data = [(NaT,), (Timestamp("2023-01-06 20:50:31.749000+0000", tz="UTC"),)]
     cursor_descr: DbapiDescription = [
         ("date", "timestamp with time zone", None, None, None, None, False)
@@ -60,9 +58,6 @@ def test_df_to_records_NaT_type() -> None:  # noqa: N802
 
 
 def test_df_to_records_mixed_emoji_type() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     data = [
         ("What's up?", "This is a string text", 1),
         ("What's up?", "This is a string with an 😍 added", 2),
@@ -100,9 +95,6 @@ def test_df_to_records_mixed_emoji_type() -> None:
 
 
 def test_df_to_records_mixed_accent_type() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     data = [
         ("What's up?", "This is a string text", 1),
         ("What's up?", "This is a string with áccent", 2),
@@ -140,9 +132,6 @@ def test_df_to_records_mixed_accent_type() -> None:
 
 
 def test_js_max_int() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
     cursor_descr: DbapiDescription = [
         ("a", "int", None, None, None, None, False),
@@ -192,9 +181,6 @@ def test_js_max_int() -> None:
     ],
 )
 def test_max_pandas_timestamp(input_, expected) -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
     cursor_descr: DbapiDescription = [
         ("a", "datetime", None, None, None, None, False),
         ("b", "int", None, None, None, None, False),
@@ -203,3 +189,187 @@ def test_max_pandas_timestamp(input_, expected) -> None:
     df = results.to_pandas_df()
 
     assert df_to_records(df) == expected
+
+
+def test_df_to_records_with_nan_from_division_by_zero() -> None:
+    """Test that NaN values from division by zero are converted to None."""
+    # Simulate Athena query: select 0.00 / 0.00 as test
+    data = [(np.nan,), (5.0,), (np.nan,)]
+    cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"test": None},
+        {"test": 5.0},
+        {"test": None},
+    ]
+
+
+def test_df_to_records_with_mixed_nan_and_valid_values() -> None:
+    """Test that NaN values are properly handled alongside valid numeric data."""
+
+    # Simulate a query with multiple columns containing NaN values
+    data = [
+        ("row1", 10.5, np.nan, 100),
+        ("row2", np.nan, 20.3, 200),
+        ("row3", 30.7, 40.2, np.nan),
+        ("row4", np.nan, np.nan, np.nan),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("name", "varchar", None, None, None, None, False),
+        ("value1", "double", None, None, None, None, False),
+        ("value2", "double", None, None, None, None, False),
+        ("value3", "int", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"name": "row1", "value1": 10.5, "value2": None, "value3": 100},
+        {"name": "row2", "value1": None, "value2": 20.3, "value3": 200},
+        {"name": "row3", "value1": 30.7, "value2": 40.2, "value3": None},
+        {"name": "row4", "value1": None, "value2": None, "value3": None},
+    ]
+
+
+def test_df_to_records_with_inf_and_nan() -> None:
+    """Test that both NaN and infinity values are handled correctly."""
+    # Test various edge cases: NaN, positive infinity, negative infinity
+    data = [
+        (np.nan, "division by zero"),
+        (np.inf, "positive infinity"),
+        (-np.inf, "negative infinity"),
+        (0.0, "zero"),
+        (42.5, "normal value"),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("result", "double", None, None, None, None, False),
+        ("description", "varchar", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    records = df_to_records(df)
+
+    # NaN should be converted to None
+    assert records[0]["result"] is None
+    assert records[0]["description"] == "division by zero"
+
+    # Infinity is not missing data, so it passes through unchanged (not strict JSON)
+    assert records[1]["result"] == np.inf
+    assert records[2]["result"] == -np.inf
+
+    # Normal values should remain unchanged
+    assert records[3]["result"] == 0.0
+    assert records[4]["result"] == 42.5
+
+
+def test_df_to_records_nan_json_serialization() -> None:
+    """
+    Test that NaN values are properly converted to None for JSON serialization.
+
+    Without the pd.isna() check, np.nan values would be passed through to JSON
+    serialization, which either produces non-spec-compliant output or requires
+    special handling with ignore_nan flags throughout the codebase.
+
+    This test validates that our fix converts NaN to None for proper JSON
+    serialization.
+    """
+    # Simulate Athena query: SELECT 0.00 / 0.00 as test
+    data = [(np.nan,), (5.0,), (np.nan,)]
+    cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    # Get records with our fix
+    records = df_to_records(df)
+
+    # Verify NaN values are converted to None
+    assert records == [
+        {"test": None},  # NaN converted to None
+        {"test": 5.0},
+        {"test": None},  # NaN converted to None
+    ]
+
+    # This should succeed with valid, spec-compliant JSON
+    json_output = superset_json.dumps(records)
+    parsed = superset_json.loads(json_output)
+
+    # Verify JSON serialization works correctly
+    assert parsed == records
+
+    # Demonstrate what happens WITHOUT the fix
+    # (simulate the old behavior by directly using to_dict)
+    records_without_fix = df.to_dict(orient="records")
+
+    # Verify the records contain actual NaN values (not None)
+    assert np.isnan(records_without_fix[0]["test"])
+    assert records_without_fix[1]["test"] == 5.0
+    assert np.isnan(records_without_fix[2]["test"])
+
+    # Demonstrate the actual bug: without the fix, ignore_nan=False raises ValueError
+    # This is the error users would see without our fix
+    with pytest.raises(
+        ValueError, match="Out of range float values are not JSON compliant"
+    ):
+        superset_json.dumps(records_without_fix, ignore_nan=False)
+
+    # With ignore_nan=True, it works by converting NaN to null
+    # But this requires the flag to be set everywhere - our fix eliminates this need
+    json_with_ignore = superset_json.dumps(records_without_fix, ignore_nan=True)
+    parsed_with_ignore = superset_json.loads(json_with_ignore)
+    # The output is the same, but our fix doesn't require the ignore_nan flag
+    assert parsed_with_ignore[0]["test"] is None
+
+
+def test_df_to_records_with_json_serialization_like_sql_lab() -> None:
+    """
+    Test that mimics the actual SQL Lab serialization flow.
+    This shows how the fix prevents errors in the real usage path.
+    """
+    # Simulate query with NaN results
+    data = [
+        ("user1", 100.0, np.nan),
+        ("user2", np.nan, 50.0),
+        ("user3", 75.0, 25.0),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("name", "varchar", None, None, None, None, False),
+        ("value1", "double", None, None, None, None, False),
+        ("value2", "double", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    # Mimic sql_lab.py:360 - this is where df_to_records is used
+    records = df_to_records(df) or []
+
+    # Mimic sql_lab.py:332 - JSON serialization with Superset's custom json.dumps
+    # This should work without errors
+    json_str = superset_json.dumps(
+        records, default=superset_json.json_iso_dttm_ser, ignore_nan=True
+    )
+
+    # Verify it's valid JSON and NaN values are properly handled as null
+    parsed = superset_json.loads(json_str)
+    assert parsed[0]["value2"] is None  # NaN became null
+    assert parsed[1]["value1"] is None  # NaN became null
+    assert parsed[0]["value1"] == 100.0
+
+    # Also verify it works without ignore_nan flag (since we convert NaN to None)
+    json_str_no_flag = superset_json.dumps(
+        records, default=superset_json.json_iso_dttm_ser, ignore_nan=False
+    )
+    parsed_no_flag = superset_json.loads(json_str_no_flag)
+    assert parsed_no_flag == parsed  # Same result
+
+
+def test_df_to_records_with_list_like_values() -> None:
+    """Test that list and dict cells are not mistaken for missing values."""
+    df = DataFrame({"tags": [["a", "b"], []], "meta": [{"k": 1}, None]})
+
+    assert df_to_records(df) == [
+        {"tags": ["a", "b"], "meta": {"k": 1}},
+        {"tags": [], "meta": None},
+    ]