fix(api): nan is not properly handled for athena connections (#37071)

2026-04-19 16:14:52 +00:00 · 2026-01-22 11:29:09 -04:00
parent cc972cad5a
commit fadab21493
2 changed files with 184 additions and 19 deletions
--- a/tests/unit_tests/dataframe_test.py
+++ b/tests/unit_tests/dataframe_test.py
@@ -17,18 +17,19 @@
 # pylint: disable=unused-argument, import-outside-toplevel
 from datetime import datetime

+import numpy as np
 import pytest
 from pandas import Timestamp
 from pandas._libs.tslibs import NaT

 from superset.dataframe import df_to_records
+from superset.db_engine_specs import BaseEngineSpec
+from superset.result_set import SupersetResultSet
 from superset.superset_typing import DbapiDescription
+from superset.utils import json as superset_json


 def test_df_to_records() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    data = [("a1", "b1", "c1"), ("a2", "b2", "c2")]
    cursor_descr: DbapiDescription = [
        (column, "string", None, None, None, None, False) for column in ("a", "b", "c")
@@ -43,9 +44,6 @@ def test_df_to_records() -> None:


 def test_df_to_records_NaT_type() -> None:  # noqa: N802
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    data = [(NaT,), (Timestamp("2023-01-06 20:50:31.749000+0000", tz="UTC"),)]
    cursor_descr: DbapiDescription = [
        ("date", "timestamp with time zone", None, None, None, None, False)
@@ -60,9 +58,6 @@ def test_df_to_records_NaT_type() -> None:  # noqa: N802


 def test_df_to_records_mixed_emoji_type() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    data = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with an 😍 added", 2),
@@ -100,9 +95,6 @@ def test_df_to_records_mixed_emoji_type() -> None:


 def test_df_to_records_mixed_accent_type() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    data = [
        ("What's up?", "This is a string text", 1),
        ("What's up?", "This is a string with áccent", 2),
@@ -140,9 +132,6 @@ def test_df_to_records_mixed_accent_type() -> None:


 def test_js_max_int() -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    data = [(1, 1239162456494753670, "c1"), (2, 100, "c2")]
    cursor_descr: DbapiDescription = [
        ("a", "int", None, None, None, None, False),
@@ -192,9 +181,6 @@ def test_js_max_int() -> None:
    ],
 )
 def test_max_pandas_timestamp(input_, expected) -> None:
-    from superset.db_engine_specs import BaseEngineSpec
-    from superset.result_set import SupersetResultSet
-
    cursor_descr: DbapiDescription = [
        ("a", "datetime", None, None, None, None, False),
        ("b", "int", None, None, None, None, False),
@@ -203,3 +189,177 @@ def test_max_pandas_timestamp(input_, expected) -> None:
    df = results.to_pandas_df()

    assert df_to_records(df) == expected
+
+
+def test_df_to_records_with_nan_from_division_by_zero() -> None:
+    """Test that NaN values from division by zero are converted to None."""
+    # Simulate Athena query: select 0.00 / 0.00 as test
+    data = [(np.nan,), (5.0,), (np.nan,)]
+    cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"test": None},
+        {"test": 5.0},
+        {"test": None},
+    ]
+
+
+def test_df_to_records_with_mixed_nan_and_valid_values() -> None:
+    """Test that NaN values are properly handled alongside valid numeric data."""
+
+    # Simulate a query with multiple columns containing NaN values
+    data = [
+        ("row1", 10.5, np.nan, 100),
+        ("row2", np.nan, 20.3, 200),
+        ("row3", 30.7, 40.2, np.nan),
+        ("row4", np.nan, np.nan, np.nan),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("name", "varchar", None, None, None, None, False),
+        ("value1", "double", None, None, None, None, False),
+        ("value2", "double", None, None, None, None, False),
+        ("value3", "int", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    assert df_to_records(df) == [
+        {"name": "row1", "value1": 10.5, "value2": None, "value3": 100},
+        {"name": "row2", "value1": None, "value2": 20.3, "value3": 200},
+        {"name": "row3", "value1": 30.7, "value2": 40.2, "value3": None},
+        {"name": "row4", "value1": None, "value2": None, "value3": None},
+    ]
+
+
+def test_df_to_records_with_inf_and_nan() -> None:
+    """Test that both NaN and infinity values are handled correctly."""
+    # Test various edge cases: NaN, positive infinity, negative infinity
+    data = [
+        (np.nan, "division by zero"),
+        (np.inf, "positive infinity"),
+        (-np.inf, "negative infinity"),
+        (0.0, "zero"),
+        (42.5, "normal value"),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("result", "double", None, None, None, None, False),
+        ("description", "varchar", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    records = df_to_records(df)
+
+    # NaN should be converted to None
+    assert records[0]["result"] is None
+    assert records[0]["description"] == "division by zero"
+
+    # Infinity values should remain as-is (they're valid JSON)
+    assert records[1]["result"] == np.inf
+    assert records[2]["result"] == -np.inf
+
+    # Normal values should remain unchanged
+    assert records[3]["result"] == 0.0
+    assert records[4]["result"] == 42.5
+
+
+def test_df_to_records_nan_json_serialization() -> None:
+    """
+    Test that NaN values are properly converted to None for JSON serialization.
+
+    Without the pd.isna() check, np.nan values would be passed through to JSON
+    serialization, which either produces non-spec-compliant output or requires
+    special handling with ignore_nan flags throughout the codebase.
+
+    This test validates that our fix converts NaN to None for proper JSON
+    serialization.
+    """
+    # Simulate Athena query: SELECT 0.00 / 0.00 as test
+    data = [(np.nan,), (5.0,), (np.nan,)]
+    cursor_descr: DbapiDescription = [("test", "double", None, None, None, None, False)]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    # Get records with our fix
+    records = df_to_records(df)
+
+    # Verify NaN values are converted to None
+    assert records == [
+        {"test": None},  # NaN converted to None
+        {"test": 5.0},
+        {"test": None},  # NaN converted to None
+    ]
+
+    # This should succeed with valid, spec-compliant JSON
+    json_output = superset_json.dumps(records)
+    parsed = superset_json.loads(json_output)
+
+    # Verify JSON serialization works correctly
+    assert parsed == records
+
+    # Demonstrate what happens WITHOUT the fix
+    # (simulate the old behavior by directly using to_dict)
+    records_without_fix = df.to_dict(orient="records")
+
+    # Verify the records contain actual NaN values (not None)
+    assert np.isnan(records_without_fix[0]["test"])
+    assert records_without_fix[1]["test"] == 5.0
+    assert np.isnan(records_without_fix[2]["test"])
+
+    # Demonstrate the actual bug: without the fix, ignore_nan=False raises ValueError
+    # This is the error users would see without our fix
+    with pytest.raises(
+        ValueError, match="Out of range float values are not JSON compliant"
+    ):
+        superset_json.dumps(records_without_fix, ignore_nan=False)
+
+    # With ignore_nan=True, it works by converting NaN to null
+    # But this requires the flag to be set everywhere - our fix eliminates this need
+    json_with_ignore = superset_json.dumps(records_without_fix, ignore_nan=True)
+    parsed_with_ignore = superset_json.loads(json_with_ignore)
+    # The output is the same, but our fix doesn't require the ignore_nan flag
+    assert parsed_with_ignore[0]["test"] is None
+
+
+def test_df_to_records_with_json_serialization_like_sql_lab() -> None:
+    """
+    Test that mimics the actual SQL Lab serialization flow.
+    This shows how the fix prevents errors in the real usage path.
+    """
+    # Simulate query with NaN results
+    data = [
+        ("user1", 100.0, np.nan),
+        ("user2", np.nan, 50.0),
+        ("user3", 75.0, 25.0),
+    ]
+    cursor_descr: DbapiDescription = [
+        ("name", "varchar", None, None, None, None, False),
+        ("value1", "double", None, None, None, None, False),
+        ("value2", "double", None, None, None, None, False),
+    ]
+    results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
+    df = results.to_pandas_df()
+
+    # Mimic sql_lab.py:360 - this is where df_to_records is used
+    records = df_to_records(df) or []
+
+    # Mimic sql_lab.py:332 - JSON serialization with Superset's custom json.dumps
+    # This should work without errors
+    json_str = superset_json.dumps(
+        records, default=superset_json.json_iso_dttm_ser, ignore_nan=True
+    )
+
+    # Verify it's valid JSON and NaN values are properly handled as null
+    parsed = superset_json.loads(json_str)
+    assert parsed[0]["value2"] is None  # NaN became null
+    assert parsed[1]["value1"] is None  # NaN became null
+    assert parsed[0]["value1"] == 100.0
+
+    # Also verify it works without ignore_nan flag (since we convert NaN to None)
+    json_str_no_flag = superset_json.dumps(
+        records, default=superset_json.json_iso_dttm_ser, ignore_nan=False
+    )
+    parsed_no_flag = superset_json.loads(json_str_no_flag)
+    assert parsed_no_flag == parsed  # Same result