feat(reports): allow custom na values (#35481)

Co-authored-by: bito-code-review[bot] <188872107+bito-code-review[bot]@users.noreply.github.com>
Co-authored-by: Beto Dealmeida <roberto@dealmeida.net>
This commit is contained in:
SkinnyPigeon
2025-10-21 19:05:57 +02:00
committed by GitHub
parent b4a8acc584
commit 58758de93d
4 changed files with 208 additions and 1 deletions

View File

@@ -30,6 +30,7 @@ from typing import Any, Optional, TYPE_CHECKING, Union
import numpy as np
import pandas as pd
from flask import current_app
from flask_babel import gettext as __
from superset.common.chart_data import ChartDataResultFormat
@@ -340,7 +341,15 @@ def apply_client_processing( # noqa: C901
if query["result_format"] == ChartDataResultFormat.JSON:
df = pd.DataFrame.from_dict(data)
elif query["result_format"] == ChartDataResultFormat.CSV:
df = pd.read_csv(StringIO(data))
# Use custom NA values configuration for
# reports to avoid unwanted conversions
# This allows users to control which values should be treated as null/NA
na_values = current_app.config["REPORTS_CSV_NA_NAMES"]
df = pd.read_csv(
StringIO(data),
keep_default_na=na_values is None,
na_values=na_values,
)
# convert all columns to verbose (label) name
if datasource:

View File

@@ -1354,6 +1354,15 @@ ALLOWED_USER_CSV_SCHEMA_FUNC = allowed_schemas_for_csv_upload
# Values that should be treated as nulls for the csv uploads.
# Defaults to pandas' full set of NA strings (STR_NA_VALUES).
CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES)
# Values that should be treated as nulls for scheduled reports CSV processing.
# If not set or None, defaults to standard pandas NA handling behavior
# (keep_default_na=True). Set to a custom list to control exactly which
# values should be treated as null — pandas' built-in NA strings are then
# disabled and only the listed values are converted.
# Examples:
# REPORTS_CSV_NA_NAMES = None # Use default pandas NA handling (backwards compatible)
# REPORTS_CSV_NA_NAMES = [] # Disable all automatic NA conversion
# REPORTS_CSV_NA_NAMES = ["", "NULL", "null"] # Only treat these specific values as NA
REPORTS_CSV_NA_NAMES: list[str] | None = None
# Chunk size for reading CSV files during uploads
# Smaller values use less memory but may be slower for large files
READ_CSV_CHUNK_SIZE = 1000

View File

@@ -24,6 +24,7 @@ from sqlalchemy.orm.session import Session
from superset.charts.client_processing import apply_client_processing, pivot_df, table
from superset.common.chart_data import ChartDataResultFormat
from superset.utils.core import GenericDataType
from tests.conftest import with_config
def test_pivot_df_no_cols_no_rows_single_metric():
@@ -2653,3 +2654,137 @@ def test_pivot_multi_level_index():
| ('Total (Sum)', '', '') | 210 | 105 | 0 |
""".strip()
)
@with_config({"REPORTS_CSV_NA_NAMES": []})
def test_apply_client_processing_csv_format_preserves_na_strings():
    """
    Test that apply_client_processing preserves "NA" when REPORTS_CSV_NA_NAMES is [].

    With an empty NA-names list, scheduled reports keep strings such as "NA"
    as literal values rather than converting them to null.
    """
    # CSV payload containing an "NA" string that must survive processing
    raw_csv = "first_name,last_name\nJeff,Smith\nAlice,NA"
    chart_result = {
        "queries": [
            {
                "result_format": ChartDataResultFormat.CSV,
                "data": raw_csv,
            }
        ]
    }
    form_data = {
        "datasource": "1__table",
        "viz_type": "table",
        "slice_id": 1,
        "url_params": {},
        "metrics": [],
        "groupby": [],
        "columns": ["first_name", "last_name"],
        "extra_form_data": {},
        "force": False,
        "result_format": "csv",
        "result_type": "results",
    }
    # REPORTS_CSV_NA_NAMES == [] disables every automatic NA conversion
    processed = apply_client_processing(chart_result, form_data)
    csv_out = processed["queries"][0]["data"]
    # "NA" must still be present as a literal string, not nulled out
    assert "NA" in csv_out
    rows = csv_out.strip().split("\n")
    # rows[0] is the header; rows[2] is the second data row
    assert "Alice,NA" in rows[2]
@with_config({"REPORTS_CSV_NA_NAMES": ["MISSING"]})
def test_apply_client_processing_csv_format_custom_na_values():
    """
    Test that apply_client_processing respects custom NA values configuration.

    With REPORTS_CSV_NA_NAMES = ["MISSING"], only the literal string
    "MISSING" is converted to null; every other value is preserved.
    """
    csv_data = "name,status\nJeff,MISSING\nAlice,OK"
    result = {
        "queries": [
            {
                "result_format": ChartDataResultFormat.CSV,
                "data": csv_data,
            }
        ]
    }
    form_data = {
        "datasource": "1__table",
        "viz_type": "table",
        "slice_id": 1,
        "url_params": {},
        "metrics": [],
        "groupby": [],
        "columns": ["name", "status"],
        "extra_form_data": {},
        "force": False,
        "result_format": "csv",
        "result_type": "results",
    }
    # Test with custom NA values - only "MISSING" should be treated as NA
    processed_result = apply_client_processing(result, form_data)
    output_data = processed_result["queries"][0]["data"]
    lines = output_data.strip().split("\n")
    assert len(lines) >= 3  # header + 2 data rows
    # "MISSING" must actually be converted to null; without this check the
    # substring assertions below would pass even if no conversion happened
    # ("Jeff,MISSING" also contains "Jeff,").
    assert "MISSING" not in output_data
    assert "Jeff," in lines[1]  # First data row should have empty status after "Jeff,"
    assert "Alice,OK" in lines[2]  # Second data row should preserve "OK"
@with_config({"REPORTS_CSV_NA_NAMES": None})
def test_apply_client_processing_csv_format_default_na_behavior():
    """
    Test that apply_client_processing uses default pandas NA behavior
    when REPORTS_CSV_NA_NAMES is not configured.

    This ensures backwards compatibility.
    """
    # NOTE: the config must be None here (not []): None means "use pandas'
    # default NA handling", which is exactly what this test asserts; [] would
    # disable conversion entirely and contradict the assertions below.
    # CSV data with "NA" string that should be converted to null in default behavior
    csv_data = "first_name,last_name\nJeff,Smith\nAlice,NA"
    result = {
        "queries": [
            {
                "result_format": ChartDataResultFormat.CSV,
                "data": csv_data,
            }
        ]
    }
    form_data = {
        "datasource": "1__table",
        "viz_type": "table",
        "slice_id": 1,
        "url_params": {},
        "metrics": [],
        "groupby": [],
        "columns": ["first_name", "last_name"],
        "extra_form_data": {},
        "force": False,
        "result_format": "csv",
        "result_type": "results",
    }
    processed_result = apply_client_processing(result, form_data)
    # Verify the CSV data has "NA" converted to empty (default pandas behavior)
    output_data = processed_result["queries"][0]["data"]
    lines = output_data.strip().split("\n")
    assert len(lines) >= 3  # header + 2 data rows
    # The "NA" should be converted to empty by default pandas behavior;
    # assert the literal is gone, since "Alice," alone is also a substring
    # of the unconverted "Alice,NA" row.
    assert "Alice,NA" not in output_data
    assert (
        "Alice," in lines[2]
    )  # Second data row should have empty last_name (NA converted to null)

View File

@@ -127,6 +127,21 @@ def fake_get_chart_csv_data_hierarchical(chart_url, auth_cookies=None):
return json.dumps(fake_result).encode("utf-8")
def fake_get_chart_csv_data_with_na_values(chart_url, auth_cookies=None):
    # Stub for get_chart_csv_data: returns a JSON payload whose data holds a
    # literal "NA" string value that will be treated as null by default
    # pandas CSV parsing.
    payload = {
        "result": [
            {
                "data": {
                    "first_name": ["Jeff", "Alice"],
                    "last_name": ["Smith", "NA"],
                },
                "coltypes": [GenericDataType.STRING, GenericDataType.STRING],
                "colnames": ["first_name", "last_name"],
                "indexnames": ["idx1", "idx2"],
            }
        ]
    }
    return json.dumps(payload).encode("utf-8")
def test_df_to_escaped_csv():
df = pd.DataFrame(
data={
@@ -263,3 +278,42 @@ def test_get_chart_dataframe_with_hierarchical_columns(monkeypatch: pytest.Monke
| ('idx',) | 2 |
"""
assert markdown_str.strip() == expected_markdown_str.strip()
def test_get_chart_dataframe_preserves_na_string_values(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Test that get_chart_dataframe currently preserves rows containing "NA"
    string values.

    This test verifies the existing behavior before implementing custom NA handling.
    """
    monkeypatch.setattr(
        csv, "get_chart_csv_data", fake_get_chart_csv_data_with_na_values
    )
    df = get_chart_dataframe("http://dummy-url")
    assert df is not None
    # Columns and index come back as single-level MultiIndexes.
    pd.testing.assert_index_equal(
        df.columns, pd.MultiIndex.from_tuples([("first_name",), ("last_name",)])
    )
    pd.testing.assert_index_equal(
        df.index, pd.MultiIndex.from_tuples([("idx1",), ("idx2",)])
    )
    # Both rows survive — nothing was dropped as null.
    assert len(df) == 2
    # Full-column comparison: "NA" is preserved as a string, not NaN.
    pd.testing.assert_series_equal(
        df[("first_name",)],
        pd.Series(["Jeff", "Alice"], name=("first_name",), index=df.index),
    )
    pd.testing.assert_series_equal(
        df[("last_name",)],
        pd.Series(["Smith", "NA"], name=("last_name",), index=df.index),
    )
    # Spot-check the raw cell values as well.
    surnames = df[("last_name",)].values
    assert surnames[0] == "Smith"
    assert surnames[1] == "NA"