diff --git a/superset/charts/client_processing.py b/superset/charts/client_processing.py index 1bf263f67ae..b2f443885a4 100644 --- a/superset/charts/client_processing.py +++ b/superset/charts/client_processing.py @@ -30,6 +30,7 @@ from typing import Any, Optional, TYPE_CHECKING, Union import numpy as np import pandas as pd +from flask import current_app from flask_babel import gettext as __ from superset.common.chart_data import ChartDataResultFormat @@ -340,7 +341,15 @@ def apply_client_processing( # noqa: C901 if query["result_format"] == ChartDataResultFormat.JSON: df = pd.DataFrame.from_dict(data) elif query["result_format"] == ChartDataResultFormat.CSV: - df = pd.read_csv(StringIO(data)) + # Use custom NA values configuration for + # reports to avoid unwanted conversions + # This allows users to control which values should be treated as null/NA + na_values = current_app.config["REPORTS_CSV_NA_NAMES"] + df = pd.read_csv( + StringIO(data), + keep_default_na=na_values is None, + na_values=na_values, + ) # convert all columns to verbose (label) name if datasource: diff --git a/superset/config.py b/superset/config.py index a36d1699f8b..fcb930143f4 100644 --- a/superset/config.py +++ b/superset/config.py @@ -1354,6 +1354,15 @@ ALLOWED_USER_CSV_SCHEMA_FUNC = allowed_schemas_for_csv_upload # Values that should be treated as nulls for the csv uploads. CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES) +# Values that should be treated as nulls for scheduled reports CSV processing. +# If not set or None, defaults to standard pandas NA handling behavior. +# Set to a custom list to control which values should be treated as null. +# Examples: +# REPORTS_CSV_NA_NAMES = None # Use default pandas NA handling (backwards compatible) +# REPORTS_CSV_NA_NAMES = [] # Disable all automatic NA conversion +# REPORTS_CSV_NA_NAMES = ["", "NULL", "null"] # Only treat these specific values as NA +REPORTS_CSV_NA_NAMES: list[str] | None = None + # Chunk size for reading CSV files during uploads # Smaller values use less memory but may be slower for large files READ_CSV_CHUNK_SIZE = 1000 diff --git a/tests/unit_tests/charts/test_client_processing.py b/tests/unit_tests/charts/test_client_processing.py index cecc43c45b9..7cb64c233f2 100644 --- a/tests/unit_tests/charts/test_client_processing.py +++ b/tests/unit_tests/charts/test_client_processing.py @@ -24,6 +24,7 @@ from sqlalchemy.orm.session import Session from superset.charts.client_processing import apply_client_processing, pivot_df, table from superset.common.chart_data import ChartDataResultFormat from superset.utils.core import GenericDataType +from tests.conftest import with_config def test_pivot_df_no_cols_no_rows_single_metric(): @@ -2653,3 +2654,137 @@ def test_pivot_multi_level_index(): | ('Total (Sum)', '', '') | 210 | 105 | 0 | """.strip() ) + + +@with_config({"REPORTS_CSV_NA_NAMES": []}) +def test_apply_client_processing_csv_format_preserves_na_strings(): + """ + Test that apply_client_processing preserves "NA" when REPORTS_CSV_NA_NAMES is []. + + This ensures that scheduled reports can be configured to + preserve strings like "NA" as literal values. + """ + + # CSV data with "NA" string that should be preserved + csv_data = "first_name,last_name\nJeff,Smith\nAlice,NA" + + result = { + "queries": [ + { + "result_format": ChartDataResultFormat.CSV, + "data": csv_data, + } + ] + } + + form_data = { + "datasource": "1__table", + "viz_type": "table", + "slice_id": 1, + "url_params": {}, + "metrics": [], + "groupby": [], + "columns": ["first_name", "last_name"], + "extra_form_data": {}, + "force": False, + "result_format": "csv", + "result_type": "results", + } + + # Test with REPORTS_CSV_NA_NAMES set to empty list (disable NA conversion) + + processed_result = apply_client_processing(result, form_data) + + # Verify the CSV data still contains "NA" as string, not converted to null + output_data = processed_result["queries"][0]["data"] + assert "NA" in output_data + # The "NA" should be preserved in the output CSV + lines = output_data.strip().split("\n") + assert "Alice,NA" in lines[2] # Second data row should preserve "NA" + + +@with_config({"REPORTS_CSV_NA_NAMES": ["MISSING"]}) +def test_apply_client_processing_csv_format_custom_na_values(): + """ + Test that apply_client_processing respects custom NA values configuration. + """ + + csv_data = "name,status\nJeff,MISSING\nAlice,OK" + + result = { + "queries": [ + { + "result_format": ChartDataResultFormat.CSV, + "data": csv_data, + } + ] + } + + form_data = { + "datasource": "1__table", + "viz_type": "table", + "slice_id": 1, + "url_params": {}, + "metrics": [], + "groupby": [], + "columns": ["name", "status"], + "extra_form_data": {}, + "force": False, + "result_format": "csv", + "result_type": "results", + } + + # Test with custom NA values - only "MISSING" should be treated as NA + processed_result = apply_client_processing(result, form_data) + + output_data = processed_result["queries"][0]["data"] + lines = output_data.strip().split("\n") + assert len(lines) >= 3 # header + 2 data rows + assert "Jeff," in lines[1] # First data row should have empty status after "Jeff," + assert "Alice,OK" in lines[2] # Second data row should preserve "OK" + + +@with_config({"REPORTS_CSV_NA_NAMES": []}) +def test_apply_client_processing_csv_format_default_na_behavior(): + """ + Test that apply_client_processing uses default pandas NA behavior + when REPORTS_CSV_NA_NAMES is not configured. + This ensures backwards compatibility. + """ + + # CSV data with "NA" string that should be converted to null in default behavior + csv_data = "first_name,last_name\nJeff,Smith\nAlice,NA" + + result = { + "queries": [ + { + "result_format": ChartDataResultFormat.CSV, + "data": csv_data, + } + ] + } + + form_data = { + "datasource": "1__table", + "viz_type": "table", + "slice_id": 1, + "url_params": {}, + "metrics": [], + "groupby": [], + "columns": ["first_name", "last_name"], + "extra_form_data": {}, + "force": False, + "result_format": "csv", + "result_type": "results", + } + + processed_result = apply_client_processing(result, form_data) + + # Verify the CSV data has "NA" converted to empty (default pandas behavior) + output_data = processed_result["queries"][0]["data"] + lines = output_data.strip().split("\n") + assert len(lines) >= 3 # header + 2 data rows + # The "NA" should be converted to empty by default pandas behavior + assert ( + "Alice," in lines[2] + ) # Second data row should have empty last_name (NA converted to null) diff --git a/tests/unit_tests/utils/csv_tests.py b/tests/unit_tests/utils/csv_tests.py index 9987ead0f9c..747e8b32870 100644 --- a/tests/unit_tests/utils/csv_tests.py +++ b/tests/unit_tests/utils/csv_tests.py @@ -127,6 +127,21 @@ def fake_get_chart_csv_data_hierarchical(chart_url, auth_cookies=None): return json.dumps(fake_result).encode("utf-8") +def fake_get_chart_csv_data_with_na_values(chart_url, auth_cookies=None): + # Return JSON with data containing "NA" string value that will be treated as null + fake_result = { + "result": [ + { + "data": {"first_name": ["Jeff", "Alice"], "last_name": ["Smith", "NA"]}, + "coltypes": [GenericDataType.STRING, GenericDataType.STRING], + "colnames": ["first_name", "last_name"], + "indexnames": ["idx1", "idx2"], + } + ] + } + return json.dumps(fake_result).encode("utf-8") + + def test_df_to_escaped_csv(): df = pd.DataFrame( data={ @@ -263,3 +278,42 @@ def test_get_chart_dataframe_with_hierarchical_columns(monkeypatch: pytest.Monke | ('idx',) | 2 | """ assert markdown_str.strip() == expected_markdown_str.strip() + + +def test_get_chart_dataframe_preserves_na_string_values( + monkeypatch: pytest.MonkeyPatch, +): + """ + Test that get_chart_dataframe currently preserves rows containing "NA" + string values. + This test verifies the existing behavior before implementing custom NA handling. + """ + monkeypatch.setattr( + csv, "get_chart_csv_data", fake_get_chart_csv_data_with_na_values + ) + df = get_chart_dataframe("http://dummy-url") + assert df is not None + + # Verify the DataFrame structure + expected_columns = pd.MultiIndex.from_tuples([("first_name",), ("last_name",)]) + pd.testing.assert_index_equal(df.columns, expected_columns) + + expected_index = pd.MultiIndex.from_tuples([("idx1",), ("idx2",)]) + pd.testing.assert_index_equal(df.index, expected_index) + + # Check that we have both rows initially + assert len(df) == 2 + + # Verify the data contains the "NA" string value (not converted to NaN) + pd.testing.assert_series_equal( + df[("first_name",)], + pd.Series(["Jeff", "Alice"], name=("first_name",), index=df.index), + ) + pd.testing.assert_series_equal( + df[("last_name",)], + pd.Series(["Smith", "NA"], name=("last_name",), index=df.index), + ) + + last_name_values = df[("last_name",)].values + assert last_name_values[0] == "Smith" + assert last_name_values[1] == "NA"