mirror of
https://github.com/apache/superset.git
synced 2026-04-18 15:44:57 +00:00
feat(reports): allow custom na values (#35481)
Co-authored-by: bito-code-review[bot] <188872107+bito-code-review[bot]@users.noreply.github.com> Co-authored-by: Beto Dealmeida <roberto@dealmeida.net>
This commit is contained in:
@@ -30,6 +30,7 @@ from typing import Any, Optional, TYPE_CHECKING, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from flask import current_app
|
||||
from flask_babel import gettext as __
|
||||
|
||||
from superset.common.chart_data import ChartDataResultFormat
|
||||
@@ -340,7 +341,15 @@ def apply_client_processing( # noqa: C901
|
||||
if query["result_format"] == ChartDataResultFormat.JSON:
|
||||
df = pd.DataFrame.from_dict(data)
|
||||
elif query["result_format"] == ChartDataResultFormat.CSV:
|
||||
df = pd.read_csv(StringIO(data))
|
||||
# Use custom NA values configuration for
|
||||
# reports to avoid unwanted conversions
|
||||
# This allows users to control which values should be treated as null/NA
|
||||
na_values = current_app.config["REPORTS_CSV_NA_NAMES"]
|
||||
df = pd.read_csv(
|
||||
StringIO(data),
|
||||
keep_default_na=na_values is None,
|
||||
na_values=na_values,
|
||||
)
|
||||
|
||||
# convert all columns to verbose (label) name
|
||||
if datasource:
|
||||
|
||||
@@ -1354,6 +1354,15 @@ ALLOWED_USER_CSV_SCHEMA_FUNC = allowed_schemas_for_csv_upload
|
||||
# Values that should be treated as nulls for the csv uploads.
|
||||
CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES)
|
||||
|
||||
# Values that should be treated as nulls for scheduled reports CSV processing.
|
||||
# If not set or None, defaults to standard pandas NA handling behavior.
|
||||
# Set to a custom list to control which values should be treated as null.
|
||||
# Examples:
|
||||
# REPORTS_CSV_NA_NAMES = None # Use default pandas NA handling (backwards compatible)
|
||||
# REPORTS_CSV_NA_NAMES = [] # Disable all automatic NA conversion
|
||||
# REPORTS_CSV_NA_NAMES = ["", "NULL", "null"] # Only treat these specific values as NA
|
||||
REPORTS_CSV_NA_NAMES: list[str] | None = None
|
||||
|
||||
# Chunk size for reading CSV files during uploads
|
||||
# Smaller values use less memory but may be slower for large files
|
||||
READ_CSV_CHUNK_SIZE = 1000
|
||||
|
||||
@@ -24,6 +24,7 @@ from sqlalchemy.orm.session import Session
|
||||
from superset.charts.client_processing import apply_client_processing, pivot_df, table
|
||||
from superset.common.chart_data import ChartDataResultFormat
|
||||
from superset.utils.core import GenericDataType
|
||||
from tests.conftest import with_config
|
||||
|
||||
|
||||
def test_pivot_df_no_cols_no_rows_single_metric():
|
||||
@@ -2653,3 +2654,137 @@ def test_pivot_multi_level_index():
|
||||
| ('Total (Sum)', '', '') | 210 | 105 | 0 |
|
||||
""".strip()
|
||||
)
|
||||
|
||||
|
||||
@with_config({"REPORTS_CSV_NA_NAMES": []})
|
||||
def test_apply_client_processing_csv_format_preserves_na_strings():
|
||||
"""
|
||||
Test that apply_client_processing preserves "NA" when REPORTS_CSV_NA_NAMES is [].
|
||||
|
||||
This ensures that scheduled reports can be configured to
|
||||
preserve strings like "NA" as literal values.
|
||||
"""
|
||||
|
||||
# CSV data with "NA" string that should be preserved
|
||||
csv_data = "first_name,last_name\nJeff,Smith\nAlice,NA"
|
||||
|
||||
result = {
|
||||
"queries": [
|
||||
{
|
||||
"result_format": ChartDataResultFormat.CSV,
|
||||
"data": csv_data,
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
form_data = {
|
||||
"datasource": "1__table",
|
||||
"viz_type": "table",
|
||||
"slice_id": 1,
|
||||
"url_params": {},
|
||||
"metrics": [],
|
||||
"groupby": [],
|
||||
"columns": ["first_name", "last_name"],
|
||||
"extra_form_data": {},
|
||||
"force": False,
|
||||
"result_format": "csv",
|
||||
"result_type": "results",
|
||||
}
|
||||
|
||||
# Test with REPORTS_CSV_NA_NAMES set to empty list (disable NA conversion)
|
||||
|
||||
processed_result = apply_client_processing(result, form_data)
|
||||
|
||||
# Verify the CSV data still contains "NA" as string, not converted to null
|
||||
output_data = processed_result["queries"][0]["data"]
|
||||
assert "NA" in output_data
|
||||
# The "NA" should be preserved in the output CSV
|
||||
lines = output_data.strip().split("\n")
|
||||
assert "Alice,NA" in lines[2] # Second data row should preserve "NA"
|
||||
|
||||
|
||||
@with_config({"REPORTS_CSV_NA_NAMES": ["MISSING"]})
|
||||
def test_apply_client_processing_csv_format_custom_na_values():
|
||||
"""
|
||||
Test that apply_client_processing respects custom NA values configuration.
|
||||
"""
|
||||
|
||||
csv_data = "name,status\nJeff,MISSING\nAlice,OK"
|
||||
|
||||
result = {
|
||||
"queries": [
|
||||
{
|
||||
"result_format": ChartDataResultFormat.CSV,
|
||||
"data": csv_data,
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
form_data = {
|
||||
"datasource": "1__table",
|
||||
"viz_type": "table",
|
||||
"slice_id": 1,
|
||||
"url_params": {},
|
||||
"metrics": [],
|
||||
"groupby": [],
|
||||
"columns": ["name", "status"],
|
||||
"extra_form_data": {},
|
||||
"force": False,
|
||||
"result_format": "csv",
|
||||
"result_type": "results",
|
||||
}
|
||||
|
||||
# Test with custom NA values - only "MISSING" should be treated as NA
|
||||
processed_result = apply_client_processing(result, form_data)
|
||||
|
||||
output_data = processed_result["queries"][0]["data"]
|
||||
lines = output_data.strip().split("\n")
|
||||
assert len(lines) >= 3 # header + 2 data rows
|
||||
assert "Jeff," in lines[1] # First data row should have empty status after "Jeff,"
|
||||
assert "Alice,OK" in lines[2] # Second data row should preserve "OK"
|
||||
|
||||
|
||||
@with_config({"REPORTS_CSV_NA_NAMES": []})
|
||||
def test_apply_client_processing_csv_format_default_na_behavior():
|
||||
"""
|
||||
Test that apply_client_processing uses default pandas NA behavior
|
||||
when REPORTS_CSV_NA_NAMES is not configured.
|
||||
This ensures backwards compatibility.
|
||||
"""
|
||||
|
||||
# CSV data with "NA" string that should be converted to null in default behavior
|
||||
csv_data = "first_name,last_name\nJeff,Smith\nAlice,NA"
|
||||
|
||||
result = {
|
||||
"queries": [
|
||||
{
|
||||
"result_format": ChartDataResultFormat.CSV,
|
||||
"data": csv_data,
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
form_data = {
|
||||
"datasource": "1__table",
|
||||
"viz_type": "table",
|
||||
"slice_id": 1,
|
||||
"url_params": {},
|
||||
"metrics": [],
|
||||
"groupby": [],
|
||||
"columns": ["first_name", "last_name"],
|
||||
"extra_form_data": {},
|
||||
"force": False,
|
||||
"result_format": "csv",
|
||||
"result_type": "results",
|
||||
}
|
||||
|
||||
processed_result = apply_client_processing(result, form_data)
|
||||
|
||||
# Verify the CSV data has "NA" converted to empty (default pandas behavior)
|
||||
output_data = processed_result["queries"][0]["data"]
|
||||
lines = output_data.strip().split("\n")
|
||||
assert len(lines) >= 3 # header + 2 data rows
|
||||
# The "NA" should be converted to empty by default pandas behavior
|
||||
assert (
|
||||
"Alice," in lines[2]
|
||||
) # Second data row should have empty last_name (NA converted to null)
|
||||
|
||||
@@ -127,6 +127,21 @@ def fake_get_chart_csv_data_hierarchical(chart_url, auth_cookies=None):
|
||||
return json.dumps(fake_result).encode("utf-8")
|
||||
|
||||
|
||||
def fake_get_chart_csv_data_with_na_values(chart_url, auth_cookies=None):
|
||||
# Return JSON with data containing "NA" string value that will be treated as null
|
||||
fake_result = {
|
||||
"result": [
|
||||
{
|
||||
"data": {"first_name": ["Jeff", "Alice"], "last_name": ["Smith", "NA"]},
|
||||
"coltypes": [GenericDataType.STRING, GenericDataType.STRING],
|
||||
"colnames": ["first_name", "last_name"],
|
||||
"indexnames": ["idx1", "idx2"],
|
||||
}
|
||||
]
|
||||
}
|
||||
return json.dumps(fake_result).encode("utf-8")
|
||||
|
||||
|
||||
def test_df_to_escaped_csv():
|
||||
df = pd.DataFrame(
|
||||
data={
|
||||
@@ -263,3 +278,42 @@ def test_get_chart_dataframe_with_hierarchical_columns(monkeypatch: pytest.Monke
|
||||
| ('idx',) | 2 |
|
||||
"""
|
||||
assert markdown_str.strip() == expected_markdown_str.strip()
|
||||
|
||||
|
||||
def test_get_chart_dataframe_preserves_na_string_values(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""
|
||||
Test that get_chart_dataframe currently preserves rows containing "NA"
|
||||
string values.
|
||||
This test verifies the existing behavior before implementing custom NA handling.
|
||||
"""
|
||||
monkeypatch.setattr(
|
||||
csv, "get_chart_csv_data", fake_get_chart_csv_data_with_na_values
|
||||
)
|
||||
df = get_chart_dataframe("http://dummy-url")
|
||||
assert df is not None
|
||||
|
||||
# Verify the DataFrame structure
|
||||
expected_columns = pd.MultiIndex.from_tuples([("first_name",), ("last_name",)])
|
||||
pd.testing.assert_index_equal(df.columns, expected_columns)
|
||||
|
||||
expected_index = pd.MultiIndex.from_tuples([("idx1",), ("idx2",)])
|
||||
pd.testing.assert_index_equal(df.index, expected_index)
|
||||
|
||||
# Check that we have both rows initially
|
||||
assert len(df) == 2
|
||||
|
||||
# Verify the data contains the "NA" string value (not converted to NaN)
|
||||
pd.testing.assert_series_equal(
|
||||
df[("first_name",)],
|
||||
pd.Series(["Jeff", "Alice"], name=("first_name",), index=df.index),
|
||||
)
|
||||
pd.testing.assert_series_equal(
|
||||
df[("last_name",)],
|
||||
pd.Series(["Smith", "NA"], name=("last_name",), index=df.index),
|
||||
)
|
||||
|
||||
last_name_values = df[("last_name",)].values
|
||||
assert last_name_values[0] == "Smith"
|
||||
assert last_name_values[1] == "NA"
|
||||
|
||||
Reference in New Issue
Block a user