fix(sqllab_export): manually encode CSV output to support utf-8-sig (#34235)

This commit is contained in:
Ahmed Habeeb
2025-07-24 04:44:56 +03:00
committed by GitHub
parent 9099b0f00d
commit 43775e9373
6 changed files with 42 additions and 12 deletions

View File

@@ -23,6 +23,7 @@ This file documents any backwards-incompatible changes in Superset and
assists people when migrating to a new version.
## Next
- [34235](https://github.com/apache/superset/pull/34235) CSV exports now use `utf-8-sig` encoding by default to include a UTF-8 BOM, improving compatibility with Excel.
- [34258](https://github.com/apache/superset/pull/34258) changed the default in the Dockerfile to INCLUDE_CHROMIUM="false" (previously "true"). This ensures the `lean` layer is lean by default, and people can opt in to the `chromium` layer by setting the build arg `INCLUDE_CHROMIUM=true`. This is a breaking change for anyone using the `lean` layer, as it will no longer include Chromium by default.
- [34204](https://github.com/apache/superset/pull/33603) OpenStreetMap has been promoted as the new default for Deck.gl visualizations since it can be enabled by default without requiring an API key. If you have Mapbox set up and want to disable OpenStreetMap in your environment, please follow the steps documented here [https://superset.apache.org/docs/configuration/map-tiles].
- [33116](https://github.com/apache/superset/pull/33116) In Echarts Series charts (e.g. Line, Area, Bar, etc.), the `x_axis_sort_series` and `x_axis_sort_series_ascending` form data items have been renamed to `x_axis_sort` and `x_axis_sort_asc`.

View File

@@ -131,7 +131,9 @@ class SqlResultExportCommand(BaseCommand):
self._query.schema,
)[:limit]
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
# Manual encoding using the specified encoding (default to utf-8 if not set)
csv_string = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
csv_data = csv_string.encode(config["CSV_EXPORT"].get("encoding", "utf-8"))
return {
"query": self._query,

View File

@@ -946,7 +946,7 @@ ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS, *COLUMNAR_EXTENSIONS}
# CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv
# method.
# note: index option should not be overridden
CSV_EXPORT = {"encoding": "utf-8"}
CSV_EXPORT = {"encoding": "utf-8-sig"}
# Excel Options: key/value pairs that will be passed as argument to DataFrame.to_excel
# method.

View File

@@ -450,12 +450,35 @@ class TestSqlLabApi(SupersetTestCase):
db.session.add(query_obj)
db.session.commit()
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
# Include multilingual data
get_df_mock.return_value = pd.DataFrame(
{
"foo": [1, 2],
"مرحبا": ["أ", "ب"],
"姓名": ["", ""],
}
)
resp = self.get_resp("/api/v1/sqllab/export/test/")
data = csv.reader(io.StringIO(resp))
expected_data = csv.reader(io.StringIO("foo\n1\n2"))
assert list(expected_data) == list(data)
# Check for UTF-8 BOM
assert resp.startswith("\ufeff"), "Missing UTF-8 BOM at beginning of CSV"
# Parse CSV
reader = csv.reader(io.StringIO(resp))
data = list(reader)
# Strip BOM from the first cell of the header
if data and data[0]:
data[0][0] = data[0][0].lstrip("\ufeff")
# Expected header and rows
expected_data = [
["foo", "مرحبا", "姓名"],
["1", "أ", ""],
["2", "ب", ""],
]
assert data == expected_data, f"CSV data mismatch. Got: {data}"
db.session.delete(query_obj)
db.session.commit()

View File

@@ -177,7 +177,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
result = command.run()
assert result["data"] == "foo\n1\n2\n3\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n2\n3\n"
assert result["count"] == 3
assert result["query"].client_id == "test"
@@ -195,7 +195,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
result = command.run()
assert result["data"] == "foo\n1\n2\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n2\n"
assert result["count"] == 2
assert result["query"].client_id == "test"
@@ -217,7 +217,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
result = command.run()
assert result["data"] == "foo\n1\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n"
assert result["count"] == 1
assert result["query"].client_id == "test"
@@ -240,7 +240,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
result = command.run()
assert result["data"] == "foo\n0\n1\n2\n3\n4\n"
assert result["data"] == b"\xef\xbb\xbffoo\n0\n1\n2\n3\n4\n"
assert result["count"] == 5
assert result["query"].client_id == "test"

View File

@@ -67,7 +67,9 @@ def test_get_data_csv(mock_df_to_escaped_csv, processor, mock_query_context):
mock_df_to_escaped_csv.return_value = "col1,col2\n1,a\n2,b\n3,c\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n1,a\n2,b\n3,c\n"
mock_df_to_escaped_csv.assert_called_once_with(df, index=False, encoding="utf-8")
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")
@@ -141,7 +143,9 @@ def test_get_data_empty_dataframe_csv(
mock_df_to_escaped_csv.return_value = "col1,col2\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n"
mock_df_to_escaped_csv.assert_called_once_with(df, index=False, encoding="utf-8")
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")