fix(sqllab_export): manually encode CSV output to support utf-8-sig (#34235)

This commit is contained in:
Ahmed Habeeb
2025-07-24 04:44:56 +03:00
committed by GitHub
parent 9099b0f00d
commit 43775e9373
6 changed files with 42 additions and 12 deletions

View File

@@ -23,6 +23,7 @@ This file documents any backwards-incompatible changes in Superset and
assists people when migrating to a new version.
## Next
- [34235](https://github.com/apache/superset/pull/34235) CSV exports now use `utf-8-sig` encoding by default to include a UTF-8 BOM, improving compatibility with Excel.
- [34258](https://github.com/apache/superset/pull/34258) changed the default in the Dockerfile to INCLUDE_CHROMIUM="false" (previously "true"). This ensures the `lean` layer is lean by default, and people can opt in to the `chromium` layer by setting the build arg `INCLUDE_CHROMIUM=true`. This is a breaking change for anyone using the `lean` layer, as it will no longer include Chromium by default.
- [34204](https://github.com/apache/superset/pull/33603) OpenStreetMap has been promoted as the new default for Deck.gl visualizations since it can be enabled by default without requiring an API key. If you have Mapbox set up and want to disable OpenStreetMap in your environment, please follow the steps documented here [https://superset.apache.org/docs/configuration/map-tiles].
- [33116](https://github.com/apache/superset/pull/33116) In Echarts Series charts (e.g. Line, Area, Bar, etc.), the `x_axis_sort_series` and `x_axis_sort_series_ascending` form data items have been renamed to `x_axis_sort` and `x_axis_sort_asc`.

View File

@@ -131,7 +131,9 @@ class SqlResultExportCommand(BaseCommand):
self._query.schema,
)[:limit]
csv_data = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
# Manual encoding using the specified encoding (default to utf-8 if not set)
csv_string = csv.df_to_escaped_csv(df, index=False, **config["CSV_EXPORT"])
csv_data = csv_string.encode(config["CSV_EXPORT"].get("encoding", "utf-8"))
return {
"query": self._query,

View File

@@ -946,7 +946,7 @@ ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS, *COLUMNAR_EXTENSIONS}
# CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv
# method.
# note: index option should not be overridden
CSV_EXPORT = {"encoding": "utf-8"}
CSV_EXPORT = {"encoding": "utf-8-sig"}
# Excel Options: key/value pairs that will be passed as argument to DataFrame.to_excel
# method.

View File

@@ -450,12 +450,35 @@ class TestSqlLabApi(SupersetTestCase):
db.session.add(query_obj)
db.session.commit()
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
# Include multilingual data
get_df_mock.return_value = pd.DataFrame(
{
"foo": [1, 2],
"مرحبا": ["أ", "ب"],
"姓名": ["", ""],
}
)
resp = self.get_resp("/api/v1/sqllab/export/test/")
data = csv.reader(io.StringIO(resp))
expected_data = csv.reader(io.StringIO("foo\n1\n2"))
assert list(expected_data) == list(data)
# Check for UTF-8 BOM
assert resp.startswith("\ufeff"), "Missing UTF-8 BOM at beginning of CSV"
# Parse CSV
reader = csv.reader(io.StringIO(resp))
data = list(reader)
# Strip BOM from the first cell of the header
if data and data[0]:
data[0][0] = data[0][0].lstrip("\ufeff")
# Expected header and rows
expected_data = [
["foo", "مرحبا", "姓名"],
["1", "أ", ""],
["2", "ب", ""],
]
assert data == expected_data, f"CSV data mismatch. Got: {data}"
db.session.delete(query_obj)
db.session.commit()

View File

@@ -177,7 +177,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
result = command.run()
assert result["data"] == "foo\n1\n2\n3\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n2\n3\n"
assert result["count"] == 3
assert result["query"].client_id == "test"
@@ -195,7 +195,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
get_df_mock.return_value = pd.DataFrame({"foo": [1, 2, 3]})
result = command.run()
assert result["data"] == "foo\n1\n2\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n2\n"
assert result["count"] == 2
assert result["query"].client_id == "test"
@@ -217,7 +217,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
result = command.run()
assert result["data"] == "foo\n1\n"
assert result["data"] == b"\xef\xbb\xbffoo\n1\n"
assert result["count"] == 1
assert result["query"].client_id == "test"
@@ -240,7 +240,7 @@ class TestSqlResultExportCommand(SupersetTestCase):
result = command.run()
assert result["data"] == "foo\n0\n1\n2\n3\n4\n"
assert result["data"] == b"\xef\xbb\xbffoo\n0\n1\n2\n3\n4\n"
assert result["count"] == 5
assert result["query"].client_id == "test"

View File

@@ -67,7 +67,9 @@ def test_get_data_csv(mock_df_to_escaped_csv, processor, mock_query_context):
mock_df_to_escaped_csv.return_value = "col1,col2\n1,a\n2,b\n3,c\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n1,a\n2,b\n3,c\n"
mock_df_to_escaped_csv.assert_called_once_with(df, index=False, encoding="utf-8")
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")
@@ -141,7 +143,9 @@ def test_get_data_empty_dataframe_csv(
mock_df_to_escaped_csv.return_value = "col1,col2\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n"
mock_df_to_escaped_csv.assert_called_once_with(df, index=False, encoding="utf-8")
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")