def normalize_cursor_description_names(
    cursor_description: DbapiDescription,
) -> list[str]:
    """
    Return one column name per ``cursor_description`` entry, filling blanks.

    The first element of every entry is passed through ``convert_to_string``.
    Names that come back non-empty are kept as they are. Each empty name is
    replaced with the next ``_col_<n>`` candidate (``n`` counts up from 0 and
    is shared across the whole description) that matches neither an explicit
    name nor a placeholder handed out earlier.
    """
    raw_names = [convert_to_string(entry[0]) for entry in cursor_description]

    # Reserve every explicit name up front so that a placeholder can never
    # shadow a real column, including one that appears later in the list.
    taken = {name for name in raw_names if name}

    resolved: list[str] = []
    next_suffix = 0
    for name in raw_names:
        if not name:
            name = f"_col_{next_suffix}"
            next_suffix += 1
            # Skip candidates that are already in use.
            while name in taken:
                name = f"_col_{next_suffix}"
                next_suffix += 1
            taken.add(name)
        resolved.append(name)

    return resolved
def test_empty_column_names_get_synthetic_names() -> None:
    """
    A single empty-named column is exposed as ``_col_0``.

    SQL Server reports an empty string as the column name in
    cursor.description for un-aliased expressions such as
    ``SELECT COUNT(*) FROM t``. NumPy structured arrays and PyArrow tables
    reject empty field names, so SupersetResultSet has to substitute a
    synthetic name for the query to succeed.

    Regression test for https://github.com/apache/superset/issues/23848
    """
    rows = [(42,)]
    cursor_desc = [("", 3, None, None, None, None, None)]
    results = SupersetResultSet(rows, cursor_desc, BaseEngineSpec)  # type: ignore

    first_column = results.columns[0]
    assert first_column["column_name"] == "_col_0"

    frame = results.to_pandas_df()
    assert list(frame.columns) == ["_col_0"]
    assert frame["_col_0"].iloc[0] == 42


def test_multiple_empty_column_names_get_unique_synthetic_names() -> None:
    """
    Several empty-named columns each get their own synthetic name.

    This is the ``SELECT COUNT(*), SUM(x)`` case on MSSQL, where every
    un-aliased expression comes back with an empty name.
    """
    rows = [(10, 20)]
    cursor_desc = [("", 3, None, None, None, None, None) for _ in range(2)]
    results = SupersetResultSet(rows, cursor_desc, BaseEngineSpec)  # type: ignore

    names = [column["column_name"] for column in results.columns]
    assert len(names) == 2
    assert len(set(names)) == 2  # no two columns share a name

    frame = results.to_pandas_df()
    assert frame.iloc[0].tolist() == [10, 20]


def test_empty_column_names_do_not_rename_explicit_synthetic_names() -> None:
    """
    An explicit column already called ``_col_0`` keeps its name.

    The empty column beside it must take the next free synthetic name
    instead of colliding with the user-selected one.
    """
    rows = [(10, 20)]
    unnamed = ("", 3, None, None, None, None, None)
    explicit = ("_col_0", 3, None, None, None, None, None)
    results = SupersetResultSet(rows, [unnamed, explicit], BaseEngineSpec)  # type: ignore

    names = [column["column_name"] for column in results.columns]
    assert names == ["_col_1", "_col_0"]

    frame = results.to_pandas_df()
    assert list(frame.columns) == ["_col_1", "_col_0"]
    assert frame.iloc[0].tolist() == [10, 20]