From 75af53dc3da277174e85729424b0715e2d573236 Mon Sep 17 00:00:00 2001
From: "JUST.in DO IT" <justin.park@airbnb.com>
Date: Fri, 22 Aug 2025 10:00:39 -0700
Subject: [PATCH] fix: customize column description limit size in
 db_engine_spec (#34808)

---
 superset/connectors/sqla/utils.py             |  5 ++--
 superset/db_engine_specs/base.py              | 10 +++++++
 superset/models/core.py                       |  3 ++
 superset/result_set.py                        | 30 +++++++++----------
 tests/integration_tests/celery_tests.py       |  3 +-
 tests/integration_tests/datasource_tests.py   |  9 +++++-
 .../db_engine_specs/base_engine_spec_tests.py |  4 +++
 tests/integration_tests/result_set_tests.py   |  2 +-
 tests/unit_tests/result_set_test.py           | 21 +++++++++++++
 9 files changed, 67 insertions(+), 20 deletions(-)

diff --git a/superset/connectors/sqla/utils.py b/superset/connectors/sqla/utils.py
index 6c0d2a82606..2e7264964e9 100644
--- a/superset/connectors/sqla/utils.py
+++ b/superset/connectors/sqla/utils.py
@@ -148,11 +148,12 @@ def get_columns_description(
     try:
         with database.get_raw_connection(catalog=catalog, schema=schema) as conn:
             cursor = conn.cursor()
-            query = database.apply_limit_to_sql(query, limit=1)
+            limit = database.get_column_description_limit_size()
+            query = database.apply_limit_to_sql(query, limit=limit)
             mutated_query = database.mutate_sql_based_on_config(query)
             cursor.execute(mutated_query)
             db_engine_spec.execute(cursor, mutated_query, database)
-            result = db_engine_spec.fetch_data(cursor, limit=1)
+            result = db_engine_spec.fetch_data(cursor, limit=limit)
             result_set = SupersetResultSet(result, cursor.description, db_engine_spec)
             return result_set.columns
     except Exception as ex:
diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py
index cccecff6539..07e787df119 100644
--- a/superset/db_engine_specs/base.py
+++ b/superset/db_engine_specs/base.py
@@ -1978,6 +1978,16 @@ class BaseEngineSpec:  # pylint: disable=too-many-public-methods
         """
         return []
 
+    @classmethod
+    def get_column_description_limit_size(cls) -> int:
+        """
+        Get the row limit applied to the sample SELECT query that is
+        executed to fetch the column metadata.
+
+        :return: The number of rows to request (1 in the base spec)
+        """
+        return 1
+
     @staticmethod
     def pyodbc_rows_to_tuples(data: list[Any]) -> list[tuple[Any, ...]]:
         """
diff --git a/superset/models/core.py b/superset/models/core.py
index 2e961dfdcae..603b0799d90 100755
--- a/superset/models/core.py
+++ b/superset/models/core.py
@@ -852,6 +852,9 @@ class Database(Model, AuditMixinNullable, ImportExportMixin):  # pylint: disable
 
         return script.format()
 
+    def get_column_description_limit_size(self) -> int:
+        return self.db_engine_spec.get_column_description_limit_size()
+
     def safe_sqlalchemy_uri(self) -> str:
         return self.sqlalchemy_uri
 
diff --git a/superset/result_set.py b/superset/result_set.py
index f6daa4b99eb..64d6f19afe8 100644
--- a/superset/result_set.py
+++ b/superset/result_set.py
@@ -135,21 +135,21 @@ class SupersetResultSet:
         if data and (not isinstance(data, list) or not isinstance(data[0], tuple)):
             data = [tuple(row) for row in data]
         array = np.array(data, dtype=numpy_dtype)
-        if array.size > 0:
-            for column in column_names:
-                try:
-                    pa_data.append(pa.array(array[column].tolist()))
-                except (
-                    pa.lib.ArrowInvalid,
-                    pa.lib.ArrowTypeError,
-                    pa.lib.ArrowNotImplementedError,
-                    ValueError,
-                    TypeError,  # this is super hackey,
-                    # https://issues.apache.org/jira/browse/ARROW-7855
-                ):
-                    # attempt serialization of values as strings
-                    stringified_arr = stringify_values(array[column])
-                    pa_data.append(pa.array(stringified_arr.tolist()))
+
+        for column in column_names:
+            try:
+                pa_data.append(pa.array(array[column].tolist()))
+            except (
+                pa.lib.ArrowInvalid,
+                pa.lib.ArrowTypeError,
+                pa.lib.ArrowNotImplementedError,
+                ValueError,
+                TypeError,  # this is super hackey,
+                # https://issues.apache.org/jira/browse/ARROW-7855
+            ):
+                # attempt serialization of values as strings
+                stringified_arr = stringify_values(array[column])
+                pa_data.append(pa.array(stringified_arr.tolist()))
 
         if pa_data:  # pylint: disable=too-many-nested-blocks
             for i, column in enumerate(column_names):
diff --git a/tests/integration_tests/celery_tests.py b/tests/integration_tests/celery_tests.py
index b517c20d2de..8fc5fe293e0 100644
--- a/tests/integration_tests/celery_tests.py
+++ b/tests/integration_tests/celery_tests.py
@@ -217,7 +217,8 @@ def test_run_sync_query_cta_no_data(test_client):
     sql_empty_result = "SELECT * FROM birth_names WHERE name='random'"
     result = run_sql(test_client, sql_empty_result)
     assert QueryStatus.SUCCESS == result["query"]["state"]
-    assert ([], []) == (result["data"], result["columns"])
+    assert [] == result["data"]
+    assert len(result["columns"]) > 0
 
     query = get_query_by_id(result["query"]["serverId"])
     assert QueryStatus.SUCCESS == query.status
diff --git a/tests/integration_tests/datasource_tests.py b/tests/integration_tests/datasource_tests.py
index 65c063c5811..319b8573720 100644
--- a/tests/integration_tests/datasource_tests.py
+++ b/tests/integration_tests/datasource_tests.py
@@ -718,7 +718,14 @@ def test_get_samples_with_filters(test_client, login_as_admin, virtual_dataset):
         },
     )
     assert rv.status_code == 200
-    assert rv.json["result"]["colnames"] == []
+    assert rv.json["result"]["colnames"] == [
+        "col1",
+        "col2",
+        "col3",
+        "col4",
+        "col5",
+        "col6",
+    ]
     assert rv.json["result"]["rowcount"] == 0
 
 
diff --git a/tests/integration_tests/db_engine_specs/base_engine_spec_tests.py b/tests/integration_tests/db_engine_specs/base_engine_spec_tests.py
index 7097e5b0971..122bb271a28 100644
--- a/tests/integration_tests/db_engine_specs/base_engine_spec_tests.py
+++ b/tests/integration_tests/db_engine_specs/base_engine_spec_tests.py
@@ -116,6 +116,10 @@ class SupersetTestCases(SupersetTestCase):
         )
         assert base_result_expected == base_result
 
+    def test_get_column_description_limit_size(self):
+        base_result = BaseEngineSpec.get_column_description_limit_size()
+        assert base_result == 1
+
     @pytest.mark.usefixtures("load_energy_table_with_slice")
     def test_column_datatype_to_string(self):
         example_db = get_example_database()
diff --git a/tests/integration_tests/result_set_tests.py b/tests/integration_tests/result_set_tests.py
index 6f6d701ed8b..bfdc7302644 100644
--- a/tests/integration_tests/result_set_tests.py
+++ b/tests/integration_tests/result_set_tests.py
@@ -308,4 +308,4 @@ class TestSupersetResultSet(SupersetTestCase):
             ("emptytwo", "int", None, None, None, None, True),
         ]
         results = SupersetResultSet(data, cursor_descr, BaseEngineSpec)
-        assert results.columns == []
+        assert len(results.columns) == 2
diff --git a/tests/unit_tests/result_set_test.py b/tests/unit_tests/result_set_test.py
index a629c2e2ec7..da5dcdafabc 100644
--- a/tests/unit_tests/result_set_test.py
+++ b/tests/unit_tests/result_set_test.py
@@ -26,6 +26,7 @@ from pytest_mock import MockerFixture
 
 from superset.db_engine_specs.base import BaseEngineSpec
 from superset.result_set import stringify_values, SupersetResultSet
+from superset.superset_typing import DbapiResult
 
 
 def test_column_names_as_bytes() -> None:
@@ -164,3 +165,23 @@ def test_timezone_series(mocker: MockerFixture) -> None:
         [pd.Timestamp("2023-01-01 00:00:00+0000", tz="UTC")]
     ]
     logger.exception.assert_not_called()
+
+
+def test_get_column_description_from_empty_data_using_cursor_description(
+    mocker: MockerFixture,
+) -> None:
+    """
+    Test that the column descriptions are derived from the cursor
+    description when the data is empty
+    """
+    logger = mocker.patch("superset.result_set.logger")
+
+    data: DbapiResult = []
+    description = [(b"__time", "datetime", None, None, None, None, 1, 0, 255)]
+    result_set = SupersetResultSet(
+        data,
+        description,  # type: ignore
+        BaseEngineSpec,
+    )
+    assert any(col.get("column_name") == "__time" for col in result_set.columns)
+    logger.exception.assert_not_called()