fix(BigQuery): explicitly quote columns in select_star (#16822)

* fix (BigQuery): explicitly quote columns in select_star
* Fix test
* Fix `SELECT *` in BQ
* Add unit tests
* Remove type changes
2026-04-19 08:04:53 +00:00 · 2021-10-06 07:43:32 -07:00
parent 191033cb44
commit c993c5845f
3 changed files with 296 additions and 18 deletions
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
 from flask_babel import gettext as __
 from marshmallow import fields, Schema
 from marshmallow.exceptions import ValidationError
-from sqlalchemy import literal_column
+from sqlalchemy import column
+from sqlalchemy.engine.base import Engine
 from sqlalchemy.engine.url import make_url
-from sqlalchemy.sql.expression import ColumnClause
+from sqlalchemy.sql import sqltypes
 from typing_extensions import TypedDict

 from superset.databases.schemas import encrypted_field_properties, EncryptedString
@@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
            "clustering": {"cols": cluster_columns},
        }

-    @classmethod
-    def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
-        """
-        BigQuery dialect requires us to not use backtick in the fieldname which are
-        nested.
-        Using literal_column handles that issue.
-        https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column
-        Also explicility specifying column names so we don't encounter duplicate
-        column names in the result.
-        """
-        return [
-            literal_column(c["name"]).label(c["name"].replace(".", "__")) for c in cols
-        ]
-
    @classmethod
    def epoch_to_dttm(cls) -> str:
        return "TIMESTAMP_SECONDS({col})"
@@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
        ma_plugin.converter.add_attribute_function(encrypted_field_properties)
        spec.components.schema(cls.__name__, schema=cls.parameters_schema)
        return spec.to_dict()["components"]["schemas"][cls.__name__]
+
+    @classmethod
+    def select_star(  # pylint: disable=too-many-arguments
+        cls,
+        database: "Database",
+        table_name: str,
+        engine: Engine,
+        schema: Optional[str] = None,
+        limit: int = 100,
+        show_cols: bool = False,
+        indent: bool = True,
+        latest_partition: bool = True,
+        cols: Optional[List[Dict[str, Any]]] = None,
+    ) -> str:
+        """
+        Remove array pseudo-columns from `SELECT *`.
+
+        BigQuery supports structures and arrays of structures, eg:
+
+            author STRUCT<name STRING, email STRING>
+            trailer ARRAY<STRUCT<key STRING, value STRING>>
+
+        When loading metadata for a table each key in the struct is displayed as a
+        separate pseudo-column: `author`, `author.name`, `author.email`, `trailer`,
+        `trailer.key` and `trailer.value`.
+
+        Selecting `trailer.key` or `trailer.value` results in an error, as opposed to
+        selecting `author.name`, since they are keys in a structure inside an array.
+        The correct select statement should look like this:
+
+            SELECT
+              `author`,
+              `author`.`name`,
+              `author`.`email`,
+              `trailer`
+            FROM
+              table
+
+        A pseudo-column is dropped when any ancestor in its dotted path is an array,
+        at every nesting depth: given `a STRUCT<b ARRAY<STRUCT<c STRING>>>`, `a.b.c`
+        is removed while `a` and `a.b` are kept.
+        The array column itself (`trailer`, `a.b`) is still selected; only the
+        pseudo-columns below it are removed.
+
+        :param cols: column metadata; each dict is read for its `name` and `type`
+            keys, the type being tested against `sqltypes.ARRAY`. A falsy value
+            (`None` or an empty list) is forwarded as is.
+        :return: whatever the parent `select_star` builds from the remaining columns;
+            all other arguments are passed through to it unchanged.
+        """
+        if cols:
+            # A pseudo-column cannot be selected when ANY ancestor in its dotted
+            # path is an array, not only its top-level one, so match on the full
+            # "<array name>." prefix rather than on the first path component.
+            array_prefixes = tuple(
+                col["name"] + "."
+                for col in cols
+                if isinstance(col["type"], sqltypes.ARRAY)
+            )
+            # `str.startswith` accepts a tuple and is False for an empty one.
+            cols = [col for col in cols if not col["name"].startswith(array_prefixes)]
+
+        return super().select_star(
+            database,
+            table_name,
+            engine,
+            schema,
+            limit,
+            show_cols,
+            indent,
+            latest_partition,
+            cols,
+        )
+
+    @classmethod
+    def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
+        """
+        Build the select list, aliasing each column by its fully qualified name.
+
+        BigQuery has columns of type `struct`, which behave like dictionaries.
+        Table metadata exposes every key of a struct as its own pseudo-column, so:
+
+            author STRUCT<name STRING, email STRING>
+
+        is reported as 3 columns:
+
+            - author
+            - author.name
+            - author.email
+
+        Selecting them as they are:
+
+            SELECT `author`, `author`.`name`, `author`.`email` FROM table
+
+        yields result columns called "author", "name", and "email", which may clash
+        with other columns. To prevent that, each column is explicitly labelled with
+        its fully qualified name, dots replaced by double underscores: "author",
+        "author__name" and "author__email", respectively.
+        """
+        names = [c["name"] for c in cols]
+        return [column(name).label(name.replace(".", "__")) for name in names]