fix(BigQuery): explicitly quote columns in select_star (#16822)

* fix (BigQuery): explicitly quote columns in select_star

* Fix test

* Fix SELECT * in BQ

* Add unit tests

* Remove type changes
This commit is contained in:
Beto Dealmeida
2021-10-06 07:43:32 -07:00
committed by GitHub
parent 191033cb44
commit c993c5845f
3 changed files with 296 additions and 18 deletions

View File

@@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
from flask_babel import gettext as __
from marshmallow import fields, Schema
from marshmallow.exceptions import ValidationError
from sqlalchemy import literal_column
from sqlalchemy import column
from sqlalchemy.engine.base import Engine
from sqlalchemy.engine.url import make_url
from sqlalchemy.sql.expression import ColumnClause
from sqlalchemy.sql import sqltypes
from typing_extensions import TypedDict
from superset.databases.schemas import encrypted_field_properties, EncryptedString
@@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
"clustering": {"cols": cluster_columns},
}
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
    """
    Build one labelled column expression per entry in ``cols``.

    The BigQuery dialect requires that nested field names are not wrapped in
    backticks; emitting them through ``literal_column`` takes care of that:
    https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column

    Every expression is also given an explicit label (the column name with
    "." replaced by "__") so the result does not contain duplicate column
    names.

    :param cols: column metadata dictionaries; only the "name" key is read
    :return: the labelled column expressions, in the same order as ``cols``
    """
    fields = []
    for col in cols:
        name = col["name"]
        alias = name.replace(".", "__")
        fields.append(literal_column(name).label(alias))
    return fields
@classmethod
def epoch_to_dttm(cls) -> str:
    """
    Return the SQL template that wraps a column in ``TIMESTAMP_SECONDS``.

    The ``{col}`` placeholder is left unformatted in the returned string.
    """
    template = "TIMESTAMP_SECONDS({col})"
    return template
@@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
return spec.to_dict()["components"]["schemas"][cls.__name__]
@classmethod
def select_star(  # pylint: disable=too-many-arguments
    cls,
    database: "Database",
    table_name: str,
    engine: Engine,
    schema: Optional[str] = None,
    limit: int = 100,
    show_cols: bool = False,
    indent: bool = True,
    latest_partition: bool = True,
    cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """
    Remove array structures from `SELECT *`.

    BigQuery supports structures and arrays of structures, eg:

        author STRUCT<name STRING, email STRING>
        trailer ARRAY<STRUCT<key STRING, value STRING>>

    When loading metadata for a table each key in the struct is displayed as a
    separate pseudo-column, eg:

        - author
        - author.name
        - author.email
        - trailer
        - trailer.key
        - trailer.value

    When generating the `SELECT *` statement we want to remove any keys from
    structs inside an array, since selecting them results in an error. The correct
    select statement should look like this:

        SELECT
          `author`,
          `author`.`name`,
          `author`.`email`,
          `trailer`
        FROM
          table

    Selecting `trailer.key` or `trailer.value` results in an error, as opposed to
    selecting `author.name`, since they are keys in a structure inside an array.

    This method removes any array pseudo-columns. A pseudo-column is removed
    when *any* of its dotted ancestors is an array, not only the top-level
    one, so that an array nested inside a struct (eg, `a.b.c` where `a.b` is
    the array) is handled as well.

    All arguments are forwarded unchanged to the parent implementation, except
    for ``cols``, which is forwarded after the filtering described above.

    :param cols: column metadata dictionaries; the "name" and "type" keys are
        read. When empty or ``None`` no filtering takes place.
    :return: whatever the parent ``select_star`` returns for the filtered
        columns
    """
    if cols:
        # Names of every (pseudo-)column whose type is an array.
        array_prefixes = {
            col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
        }

        def is_array_child(name: str) -> bool:
            # Check every proper dotted prefix ("a", "a.b" for "a.b.c"); a name
            # without dots has no prefixes and is therefore always kept.
            parts = name.split(".")
            return any(
                ".".join(parts[:depth]) in array_prefixes
                for depth in range(1, len(parts))
            )

        # For arrays of structs, remove the child columns, otherwise the query
        # will fail.
        cols = [col for col in cols if not is_array_child(col["name"])]

    return super().select_star(
        database,
        table_name,
        engine,
        schema,
        limit,
        show_cols,
        indent,
        latest_partition,
        cols,
    )
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
"""
Label columns using their fully qualified name.
BigQuery supports columns of type `struct`, which are basically dictionaries.
When loading metadata for a table with struct columns, each key in the struct
is displayed as a separate pseudo-column, eg:
author STRUCT<name STRING, email STRING>
Will be shown as 3 columns:
- author
- author.name
- author.email
If we select those fields:
SELECT `author`, `author`.`name`, `author`.`email` FROM table
The resulting columns will be called "author", "name", and "email", This may
result in a clash with other columns. To prevent that, we explicitly label
the columns using their fully qualified name, so we end up with "author",
"author__name" and "author__email", respectively.
"""
return [column(c["name"]).label(c["name"].replace(".", "__")) for c in cols]