mirror of
https://github.com/apache/superset.git
synced 2026-04-19 08:04:53 +00:00
fix(BigQuery): explicitly quote columns in select_star (#16822)
* fix (BigQuery): explicitly quote columns in select_star
* Fix test
* Fix `SELECT *` in BQ
* Add unit tests
* Remove type changes
This commit is contained in:
@@ -26,9 +26,10 @@ from apispec.ext.marshmallow import MarshmallowPlugin
|
||||
from flask_babel import gettext as __
|
||||
from marshmallow import fields, Schema
|
||||
from marshmallow.exceptions import ValidationError
|
||||
from sqlalchemy import literal_column
|
||||
from sqlalchemy import column
|
||||
from sqlalchemy.engine.base import Engine
|
||||
from sqlalchemy.engine.url import make_url
|
||||
from sqlalchemy.sql.expression import ColumnClause
|
||||
from sqlalchemy.sql import sqltypes
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
from superset.databases.schemas import encrypted_field_properties, EncryptedString
|
||||
@@ -282,20 +283,6 @@ class BigQueryEngineSpec(BaseEngineSpec):
|
||||
"clustering": {"cols": cluster_columns},
|
||||
}
|
||||
|
||||
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[ColumnClause]:
    """
    Build one labelled select expression per column.

    The BigQuery dialect requires that nested field names are not wrapped in
    backticks, so each name is emitted verbatim through ``literal_column``:
    https://docs.sqlalchemy.org/en/latest/core/tutorial.html#using-more-specific-text-with-table-literal-column-and-column

    Every expression is also labelled explicitly, with the dots in the name
    replaced by a double underscore, so that the result set does not end up
    with duplicate column names.

    :param cols: column metadata dictionaries; only the ``"name"`` key is read
    :return: the labelled expressions, in the same order as ``cols``
    """
    fields = []
    for col in cols:
        name = col["name"]
        alias = name.replace(".", "__")
        fields.append(literal_column(name).label(alias))
    return fields
|
||||
|
||||
@classmethod
def epoch_to_dttm(cls) -> str:
    """
    Return the SQL template used to convert an epoch column to a timestamp.

    The ``{col}`` placeholder is returned unformatted; substituting the actual
    column expression is left to the caller.
    """
    template = "TIMESTAMP_SECONDS({col})"
    return template
|
||||
@@ -425,3 +412,104 @@ class BigQueryEngineSpec(BaseEngineSpec):
|
||||
ma_plugin.converter.add_attribute_function(encrypted_field_properties)
|
||||
spec.components.schema(cls.__name__, schema=cls.parameters_schema)
|
||||
return spec.to_dict()["components"]["schemas"][cls.__name__]
|
||||
|
||||
@classmethod
def select_star(  # pylint: disable=too-many-arguments
    cls,
    database: "Database",
    table_name: str,
    engine: Engine,
    schema: Optional[str] = None,
    limit: int = 100,
    show_cols: bool = False,
    indent: bool = True,
    latest_partition: bool = True,
    cols: Optional[List[Dict[str, Any]]] = None,
) -> str:
    """
    Remove array structures from `SELECT *`.

    BigQuery supports structures and arrays of structures, eg:

        author STRUCT<name STRING, email STRING>
        trailer ARRAY<STRUCT<key STRING, value STRING>>

    When loading metadata for a table each key in the struct is displayed as a
    separate pseudo-column, eg:

        - author
        - author.name
        - author.email
        - trailer
        - trailer.key
        - trailer.value

    When generating the `SELECT *` statement we want to remove any keys from
    structs inside an array, since selecting them results in an error. The
    correct select statement should look like this:

        SELECT
          `author`,
          `author`.`name`,
          `author`.`email`,
          `trailer`
        FROM
          table

    Selecting `trailer.key` or `trailer.value` results in an error, as opposed
    to selecting `author.name`, since they are keys in a structure inside an
    array.

    This method removes any pseudo-column that has an array column anywhere in
    its dotted path, so arrays nested inside structs (eg, `author.aliases`
    being an array makes `author.aliases.value` unselectable) are handled as
    well as top-level arrays.

    :param cols: column metadata dictionaries; the ``"name"`` and ``"type"``
        keys are read. When empty or ``None`` no filtering takes place.
    :return: whatever the parent class' ``select_star`` returns for the
        (possibly filtered) column list; all other arguments are forwarded
        unchanged.
    """
    if cols:
        # Names of every column that is itself an array, at any nesting depth.
        array_prefixes = {
            col["name"] for col in cols if isinstance(col["type"], sqltypes.ARRAY)
        }

        def has_array_ancestor(name: str) -> bool:
            # Check every proper dotted prefix ("a", "a.b", ...) rather than
            # only the first component, so children of nested arrays are
            # caught too. Names without a dot have no prefix and are kept.
            parts = name.split(".")
            return any(
                ".".join(parts[:depth]) in array_prefixes
                for depth in range(1, len(parts))
            )

        # For arrays of structs, remove the child columns, otherwise the query
        # will fail.
        cols = [col for col in cols if not has_array_ancestor(col["name"])]

    return super().select_star(
        database,
        table_name,
        engine,
        schema,
        limit,
        show_cols,
        indent,
        latest_partition,
        cols,
    )
|
||||
|
||||
@classmethod
def _get_fields(cls, cols: List[Dict[str, Any]]) -> List[Any]:
    """
    Label columns using their fully qualified name.

    BigQuery supports columns of type `struct`, which are basically
    dictionaries. When loading metadata for a table with struct columns, each
    key in the struct is displayed as a separate pseudo-column, eg:

        author STRUCT<name STRING, email STRING>

    Will be shown as 3 columns:

        - author
        - author.name
        - author.email

    If we select those fields:

        SELECT `author`, `author`.`name`, `author`.`email` FROM table

    The resulting columns will be called "author", "name", and "email". This
    may result in a clash with other columns. To prevent that, we explicitly
    label the columns using their fully qualified name, so we end up with
    "author", "author__name" and "author__email", respectively.

    :param cols: column metadata dictionaries; only the ``"name"`` key is read
    :return: the labelled column expressions, in the same order as ``cols``
    """
    labelled = []
    for col in cols:
        name = col["name"]
        # Dots are replaced with a double underscore to form the label.
        alias = name.replace(".", "__")
        labelled.append(column(name).label(alias))
    return labelled
|
||||
|
||||
Reference in New Issue
Block a user