fix(sql): quote identifiers in transpile_to_dialect to fix case-sensitive column filters (#39521)

This commit is contained in:
Alexandru Soare
2026-05-06 10:53:09 +03:00
committed by GitHub
parent d7663a9a1c
commit adfbbf1433
3 changed files with 137 additions and 3 deletions

View File

@@ -360,11 +360,18 @@ class QueryObject: # pylint: disable=too-many-instance-attributes
engine = database.db_engine_spec.engine
if needs_transpilation:
clause = transpile_to_dialect(clause, engine)
# source_engine=engine ensures idempotency: this
# method can run more than once (validate() is called
# from both raise_for_access and get_df_payload), so
# the second pass must be able to re-parse the
# dialect-specific output (e.g. BigQuery backticks)
# produced by the first pass.
clause = transpile_to_dialect(
clause, engine, source_engine=engine, identify=True
)
sanitized_clause = sanitize_clause(clause, engine)
if sanitized_clause != clause:
self.extras[param] = sanitized_clause
self.extras[param] = sanitized_clause
except QueryClauseValidationException as ex:
raise QueryObjectValidationError(ex.message) from ex

View File

@@ -1568,6 +1568,7 @@ def transpile_to_dialect(
sql: str,
target_engine: str,
source_engine: str | None = None,
identify: bool = False,
) -> str:
"""
Transpile SQL from one database dialect to another using SQLGlot.
@@ -1576,6 +1577,7 @@ def transpile_to_dialect(
sql: The SQL query to transpile
target_engine: The target database engine (e.g., "mysql", "postgresql")
source_engine: The source database engine. If None, uses generic SQL dialect.
identify: If True, quote all identifiers per the target dialect.
Returns:
The transpiled SQL string
@@ -1598,6 +1600,7 @@ def transpile_to_dialect(
copy=True,
comments=False,
pretty=False,
identify=identify,
)
except ParseError as ex:
raise QueryClauseValidationException(f"Cannot parse SQL clause: {sql}") from ex

View File

@@ -396,3 +396,127 @@ def test_transpile_unknown_source_engine_uses_generic() -> None:
"SELECT * FROM orders", "postgresql", "unknown_engine"
)
assert result == "SELECT * FROM orders"
# Tests for identify=True (identifier quoting)
@pytest.mark.parametrize(
"sql,dialect,expected",
[
# PostgreSQL - double-quoted identifiers
(
"STATE ILIKE '%AL%'",
"postgresql",
"\"STATE\" ILIKE '%AL%'",
),
# MySQL - backtick-quoted identifiers, ILIKE transpiled
(
"STATE ILIKE '%AL%'",
"mysql",
"LOWER(`STATE`) LIKE LOWER('%AL%')",
),
# BigQuery - backtick-quoted identifiers, ILIKE transpiled
(
"STATE ILIKE '%AL%'",
"bigquery",
"LOWER(`STATE`) LIKE LOWER('%AL%')",
),
# Snowflake - double-quoted identifiers
(
"STATE ILIKE '%AL%'",
"snowflake",
"\"STATE\" ILIKE '%AL%'",
),
# MSSQL - bracket-quoted identifiers
(
"STATE = 'CA'",
"mssql",
"[STATE] = 'CA'",
),
# Compound filter with multiple identifiers
(
"STATE = 'CA' AND AIRLINE = 'Delta'",
"postgresql",
"\"STATE\" = 'CA' AND \"AIRLINE\" = 'Delta'",
),
# Lowercase identifiers also get quoted
(
"name = 'test'",
"postgresql",
"\"name\" = 'test'",
),
],
)
def test_identify_quotes_identifiers(sql: str, dialect: str, expected: str) -> None:
"""Test that identify=True quotes identifiers per target dialect."""
assert transpile_to_dialect(sql, dialect, identify=True) == expected
def test_identify_unknown_engine_returns_unchanged() -> None:
"""Test that identify=True has no effect on unknown engines."""
sql = "STATE = 'CA'"
assert transpile_to_dialect(sql, "unknown_engine", identify=True) == sql
@pytest.mark.parametrize(
"sql,engine,expected",
[
(
"STATE ILIKE '%AL%'",
"postgresql",
"\"STATE\" ILIKE '%AL%'",
),
(
"country ILIKE '%Italy%'",
"bigquery",
"LOWER(`country`) LIKE LOWER('%Italy%')",
),
],
)
def test_identify_with_source_engine(sql: str, engine: str, expected: str) -> None:
"""Test identify=True with source_engine matching target engine."""
result = transpile_to_dialect(sql, engine, source_engine=engine, identify=True)
assert result == expected
@pytest.mark.parametrize(
"engine",
["postgresql", "bigquery", "mysql", "snowflake"],
)
def test_identify_transpilation_is_idempotent(engine: str) -> None:
"""Test that transpiling twice produces the same result (idempotent).
This matters because _sanitize_filters() can be called multiple times
via validate().
"""
clause = "STATE ILIKE '%AL%'"
pass1 = transpile_to_dialect(clause, engine, source_engine=engine, identify=True)
pass2 = transpile_to_dialect(pass1, engine, source_engine=engine, identify=True)
assert pass1 == pass2
def test_sanitize_filters_writes_back_transpiled_clause() -> None:
"""Test that _sanitize_filters always persists the transpiled clause.
Regression test: a previous conditional `if sanitized_clause != clause`
skipped the write-back when transpile_to_dialect had already modified
the clause, leaving the original unquoted value in extras.
"""
from unittest.mock import MagicMock
from superset.common.query_object import QueryObject
mock_datasource = MagicMock()
mock_datasource.database.db_engine_spec.engine = "postgresql"
query_obj = QueryObject(
datasource=mock_datasource,
columns=["STATE"],
metrics=[],
extras={
"where": "STATE = 'CA'",
"transpile_to_dialect": True,
},
)
query_obj.validate()
assert '"STATE"' in query_obj.extras["where"]