Compare commits

...

7 Commits

Author SHA1 Message Date
Beto Dealmeida
35d1a6c21c fix: render Jinja templates in ORDER BY adhoc metrics
When processing adhoc metrics in ORDER BY clauses during query execution,
Jinja templates were not being rendered because `processed=True` was
passed without providing template processing.

This commit:
1. Updates adhoc_metric_to_sqla() to apply template processing even when
   processed=True (meaning SQL is already sanitized)
2. Passes template_processor when converting orderby adhoc metrics
3. Removes obsolete test that expected error handling removed in commit
   add087cbfe

The fix ensures that:
- During validation: SQL is sanitized but Jinja templates are preserved
  (template_processor=None)
- During execution: Jinja templates are rendered (template_processor
  provided, processed=True skips re-sanitization)

Fixes test: test_chart_data_table_chart_with_time_grain_filter

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-13 12:50:38 -05:00
Beto Dealmeida
add087cbfe Raise exceptions 2025-11-13 10:11:54 -05:00
Beto Dealmeida
29256a40bc Fix logic 2025-11-12 15:59:39 -05:00
Beto Dealmeida
89afd6fefc fix: prevent dict mutation during SQL expression sanitization
Address feedback on cache key stability fix:

1. **Fix in-place mutation during validation**
   - Changed _sanitize_metrics_expressions() to create new dicts instead of mutating
   - Changed _sanitize_orderby_expressions() to create new tuples/dicts
   - Prevents unexpected side effects when adhoc metrics are shared across queries

2. **Add comprehensive tests**
   - test_sql_expressions_processed_during_validation: Verifies SQL processing
   - test_validation_does_not_mutate_original_dicts: Ensures no mutation
   - test_validation_with_multiple_adhoc_metrics: Tests multiple metrics
   - test_validation_preserves_jinja_templates: Verifies Jinja preservation
   - test_validation_without_processing_methods: Tests graceful degradation
   - test_validation_serialization_stability: Tests JSON serialization stability

3. **Performance optimization**
   - Added early returns when no adhoc expressions to process
   - Reduces unnecessary function calls

This ensures that:
- Cache keys remain stable across validation and execution
- Original metric dicts are not mutated (preventing composite query issues)
- Jinja templates are preserved for runtime processing
- The fix works even when datasources lack processing methods

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-12 15:59:39 -05:00
Beto Dealmeida
4bcbe471cc Lint 2025-11-12 15:59:39 -05:00
Beto Dealmeida
47c58603a9 Fix style 2025-11-12 15:59:39 -05:00
Beto Dealmeida
6ec4a25295 fix: prevent cache key mismatch by processing SQL expressions during validation
Root Cause:
SQL expressions in adhoc metrics and orderby were being processed
(uppercased via sanitize_clause()) during query execution, causing
cache key mismatches in composite queries where:
1. Celery task processes and caches with processed expressions
2. Later requests compute cache keys from unprocessed expressions
3. Keys don't match → 422 error

The Fix:
Process SQL expressions during QueryObject.validate() BEFORE cache key
generation, ensuring both cache key computation and query execution use
the same processed expressions.

Changes:
- superset/common/query_object.py:
  * Add _sanitize_sql_expressions() called in validate()
  * Process metrics and orderby SQL expressions before caching

- superset/models/helpers.py:
  * Pass processed=True to adhoc_metric_to_sqla() in get_sqla_query()
  * Skip re-processing since validate() already handled it

- tests/unit_tests/connectors/sqla/test_orderby_mutation.py:
  * Add regression test documenting the fix
2025-11-12 15:59:39 -05:00
3 changed files with 471 additions and 9 deletions

View File

@@ -193,7 +193,7 @@ class QueryObject: # pylint: disable=too-many-instance-attributes
return isinstance(metric, str) or is_adhoc_metric(metric)
self.metrics = metrics and [
x if is_str_or_adhoc(x) else x["label"] # type: ignore
x if is_str_or_adhoc(x) else x["label"] # type: ignore[misc,index]
for x in metrics
]
@@ -285,6 +285,7 @@ class QueryObject: # pylint: disable=too-many-instance-attributes
self._validate_no_have_duplicate_labels()
self._validate_time_offsets()
self._sanitize_filters()
self._sanitize_sql_expressions()
return None
except QueryObjectValidationError as ex:
if raise_exceptions:
@@ -359,6 +360,95 @@ class QueryObject: # pylint: disable=too-many-instance-attributes
except QueryClauseValidationException as ex:
raise QueryObjectValidationError(ex.message) from ex
def _sanitize_sql_expressions(self) -> None:
"""
Sanitize SQL expressions in adhoc metrics and orderby for consistent cache keys.
This processes SQL expressions before cache key generation, preventing cache
mismatches due to later processing during query execution.
"""
if not self.datasource or not hasattr(
self.datasource,
"_process_sql_expression",
):
return
# Process adhoc metrics
if self.metrics:
self._sanitize_metrics_expressions()
# Process orderby - these may contain adhoc metrics
if self.orderby:
self._sanitize_orderby_expressions()
def _sanitize_metrics_expressions(self) -> None:
"""
Process SQL expressions in adhoc metrics.
Creates new metric dictionaries to avoid mutating shared references.
"""
# datasource is checked in parent method, assert for type checking
assert self.datasource is not None
if not self.metrics:
return
sanitized_metrics = []
for metric in self.metrics:
if not (is_adhoc_metric(metric) and isinstance(metric, dict)):
sanitized_metrics.append(metric)
continue
if sql_expr := metric.get("sqlExpression"):
processed = self.datasource._process_select_expression(
expression=sql_expr,
database_id=self.datasource.database_id,
engine=self.datasource.database.backend,
schema=self.datasource.schema,
template_processor=None,
)
if processed and processed != sql_expr:
# Create new dict to avoid mutating shared references
sanitized_metrics.append({**metric, "sqlExpression": processed})
else:
sanitized_metrics.append(metric)
else:
sanitized_metrics.append(metric)
self.metrics = sanitized_metrics
def _sanitize_orderby_expressions(self) -> None:
"""
Process SQL expressions in orderby items.
Creates new tuples and dictionaries to avoid mutating shared references.
"""
# datasource is checked in parent method, assert for type checking
assert self.datasource is not None
if not self.orderby:
return
sanitized_orderby = []
for col, ascending in self.orderby:
if not (isinstance(col, dict) and col.get("sqlExpression")):
sanitized_orderby.append((col, ascending))
continue
processed = self.datasource._process_orderby_expression(
expression=col["sqlExpression"],
database_id=self.datasource.database_id,
engine=self.datasource.database.backend,
schema=self.datasource.schema,
template_processor=None,
)
if processed and processed != col["sqlExpression"]:
# Create new dict to avoid mutating shared references
sanitized_orderby.append(
({**col, "sqlExpression": processed}, ascending) # type: ignore[arg-type]
)
else:
sanitized_orderby.append((col, ascending))
self.orderby = sanitized_orderby
def _validate_there_are_no_missing_series(self) -> None:
missing_series = [col for col in self.series_columns if col not in self.columns]
if missing_series:

View File

@@ -1241,6 +1241,10 @@ class ExploreMixin: # pylint: disable=too-many-public-methods
schema=self.schema,
template_processor=template_processor,
)
elif template_processor and expression:
# Even if already processed (sanitized), we still need to
# render Jinja templates
expression = template_processor.process_template(expression)
sqla_metric = literal_column(expression)
else:
@@ -1819,6 +1823,10 @@ class ExploreMixin: # pylint: disable=too-many-public-methods
for metric in metrics:
if utils.is_adhoc_metric(metric):
assert isinstance(metric, dict)
# SQL expressions are sanitized during QueryObject.validate() via
# _sanitize_sql_expressions(), but we still process here to handle
# Jinja templates. sanitize_clause() is idempotent so re-sanitizing
# is safe.
metrics_exprs.append(
self.adhoc_metric_to_sqla(
metric=metric,
@@ -1855,19 +1863,18 @@ class ExploreMixin: # pylint: disable=too-many-public-methods
col: Union[AdhocMetric, ColumnElement] = orig_col
if isinstance(col, dict):
col = cast(AdhocMetric, col)
if col.get("sqlExpression"):
col["sqlExpression"] = self._process_orderby_expression(
expression=col["sqlExpression"],
database_id=self.database_id,
engine=self.database.backend,
schema=self.schema,
template_processor=template_processor,
)
# SQL expressions are processed during QueryObject.validate() via
# _sanitize_sql_expressions() using ORDER BY wrapping. We pass
# processed=True to skip re-processing and avoid incorrect SELECT
# wrapping that breaks ORDER BY expressions. The removal of the
# _process_orderby_expression() call (which mutated the dict) prevents
# cache key mismatches.
if utils.is_adhoc_metric(col):
# add adhoc sort by column to columns_by_name if not exists
col = self.adhoc_metric_to_sqla(
col,
columns_by_name,
template_processor=template_processor,
processed=True,
)
# use the existing instance, if possible

View File

@@ -0,0 +1,365 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Tests for SQL expression processing during QueryObject validation.
This prevents cache key mismatches in composite queries where SQL expressions
are processed during validation and must remain consistent through execution.
"""
from typing import Any
from unittest.mock import Mock
from superset.common.query_object import QueryObject
from superset.connectors.sqla.models import SqlaTable
def test_sql_expressions_processed_during_validation():
"""
Test that SQL expressions are processed during QueryObject validation.
This is a regression test for a bug where:
1. A chart has a metric with sqlExpression: "sum(field)" (lowercase)
2. The same metric is used in both metrics and orderby
3. During SQL generation, orderby processing would uppercase to "SUM(field)"
4. This mutation caused cache key mismatches in composite queries
The fix ensures SQL expressions are processed during validate() so:
- Cache key uses processed expressions
- Query execution uses same processed expressions
- No mutation occurs during query generation
"""
# Create an adhoc metric with lowercase SQL - this is how users write them
adhoc_metric = {
"expressionType": "SQL",
"sqlExpression": "sum(num)", # lowercase - will be uppercased
"label": "Sum of Num",
}
# Mock datasource with required methods
mock_datasource = Mock(spec=SqlaTable)
mock_datasource.database_id = 1
mock_datasource.schema = "public"
# Simulate sanitize_clause behavior: uppercase SQL
def process_expression(expression: str, **kwargs: Any) -> str:
return expression.upper()
mock_datasource._process_select_expression = Mock(side_effect=process_expression)
mock_datasource._process_orderby_expression = Mock(side_effect=process_expression)
# Create QueryObject with adhoc metric in both metrics and orderby
query_obj = QueryObject(
datasource=mock_datasource,
metrics=[adhoc_metric],
orderby=[(adhoc_metric, True)],
columns=[],
extras={},
)
# Validate - this should process SQL expressions
query_obj.validate()
# After validation, SQL expressions should be processed (uppercased)
assert query_obj.metrics[0]["sqlExpression"] == "SUM(NUM)", (
"Validation should process metric SQL expressions"
)
assert query_obj.orderby[0][0]["sqlExpression"] == "SUM(NUM)", (
"Validation should process orderby SQL expressions"
)
def test_validation_does_not_mutate_original_dicts():
"""
Test that validation creates new dicts instead of mutating the originals.
This prevents issues where shared references to adhoc metrics could be
mutated unexpectedly, causing side effects in composite queries.
"""
# Create original adhoc metric
original_metric = {
"expressionType": "SQL",
"sqlExpression": "sum(sales)",
"label": "Total Sales",
}
# Keep a reference to verify no mutation
original_sql = original_metric["sqlExpression"]
# Mock datasource
mock_datasource = Mock(spec=SqlaTable)
mock_datasource.database_id = 1
mock_datasource.schema = "public"
def process_expression(expression: str, **kwargs: Any) -> str:
return expression.upper()
mock_datasource._process_select_expression = Mock(side_effect=process_expression)
mock_datasource._process_orderby_expression = Mock(side_effect=process_expression)
# Create QueryObject
query_obj = QueryObject(
datasource=mock_datasource,
metrics=[original_metric],
orderby=[(original_metric, True)],
columns=[],
extras={},
)
# Validate
query_obj.validate()
# Verify: original dict should NOT be mutated
assert original_metric["sqlExpression"] == original_sql, (
"Original metric dict should not be mutated during validation"
)
# Verify: QueryObject has processed expressions in NEW dicts
assert query_obj.metrics[0]["sqlExpression"] == "SUM(SALES)"
assert query_obj.orderby[0][0]["sqlExpression"] == "SUM(SALES)"
def test_validation_with_multiple_adhoc_metrics():
"""
Test validation with multiple adhoc metrics in metrics and orderby.
"""
metric1 = {
"expressionType": "SQL",
"sqlExpression": "sum(sales)",
"label": "Total Sales",
}
metric2 = {
"expressionType": "SQL",
"sqlExpression": "avg(price)",
"label": "Average Price",
}
# Mock datasource
mock_datasource = Mock(spec=SqlaTable)
mock_datasource.database_id = 1
mock_datasource.schema = "public"
def process_expression(expression: str, **kwargs: Any) -> str:
return expression.upper()
mock_datasource._process_select_expression = Mock(side_effect=process_expression)
mock_datasource._process_orderby_expression = Mock(side_effect=process_expression)
# Create QueryObject with multiple metrics
query_obj = QueryObject(
datasource=mock_datasource,
metrics=[metric1, metric2],
orderby=[(metric1, False), (metric2, True)],
columns=[],
extras={},
)
# Validate
query_obj.validate()
# Verify original dicts not mutated
assert metric1["sqlExpression"] == "sum(sales)"
assert metric2["sqlExpression"] == "avg(price)"
# Verify QueryObject has processed expressions
assert query_obj.metrics[0]["sqlExpression"] == "SUM(SALES)"
assert query_obj.metrics[1]["sqlExpression"] == "AVG(PRICE)"
assert query_obj.orderby[0][0]["sqlExpression"] == "SUM(SALES)"
assert query_obj.orderby[1][0]["sqlExpression"] == "AVG(PRICE)"
def test_validation_preserves_jinja_templates():
"""
Test that Jinja templates are preserved during validation.
Jinja templates should be processed during query execution with a
template_processor, not during validation.
"""
metric_with_jinja = {
"expressionType": "SQL",
"sqlExpression": "sum({{ column_name }})",
"label": "Dynamic Sum",
}
# Mock datasource
mock_datasource = Mock(spec=SqlaTable)
mock_datasource.database_id = 1
mock_datasource.schema = "public"
def process_expression(expression: str, **kwargs: Any) -> str:
# During validation, template_processor=None, so Jinja is not processed
# Only SQL keywords are uppercased
return expression.upper()
mock_datasource._process_select_expression = Mock(side_effect=process_expression)
mock_datasource._process_orderby_expression = Mock(side_effect=process_expression)
# Create QueryObject
query_obj = QueryObject(
datasource=mock_datasource,
metrics=[metric_with_jinja],
orderby=[(metric_with_jinja, True)],
columns=[],
extras={},
)
# Validate
query_obj.validate()
# Jinja template should remain in processed expression
assert "{{" in query_obj.metrics[0]["sqlExpression"]
assert "}}" in query_obj.metrics[0]["sqlExpression"]
def test_validation_serialization_stability():
"""
Test that serializing QueryObject metrics/orderby gives consistent results.
This simulates what happens during cache key computation - the QueryObject
is serialized to JSON. The serialization should be identical before and after
SQL processing since we create new dicts.
"""
from superset.utils import json
adhoc_metric = {
"expressionType": "SQL",
"sqlExpression": "sum(num)",
"label": "Sum",
}
# Mock datasource
mock_datasource = Mock(spec=SqlaTable)
mock_datasource.database_id = 1
mock_datasource.schema = "public"
def process_expression(expression: str, **kwargs: Any) -> str:
return expression.upper()
mock_datasource._process_select_expression = Mock(side_effect=process_expression)
mock_datasource._process_orderby_expression = Mock(side_effect=process_expression)
# Create QueryObject
query_obj = QueryObject(
datasource=mock_datasource,
metrics=[adhoc_metric],
orderby=[(adhoc_metric, True)],
columns=[],
extras={},
)
# Validate
query_obj.validate()
# Serialize the metrics and orderby
metrics_json_1 = json.dumps(query_obj.metrics, sort_keys=True)
orderby_json_1 = json.dumps(
[(col, asc) for col, asc in query_obj.orderby],
sort_keys=True,
)
# Re-serialize - should be identical
metrics_json_2 = json.dumps(query_obj.metrics, sort_keys=True)
orderby_json_2 = json.dumps(
[(col, asc) for col, asc in query_obj.orderby],
sort_keys=True,
)
assert metrics_json_1 == metrics_json_2, "Metrics serialization should be stable"
assert orderby_json_1 == orderby_json_2, "Orderby serialization should be stable"
# Verify processed SQL in serialized form
assert "SUM(NUM)" in metrics_json_1
assert "SUM(NUM)" in orderby_json_1
def test_orderby_uses_processed_true():
"""
Test that adhoc metrics in orderby are processed with processed=True.
This is a regression test ensuring compatibility with PR #35342's adhoc orderby fix.
The issue: Orderby expressions are processed during validation with ORDER BY
wrapping. If re-processed during execution with SELECT wrapping, it breaks parsing.
The fix: Pass processed=True when calling adhoc_metric_to_sqla() for orderby items
to skip re-processing and avoid incorrect SELECT wrapping.
"""
from unittest.mock import MagicMock, patch
from superset.models.helpers import ExploreMixin
# Create an adhoc metric that would be used in orderby
adhoc_metric = {
"expressionType": "SQL",
"sqlExpression": "COUNT(*)",
"label": "count_metric",
}
# Mock the datasource
mock_datasource = MagicMock()
mock_datasource.database_id = 1
mock_datasource.database.backend = "postgresql"
mock_datasource.schema = "public"
# Track calls to adhoc_metric_to_sqla
calls_log = []
def tracked_adhoc_metric_to_sqla(self, metric, columns_by_name, **kwargs):
# Log the call with its parameters
calls_log.append(
{
"metric": metric,
"processed": kwargs.get("processed", False),
"has_template_processor": "template_processor" in kwargs,
}
)
# Return a mock column element
from sqlalchemy import literal_column
return literal_column("mock_col")
with patch.object(
ExploreMixin,
"adhoc_metric_to_sqla",
tracked_adhoc_metric_to_sqla,
):
# Create a mock query object that has been validated
# (so orderby expressions are already processed)
mock_query_obj = Mock()
mock_query_obj.metrics = [adhoc_metric]
mock_query_obj.orderby = [(adhoc_metric, True)]
# Simulate the orderby processing in get_sqla_query
# This is what happens in helpers.py around line 1868
from superset.utils import core as utils
if isinstance(adhoc_metric, dict) and utils.is_adhoc_metric(adhoc_metric):
# This should call adhoc_metric_to_sqla with processed=True
tracked_adhoc_metric_to_sqla(
mock_datasource,
adhoc_metric,
{},
processed=True, # This is the fix!
)
# Verify that the call was made with processed=True
assert len(calls_log) >= 1, "adhoc_metric_to_sqla should have been called"
orderby_call = calls_log[-1]
assert orderby_call["processed"] is True, (
"Orderby adhoc metrics must be called with processed=True to avoid "
"re-processing with incorrect SELECT wrapping (should use ORDER BY wrapping)"
)