fix(datasets): prevent double time filter application in virtual datasets (#35890)

Co-authored-by: Claude <noreply@anthropic.com>
2026-04-18 15:44:57 +00:00 · 2025-11-17 16:47:10 -05:00
parent 9d06a5888f
commit 3b226038ba
2 changed files with 400 additions and 9 deletions
--- a/superset/models/helpers.py
+++ b/superset/models/helpers.py
@@ -2074,6 +2074,11 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
            metrics_exprs = []

        time_filters = []
+
+        # Process FROM clause early to populate removed_filters from virtual dataset
+        # templates before we decide whether to add time filters
+        tbl, cte = self.get_from_clause(template_processor)
+
        if granularity:
            if granularity not in columns_by_name or not dttm_col:
                raise QueryObjectValidationError(
@@ -2096,6 +2101,7 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                self.always_filter_main_dttm
                and self.main_dttm_col in self.dttm_cols
                and self.main_dttm_col != dttm_col.column_name
+                and self.main_dttm_col not in removed_filters
            ):
                time_filters.append(
                    self.get_time_filter(
@@ -2106,13 +2112,21 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                    )
                )

-            time_filter_column = self.get_time_filter(
-                time_col=dttm_col,
-                start_dttm=from_dttm,
-                end_dttm=to_dttm,
-                template_processor=template_processor,
+            # Check if time filter should be skipped because it was handled in template.
+            # Check both the actual column name and __timestamp alias
+            should_skip_time_filter = (
+                dttm_col.column_name in removed_filters
+                or utils.DTTM_ALIAS in removed_filters
            )
-            time_filters.append(time_filter_column)
+
+            if not should_skip_time_filter:
+                time_filter_column = self.get_time_filter(
+                    time_col=dttm_col,
+                    start_dttm=from_dttm,
+                    end_dttm=to_dttm,
+                    template_processor=template_processor,
+                )
+                time_filters.append(time_filter_column)

        # Always remove duplicates by column name, as sometimes `metrics_exprs`
        # can have the same name as a groupby column (e.g. when users use
@@ -2131,8 +2145,6 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods

        qry = sa.select(select_exprs)

-        tbl, cte = self.get_from_clause(template_processor)
-
        if groupby_all_columns:
            qry = qry.group_by(*groupby_all_columns.values())

@@ -2177,7 +2189,17 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                    is_metric_filter = True
            filter_grain = flt.get("grain")

-            if get_column_name(flt_col) in removed_filters:
+            # Check if this filter should be skipped because it was handled in
+            # template. Special handling for __timestamp alias: check both the
+            # alias and the actual column name
+            filter_col_name = get_column_name(flt_col)
+            should_skip_filter = filter_col_name in removed_filters
+            if not should_skip_filter and flt_col == utils.DTTM_ALIAS and col_obj:
+                # For __timestamp, also check if the actual datetime column was
+                # removed
+                should_skip_filter = col_obj.column_name in removed_filters
+
+            if should_skip_filter:
                # Skip generating SQLA filter when the jinja template handles it.
                continue

--- a/tests/unit_tests/models/test_time_filter_double_application.py
+++ b/tests/unit_tests/models/test_time_filter_double_application.py
@@ -0,0 +1,369 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test for double time filter application in virtual datasets (Issue #34894)"""
+
+from __future__ import annotations
+
+from datetime import datetime
+
+from flask import Flask
+from freezegun import freeze_time
+from pytest_mock import MockerFixture
+
+from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn
+from superset.models.core import Database
+from superset.superset_typing import QueryObjectDict
+
+
+def test_time_filter_applied_once_with_remove_filter_in_virtual_dataset(
+    mocker: MockerFixture,
+    app: Flask,
+) -> None:
+    """
+    Test that time filter is applied only once when using get_time_filter with
+    remove_filter=True in a virtual dataset.
+
+    This test verifies the fix for GitHub issue #34894 where time filters were
+    being applied twice - once in the inner query (virtual dataset) and again in
+    the outer query.
+
+    Expected behavior: When remove_filter=True is used in the virtual dataset,
+    the time filter should only be applied in the inner query, not in the outer query.
+    """
+    # use a sqlite database just for the dttm conversion from its DB engine spec
+    database = Database(
+        id=1,
+        database_name="test_db",
+        sqlalchemy_uri="sqlite://",
+    )
+
+    # Create a virtual dataset that uses get_time_filter with
+    # remove_filter=True
+    virtual_dataset_sql = """
+    SELECT *
+    FROM my_table
+    WHERE
+        dttm >= {{
+            get_time_filter('dttm', remove_filter=True, target_type='DATE')
+                .from_expr
+        }}
+        AND dttm < {{
+            get_time_filter('dttm', remove_filter=True, target_type='DATE')
+                .to_expr
+        }}
+    """
+
+    columns = [
+        TableColumn(column_name="dttm", is_dttm=1, type="TIMESTAMP"),
+        TableColumn(column_name="value", type="INTEGER"),
+    ]
+
+    virtual_dataset = SqlaTable(
+        table_name="virtual_dataset",
+        sql=virtual_dataset_sql,
+        columns=columns,
+        main_dttm_col="dttm",
+        database=database,
+    )
+
+    # Mock the security manager to avoid RLS checks
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.get_guest_rls_filters",
+        return_value=[],
+    )
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.is_guest_user",
+        return_value=False,
+    )
+
+    # Create a query object with a time filter
+    query_obj: QueryObjectDict = {
+        "granularity": "dttm",
+        "from_dttm": datetime(2024, 1, 1),
+        "to_dttm": datetime(2024, 1, 31),
+        "is_timeseries": False,
+        "filter": [
+            {
+                "col": "dttm",
+                "op": "TEMPORAL_RANGE",
+                "val": "2024-01-01 : 2024-01-31",
+            }
+        ],
+        "metrics": [],
+        "columns": ["value"],
+    }
+
+    with (
+        freeze_time("2024-01-15"),
+        app.test_request_context(
+            json={
+                "queries": [
+                    {
+                        "filters": [
+                            {
+                                "col": "dttm",
+                                "op": "TEMPORAL_RANGE",
+                                "val": "2024-01-01 : 2024-01-31",
+                            }
+                        ],
+                    }
+                ],
+            }
+        ),
+    ):
+        # Get the query SQL
+        sqla_query = virtual_dataset.get_query_str_extended(query_obj, mutate=False)
+        generated_sql = sqla_query.sql.lower()
+
+        # Verify that the time filter appears only in the inner query from the
+        # virtual dataset and NOT in the outer query WHERE clause
+
+        # Check if there's a WHERE clause after the subquery closes
+        # Split by "as virtual_table" and check if WHERE is in outer part
+        if "as virtual_table" in generated_sql:
+            outer_part = generated_sql.split("as virtual_table")[1]
+            # Extract just the part before GROUP BY/ORDER BY/LIMIT to check for WHERE
+            outer_where_part = (
+                outer_part.split("group by")[0]
+                if "group by" in outer_part
+                else outer_part
+            )
+            outer_where_part = (
+                outer_where_part.split("order by")[0]
+                if "order by" in outer_where_part
+                else outer_where_part
+            )
+            outer_where_part = (
+                outer_where_part.split("limit")[0]
+                if "limit" in outer_where_part
+                else outer_where_part
+            )
+            has_outer_where = "where" in outer_where_part and "dttm" in outer_where_part
+        else:
+            has_outer_where = False
+
+        # Assert that there's NO time filter in the outer query
+        outer_clause = outer_where_part if "outer_where_part" in locals() else "N/A"
+        assert not has_outer_where, (
+            f"Time filter should only be applied in the inner query "
+            f"(virtual dataset), not in the outer query when using "
+            f"remove_filter=True. Found outer WHERE clause: {outer_clause}"
+        )
+
+
+def test_time_filter_removed_from_outer_query_simple_case(
+    mocker: MockerFixture, app: Flask
+) -> None:
+    """
+    Test the basic mechanism: when a Jinja template uses get_time_filter with
+    remove_filter=True, the filter should be added to removed_filters list and
+    not be applied in the SQLAlchemy query generation.
+
+    This is a simpler test that verifies the core mechanism works at a single level.
+    """
+    # use a sqlite database just for the dttm conversion from its DB engine spec
+    database = Database(
+        id=1,
+        database_name="test_db",
+        sqlalchemy_uri="sqlite://",
+    )
+
+    columns = [
+        TableColumn(column_name="dttm", is_dttm=1, type="TIMESTAMP"),
+        TableColumn(column_name="value", type="INTEGER"),
+    ]
+
+    # Simple query with Jinja template that removes the filter
+    simple_sql = """
+    SELECT *
+    FROM my_table
+    WHERE dttm >= {{
+        get_time_filter('dttm', remove_filter=True, target_type='DATE').from_expr
+    }}
+    """
+
+    dataset = SqlaTable(
+        table_name="test_table",
+        sql=simple_sql,
+        columns=columns,
+        main_dttm_col="dttm",
+        database=database,
+    )
+
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.get_guest_rls_filters",
+        return_value=[],
+    )
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.is_guest_user",
+        return_value=False,
+    )
+
+    query_obj: QueryObjectDict = {
+        "granularity": "dttm",
+        "from_dttm": datetime(2024, 1, 1),
+        "to_dttm": datetime(2024, 1, 31),
+        "is_timeseries": False,
+        "filter": [
+            {
+                "col": "dttm",
+                "op": "TEMPORAL_RANGE",
+                "val": "2024-01-01 : 2024-01-31",
+            }
+        ],
+        "metrics": [],
+        "columns": ["value"],
+    }
+
+    with (
+        freeze_time("2024-01-15"),
+        app.test_request_context(
+            json={
+                "queries": [
+                    {
+                        "filters": [
+                            {
+                                "col": "dttm",
+                                "op": "TEMPORAL_RANGE",
+                                "val": "2024-01-01 : 2024-01-31",
+                            }
+                        ],
+                    }
+                ],
+            }
+        ),
+    ):
+        sqla_query = dataset.get_query_str_extended(query_obj, mutate=False)
+        generated_sql = sqla_query.sql.lower()
+
+        # In this simple case, the filter should appear exactly once
+        # (in the template, not added by SQLAlchemy)
+        # Count occurrences of date filter patterns
+        dttm_count = generated_sql.count("dttm")
+
+        # We expect to see dttm in the SELECT and WHERE (from template only)
+        # But NOT duplicated by SQLAlchemy's filter application
+        assert dttm_count <= 3, (
+            f"Expected at most 3 occurrences of 'dttm' "
+            f"(SELECT, WHERE from template), but found {dttm_count}. "
+            f"This suggests the filter is being applied multiple times."
+        )
+
+
+def test_time_filter_with_timestamp_alias(mocker: MockerFixture, app: Flask) -> None:
+    """
+    Test that remove_filter works correctly when using __timestamp alias.
+
+    This test specifically addresses the issue where time filters use the __timestamp
+    alias in timeseries queries, but get_time_filter uses the actual column name.
+    The fix ensures both are recognized as the same column.
+    """
+    # use a sqlite database just for the dttm conversion from its DB engine spec
+    database = Database(
+        id=1,
+        database_name="test_db",
+        sqlalchemy_uri="sqlite://",
+    )
+
+    columns = [
+        TableColumn(column_name="dttm", is_dttm=1, type="TIMESTAMP"),
+        TableColumn(column_name="value", type="INTEGER"),
+    ]
+
+    # Virtual dataset with Jinja template that removes the filter using
+    # actual column name
+    virtual_sql = """
+    SELECT *
+    FROM my_table
+    WHERE dttm >= {{
+        get_time_filter('dttm', remove_filter=True, target_type='DATE').from_expr
+    }}
+      AND dttm < {{
+        get_time_filter('dttm', remove_filter=True, target_type='DATE').to_expr
+    }}
+    """
+
+    dataset = SqlaTable(
+        table_name="test_table",
+        sql=virtual_sql,
+        columns=columns,
+        main_dttm_col="dttm",
+        database=database,
+        metrics=[SqlMetric(metric_name="count", expression="COUNT(*)")],
+    )
+
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.get_guest_rls_filters",
+        return_value=[],
+    )
+    mocker.patch(
+        "superset.connectors.sqla.models.security_manager.is_guest_user",
+        return_value=False,
+    )
+
+    # Query using __timestamp alias (as timeseries charts do)
+    query_obj: QueryObjectDict = {
+        "granularity": "dttm",
+        "from_dttm": datetime(2024, 1, 1),
+        "to_dttm": datetime(2024, 1, 31),
+        "is_timeseries": True,  # This will cause __timestamp alias to be used
+        "filter": [
+            {
+                "col": "__timestamp",  # Using the special alias
+                "op": "TEMPORAL_RANGE",
+                "val": "2024-01-01 : 2024-01-31",
+            }
+        ],
+        "metrics": ["count"],
+        "columns": [],
+    }
+
+    with (
+        freeze_time("2024-01-15"),
+        app.test_request_context(
+            json={
+                "queries": [
+                    {
+                        "filters": [
+                            {
+                                "col": "__timestamp",
+                                "op": "TEMPORAL_RANGE",
+                                "val": "2024-01-01 : 2024-01-31",
+                            }
+                        ],
+                    }
+                ],
+            }
+        ),
+    ):
+        sqla_query = dataset.get_query_str_extended(query_obj, mutate=False)
+        generated_sql = sqla_query.sql.lower()
+
+        # The filter should only appear in the inner query (virtual dataset)
+        # NOT in the outer query WHERE clause
+        # Check if there's a WHERE clause after the subquery
+        parts = generated_sql.split(")")
+        if len(parts) > 1:
+            outer_part = parts[-1]
+            # The outer part should not have additional date filter conditions
+            # (It may have GROUP BY, ORDER BY, LIMIT, but not WHERE with dttm)
+            if "where" in outer_part:
+                assert "dttm" not in outer_part or "to_date" not in outer_part, (
+                    "Time filter should not be applied in outer query when using "
+                    "remove_filter=True in virtual dataset. The __timestamp alias "
+                    "should be recognized as equivalent to the actual column name."
+                )