# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for the helpers imported from ``superset.semantic_layers.cache``.

Covers ``_implies``, ``can_satisfy``, ``_apply_post_processing``,
``shape_key`` and ``value_key``.
"""

from __future__ import annotations

from datetime import datetime
from typing import Any

import pandas as pd
import pyarrow as pa
import pytest
from superset_core.semantic_layers.types import (
    AggregationType,
    Dimension,
    Filter,
    GroupLimit,
    Metric,
    Operator,
    OrderDirection,
    PredicateType,
    SemanticQuery,
    SemanticRequest,
    SemanticResult,
)

from superset.semantic_layers.cache import (
    _apply_post_processing,
    _implies,
    CachedEntry,
    can_satisfy,
    shape_key,
    value_key,
    ViewMeta,
)

# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


def dim(id_: str, name: str | None = None) -> Dimension:
    """Build a utf8-typed ``Dimension``; ``name`` falls back to ``id_``."""
    return Dimension(id=id_, name=name or id_, type=pa.utf8())


def met(
    id_: str,
    name: str | None = None,
    aggregation: AggregationType | None = None,
) -> Metric:
    """Build a float64-typed ``Metric`` with the placeholder definition ``"x"``.

    ``name`` falls back to ``id_``; ``aggregation`` is passed through as given
    (``None`` by default).
    """
    return Metric(
        id=id_,
        name=name or id_,
        type=pa.float64(),
        definition="x",
        aggregation=aggregation,
    )


# Shared dimensions and metrics used throughout the tests below.
COL_A = dim("col.a", "a")
COL_B = dim("col.b", "b")
M_X = met("met.x", "x")
M_Y = met("met.y", "y")

# View metadata passed to shape_key / value_key in the hash-stability tests.
VIEW = ViewMeta(uuid="view-1", changed_on_iso="2026-05-01T00:00:00", cache_timeout=None)


def where(column: Dimension | Metric | None, op: Operator, value: Any) -> Filter:
    """Build a ``PredicateType.WHERE`` filter on ``column``."""
    return Filter(type=PredicateType.WHERE, column=column, operator=op, value=value)


def having(column: Metric, op: Operator, value: Any) -> Filter:
    """Build a ``PredicateType.HAVING`` filter on the metric ``column``."""
    return Filter(type=PredicateType.HAVING, column=column, operator=op, value=value)


def adhoc(definition: str, type_: PredicateType = PredicateType.WHERE) -> Filter:
    """Build a column-less ``Operator.ADHOC`` filter carrying ``definition``."""
    return Filter(type=type_, column=None, operator=Operator.ADHOC, value=definition)


def query(
    filters: set[Filter] | None = None,
    limit: int | None = None,
    order: Any = None,
    dimensions: list[Dimension] | None = None,
    metrics: list[Metric] | None = None,
) -> SemanticQuery:
    """Build a ``SemanticQuery`` with test defaults.

    When not supplied, ``metrics`` defaults to ``[M_X]`` and ``dimensions`` to
    ``[COL_A, COL_B]``. An explicitly passed empty list is kept as-is because
    the fallback only triggers on ``None``.
    """
    return SemanticQuery(
        metrics=metrics if metrics is not None else [M_X],
        dimensions=dimensions if dimensions is not None else [COL_A, COL_B],
        filters=filters,
        order=order,
        limit=limit,
    )


def entry_from(q: SemanticQuery, value_key_: str = "vk") -> CachedEntry:
    """Build the ``CachedEntry`` describing query ``q``.

    Filters and dimension keys are frozen into ``frozenset``s; a missing
    ``filters`` becomes the empty set and a falsy ``offset`` becomes ``0``.
    The order, group-limit and dimension keys come from the cache module's
    private key helpers.
    """
    # Private key helpers, imported at call time rather than at module top.
    from superset.semantic_layers.cache import (
        _dimension_key,
        _group_limit_key,
        _order_key,
    )

    return CachedEntry(
        filters=frozenset(q.filters or set()),
        dimension_keys=frozenset(_dimension_key(d) for d in q.dimensions),
        limit=q.limit,
        offset=q.offset or 0,
        order_key=_order_key(q.order),
        group_limit_key=_group_limit_key(q.group_limit),
        value_key=value_key_,
    )


# ---------------------------------------------------------------------------
# _implies: scalar range pairs
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "new_op,new_val,cached_op,cached_val,expected",
    [
        # narrower lower bound
        (Operator.GREATER_THAN, 20, Operator.GREATER_THAN, 10, True),
        (Operator.GREATER_THAN, 10, Operator.GREATER_THAN, 20, False),
        (Operator.GREATER_THAN_OR_EQUAL, 11, Operator.GREATER_THAN, 10, True),
        (Operator.GREATER_THAN_OR_EQUAL, 10, Operator.GREATER_THAN, 10, False),
        (Operator.GREATER_THAN, 10, Operator.GREATER_THAN_OR_EQUAL, 10, True),
        (Operator.GREATER_THAN, 9, Operator.GREATER_THAN_OR_EQUAL, 10, False),
        # narrower upper bound
        (Operator.LESS_THAN, 5, Operator.LESS_THAN, 10, True),
        (Operator.LESS_THAN_OR_EQUAL, 9, Operator.LESS_THAN, 10, True),
        (Operator.LESS_THAN_OR_EQUAL, 10, Operator.LESS_THAN, 10, False),
        # cross-direction — never implies
        (Operator.LESS_THAN, 5, Operator.GREATER_THAN, 10, False),
        (Operator.GREATER_THAN, 5, Operator.LESS_THAN, 10, False),
        # equals fits in range
        (Operator.EQUALS, 15, Operator.GREATER_THAN, 10, True),
        (Operator.EQUALS, 10, Operator.GREATER_THAN, 10, False),
        (Operator.EQUALS, 10, Operator.GREATER_THAN_OR_EQUAL, 10, True),
    ],
)
def test_implies_range(
    new_op: Operator,
    new_val: Any,
    cached_op: Operator,
    cached_val: Any,
    expected: bool,
) -> None:
    """``_implies(new, cached)`` matches ``expected`` for scalar comparisons.

    Both filters are WHERE predicates on ``COL_A``; the first argument is the
    new filter and the second the cached one.
    """
    assert (
        _implies(where(COL_A, new_op, new_val), where(COL_A, cached_op, cached_val))
        is expected
    )


def test_implies_in_subset() -> None:
    """An IN subset or an EQUALS on a member implies the cached IN filter."""
    cached = where(COL_A, Operator.IN, frozenset({"a", "b", "c"}))
    assert _implies(where(COL_A, Operator.IN, frozenset({"a", "b"})), cached) is True
    assert _implies(where(COL_A, Operator.IN, frozenset({"a", "d"})), cached) is False
    # equals to a value in the cached IN set
    assert _implies(where(COL_A, Operator.EQUALS, "b"), cached) is True
    assert _implies(where(COL_A, Operator.EQUALS, "z"), cached) is False


def test_implies_in_all_in_range() -> None:
    """An IN filter implies a cached range only if every member is in range."""
    cached = where(COL_A, Operator.GREATER_THAN, 10)
    assert _implies(where(COL_A, Operator.IN, frozenset({11, 12})), cached) is True
    # 10 is not > 10, so the whole IN set is rejected.
    assert _implies(where(COL_A, Operator.IN, frozenset({10, 12})), cached) is False


def test_implies_equals_exact() -> None:
    """EQUALS implies a cached EQUALS only for the same value."""
    cached = where(COL_A, Operator.EQUALS, 5)
    assert _implies(where(COL_A, Operator.EQUALS, 5), cached) is True
    assert _implies(where(COL_A, Operator.EQUALS, 6), cached) is False


def test_implies_is_not_null() -> None:
    """A comparison or IS_NOT_NULL implies cached IS_NOT_NULL; IS_NULL does not."""
    cached = where(COL_A, Operator.IS_NOT_NULL, None)
    assert _implies(where(COL_A, Operator.GREATER_THAN, 0), cached) is True
    assert _implies(where(COL_A, Operator.IS_NOT_NULL, None), cached) is True
    assert _implies(where(COL_A, Operator.IS_NULL, None), cached) is False


def test_implies_like_exact_match_only() -> None:
    """A cached LIKE is implied only by an identical LIKE filter.

    A different pattern, or an EQUALS on a value that would match the pattern,
    is expected to return ``False``.
    """
    a = where(COL_A, Operator.LIKE, "foo%")
    b = where(COL_A, Operator.LIKE, "foo%")
    c = where(COL_A, Operator.LIKE, "bar%")
    assert _implies(a, b) is True
    assert _implies(c, b) is False
    assert _implies(where(COL_A, Operator.EQUALS, "fooz"), b) is False


# ---------------------------------------------------------------------------
# can_satisfy
# ---------------------------------------------------------------------------


def test_can_satisfy_empty_cached_returns_all_as_leftovers() -> None:
    """An unfiltered cached entry satisfies a filtered query via leftovers.

    The new query's filter is returned as a leftover and no projection is
    flagged.
    """
    cached_q = query(filters=None)
    new_q = query(filters={where(COL_A, Operator.GREATER_THAN, 5)})
    ok, leftovers, projection = can_satisfy(entry_from(cached_q), new_q)
    assert ok is True
    assert projection is False
    assert leftovers == {where(COL_A, Operator.GREATER_THAN, 5)}


def test_can_satisfy_narrower_filter() -> None:
    """A narrower filter on the same column is satisfiable and left over."""
    cached_q = query(filters={where(COL_A, Operator.GREATER_THAN, 1)})
    new_q = query(filters={where(COL_A, Operator.GREATER_THAN, 2)})
    ok, leftovers, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is True
    assert leftovers == {where(COL_A, Operator.GREATER_THAN, 2)}


def test_can_satisfy_broader_filter_fails() -> None:
    """A broader filter than the cached one is rejected with no leftovers."""
    cached_q = query(filters={where(COL_A, Operator.GREATER_THAN, 2)})
    new_q = query(filters={where(COL_A, Operator.GREATER_THAN, 1)})
    ok, leftovers, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False
    assert leftovers == set()


def test_can_satisfy_missing_constraint_fails() -> None:
    """An unfiltered query cannot be served from a filtered cached entry."""
    cached_q = query(filters={where(COL_A, Operator.GREATER_THAN, 1)})
    new_q = query(filters=None)
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


def test_can_satisfy_new_filter_on_extra_column() -> None:
    """An extra filter on another projected column becomes a leftover too."""
    cached_q = query(filters={where(COL_A, Operator.GREATER_THAN, 1)})
    new_q = query(
        filters={
            where(COL_A, Operator.GREATER_THAN, 2),
            where(COL_B, Operator.EQUALS, "x"),
        }
    )
    ok, leftovers, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is True
    assert leftovers == {
        where(COL_A, Operator.GREATER_THAN, 2),
        where(COL_B, Operator.EQUALS, "x"),
    }


def test_can_satisfy_leftover_on_non_projected_column_fails() -> None:
    """A leftover filter on a column outside the dimensions is rejected."""
    # "other" is filtered on but is not among the query's dimensions.
    other = dim("col.other", "other")
    cached_q = query(filters=None)
    new_q = query(
        filters={where(other, Operator.EQUALS, "x")},
        dimensions=[COL_A, COL_B],
    )
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


def test_can_satisfy_having_requires_exact_set() -> None:
    """HAVING filters must match exactly; a tighter HAVING is rejected."""
    cached_q = query(filters={having(M_X, Operator.GREATER_THAN, 100)})
    same = query(filters={having(M_X, Operator.GREATER_THAN, 100)})
    tighter = query(filters={having(M_X, Operator.GREATER_THAN, 200)})
    ok_same, _, _ = can_satisfy(entry_from(cached_q), same)
    ok_tight, _, _ = can_satisfy(entry_from(cached_q), tighter)
    assert ok_same is True
    assert ok_tight is False


def test_can_satisfy_adhoc_requires_exact_set() -> None:
    """Ad-hoc filters must match exactly; a different definition is rejected."""
    cached_q = query(filters={adhoc("col_a > 1")})
    same = query(filters={adhoc("col_a > 1")})
    different = query(filters={adhoc("col_a > 2")})
    ok_same, _, _ = can_satisfy(entry_from(cached_q), same)
    ok_diff, _, _ = can_satisfy(entry_from(cached_q), different)
    assert ok_same is True
    assert ok_diff is False


# ---------------------------------------------------------------------------
# Limit / order / offset
# ---------------------------------------------------------------------------


def test_can_satisfy_unlimited_cached_satisfies_any_limit() -> None:
    """A cached entry without a limit satisfies a query that has one."""
    cached_q = query(filters=None, limit=None)
    new_q = query(filters=None, limit=10)
    ok, leftovers, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is True
    assert leftovers == set()


def test_can_satisfy_smaller_limit_with_matching_order() -> None:
    """A smaller limit is satisfiable when the ordering is the same."""
    order = [(M_X, OrderDirection.DESC)]
    cached_q = query(filters=None, limit=100, order=order)
    new_q = query(filters=None, limit=10, order=order)
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is True


def test_can_satisfy_smaller_limit_different_order_fails() -> None:
    """A smaller limit is rejected when the ordering differs from the cache."""
    cached_q = query(filters=None, limit=100, order=[(M_X, OrderDirection.DESC)])
    new_q = query(filters=None, limit=10, order=[(M_X, OrderDirection.ASC)])
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


def test_can_satisfy_larger_limit_fails() -> None:
    """A query asking for more rows than the cached limit is rejected."""
    cached_q = query(filters=None, limit=10)
    new_q = query(filters=None, limit=100)
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


def test_can_satisfy_no_new_limit_when_cached_has_one_fails() -> None:
    """An unlimited query is rejected when the cached entry was limited."""
    cached_q = query(filters=None, limit=100)
    new_q = query(filters=None, limit=None)
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


def test_can_satisfy_offset_never_reused() -> None:
    """Entries with an offset are rejected even for an identical query."""
    cached_q = SemanticQuery(metrics=[M_X], dimensions=[COL_A], offset=5)
    new_q = SemanticQuery(metrics=[M_X], dimensions=[COL_A], offset=5)
    ok, _, _ = can_satisfy(entry_from(cached_q), new_q)
    assert ok is False


# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------


def test_apply_post_processing_filters_and_limits() -> None:
    """Leftover filter and limit are applied and a "cache" request is added.

    Of the rows with ``a`` in ``[1, 3, 5, 7, 9]``, ``a > 2`` with ``limit=2``
    is expected to yield ``[3, 5]``.
    """
    df = pd.DataFrame({"a": [1, 3, 5, 7, 9], "x": [10, 20, 30, 40, 50]})
    cached = SemanticResult(
        requests=[SemanticRequest(type="SQL", definition="select ...")],
        results=pa.Table.from_pandas(df, preserve_index=False),
    )
    new_q = query(
        filters={where(COL_A, Operator.GREATER_THAN, 2)},
        limit=2,
    )
    # The final positional argument is the projection flag (False here).
    result = _apply_post_processing(
        cached, new_q, {where(COL_A, Operator.GREATER_THAN, 2)}, False
    )
    result_df = result.results.to_pandas()
    assert list(result_df["a"]) == [3, 5]
    # the cache annotates the requests with a marker
    assert any(req.type == "cache" for req in result.requests)


def test_apply_post_processing_no_leftovers_no_limit_returns_original() -> None:
    """With nothing to apply, the very same result object is returned."""
    df = pd.DataFrame({"a": [1, 2]})
    cached = SemanticResult(
        requests=[], results=pa.Table.from_pandas(df, preserve_index=False)
    )
    new_q = query(filters=None, limit=None)
    out = _apply_post_processing(cached, new_q, set(), False)
    # same object reference is OK; we explicitly return the input
    assert out is cached
# ---------------------------------------------------------------------------
# Hash stability
# ---------------------------------------------------------------------------


def test_value_key_stable_across_metric_order() -> None:
    """``value_key`` is the same regardless of the order of ``metrics``."""
    q1 = SemanticQuery(metrics=[M_X, M_Y], dimensions=[COL_A])
    q2 = SemanticQuery(metrics=[M_Y, M_X], dimensions=[COL_A])
    assert value_key(VIEW, q1) == value_key(VIEW, q2)


def test_shape_key_stable_across_dimension_order() -> None:
    """``shape_key`` is the same regardless of the order of ``dimensions``."""
    q1 = SemanticQuery(metrics=[M_X], dimensions=[COL_A, COL_B])
    q2 = SemanticQuery(metrics=[M_X], dimensions=[COL_B, COL_A])
    assert shape_key(VIEW, q1) == shape_key(VIEW, q2)


def test_shape_key_changes_with_changed_on() -> None:
    """``shape_key`` differs when only the view's ``changed_on_iso`` differs."""
    q = SemanticQuery(metrics=[M_X], dimensions=[COL_A])
    other = ViewMeta(uuid=VIEW.uuid, changed_on_iso="2099-01-01", cache_timeout=None)
    assert shape_key(VIEW, q) != shape_key(other, q)


def test_value_key_changes_with_filter_value() -> None:
    """``value_key`` differs when only a filter's value differs."""
    q1 = SemanticQuery(
        metrics=[M_X],
        dimensions=[COL_A],
        filters={where(COL_A, Operator.GREATER_THAN, 1)},
    )
    q2 = SemanticQuery(
        metrics=[M_X],
        dimensions=[COL_A],
        filters={where(COL_A, Operator.GREATER_THAN, 2)},
    )
    assert value_key(VIEW, q1) != value_key(VIEW, q2)


def test_value_key_with_datetime_filter() -> None:
    """``value_key`` handles a ``datetime`` filter value and keeps its prefix."""
    f = where(COL_A, Operator.GREATER_THAN_OR_EQUAL, datetime(2025, 1, 1))
    q = SemanticQuery(metrics=[M_X], dimensions=[COL_A], filters={f})
    # should not raise
    assert value_key(VIEW, q).startswith("sv:val:")


def test_shape_key_independent_of_dimensions() -> None:
    """Different dimension sets share a ``shape_key`` but not a ``value_key``."""
    # The v2 shape key buckets entries by metric set only; different dimension
    # sets share the same shape so the projection path can find broader entries.
    q1 = SemanticQuery(metrics=[M_X], dimensions=[COL_A, COL_B])
    q2 = SemanticQuery(metrics=[M_X], dimensions=[COL_A])
    assert shape_key(VIEW, q1) == shape_key(VIEW, q2)
    # Value keys still differ.
    assert value_key(VIEW, q1) != value_key(VIEW, q2)


# ---------------------------------------------------------------------------
# Projection (v2)
# ---------------------------------------------------------------------------

# One metric per aggregation type, plus one with no declared aggregation.
# The metric names double as column names in the cached DataFrames below.
M_SUM = met("met.sum", "sum_x", aggregation=AggregationType.SUM)
M_COUNT = met("met.count", "count_x", aggregation=AggregationType.COUNT)
M_MIN = met("met.min", "min_x", aggregation=AggregationType.MIN)
M_MAX = met("met.max", "max_x", aggregation=AggregationType.MAX)
M_AVG = met("met.avg", "avg_x", aggregation=AggregationType.AVG)
M_UNKNOWN = met("met.unknown", "unknown_x", aggregation=None)


def _projection_query(
    metrics: list[Metric],
    new_dimensions: list[Dimension],
    cached_dimensions: list[Dimension],
    cached_filters: set[Filter] | None = None,
    cached_limit: int | None = None,
    new_filters: set[Filter] | None = None,
    new_limit: int | None = None,
    new_order: Any = None,
    new_group_limit: GroupLimit | None = None,
) -> tuple[CachedEntry, SemanticQuery]:
    """Build a (cached entry, new query) pair sharing the same metrics.

    The cached side takes ``cached_dimensions``, ``cached_filters`` and
    ``cached_limit``; the new side takes the ``new_*`` arguments. The cached
    query is converted to a ``CachedEntry`` via ``entry_from``.
    """
    cached_q = SemanticQuery(
        metrics=metrics,
        dimensions=cached_dimensions,
        filters=cached_filters,
        limit=cached_limit,
    )
    new_q = SemanticQuery(
        metrics=metrics,
        dimensions=new_dimensions,
        filters=new_filters,
        limit=new_limit,
        order=new_order,
        group_limit=new_group_limit,
    )
    return entry_from(cached_q), new_q


@pytest.mark.parametrize(
    "metric,operator",
    [
        (M_SUM, "sum"),
        (M_COUNT, "sum"),
        (M_MIN, "min"),
        (M_MAX, "max"),
    ],
)
def test_can_satisfy_projection_each_additive_op(metric: Metric, operator: str) -> None:
    """SUM, COUNT, MIN and MAX metrics allow rolling up [a, b] to [a].

    ``can_satisfy`` is expected to accept, flag a projection and return no
    leftovers.
    """
    # NOTE(review): `operator` is parametrized but never used in this body, so
    # the listed roll-up operation is not actually asserted here.
    entry, new_q = _projection_query(
        metrics=[metric],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
    )
    ok, leftovers, projection = can_satisfy(entry, new_q)
    assert ok is True
    assert projection is True
    assert leftovers == set()


def test_projection_rolls_up_sum() -> None:
    """Projecting away ``b`` sums ``sum_x`` per ``a``: x -> 30, y -> 70."""
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
    )
    cached_df = pd.DataFrame(
        {"a": ["x", "x", "y", "y"], "b": [1, 2, 1, 2], "sum_x": [10, 20, 30, 40]}
    )
    cached = SemanticResult(
        requests=[SemanticRequest(type="SQL", definition="select ...")],
        results=pa.Table.from_pandas(cached_df, preserve_index=False),
    )
    out = _apply_post_processing(cached, new_q, set(), True)
    # Sort by "a" so the assertions do not depend on output row order.
    out_df = out.results.to_pandas().sort_values("a").reset_index(drop=True)
    assert list(out_df["a"]) == ["x", "y"]
    assert list(out_df["sum_x"]) == [30, 70]


def test_projection_rolls_up_min_max_count() -> None:
    """MIN/MAX roll up with min/max; COUNT rolls up by summing the counts."""
    entry, new_q = _projection_query(
        metrics=[M_MIN, M_MAX, M_COUNT],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
    )
    cached_df = pd.DataFrame(
        {
            "a": ["x", "x", "y", "y"],
            "b": [1, 2, 1, 2],
            "min_x": [5, 2, 9, 8],
            "max_x": [50, 60, 70, 80],
            "count_x": [1, 1, 2, 3],
        }
    )
    cached = SemanticResult(
        requests=[],
        results=pa.Table.from_pandas(cached_df, preserve_index=False),
    )
    out = _apply_post_processing(cached, new_q, set(), True)
    df = out.results.to_pandas().sort_values("a").reset_index(drop=True)
    assert list(df["min_x"]) == [2, 8]
    assert list(df["max_x"]) == [60, 80]
    assert list(df["count_x"]) == [2, 5]


def test_projection_drops_multiple_dims() -> None:
    """Projecting [a, b, c] down to [a] sums across both dropped dimensions."""
    col_c = dim("col.c", "c")
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B, col_c],
    )
    cached_df = pd.DataFrame(
        {
            "a": ["x", "x", "x", "y"],
            "b": [1, 1, 2, 1],
            "c": [10, 20, 10, 10],
            "sum_x": [1, 2, 3, 4],
        }
    )
    cached = SemanticResult(
        requests=[], results=pa.Table.from_pandas(cached_df, preserve_index=False)
    )
    out = _apply_post_processing(cached, new_q, set(), True)
    df = out.results.to_pandas().sort_values("a").reset_index(drop=True)
    assert list(df["sum_x"]) == [6, 4]


def test_projection_with_leftover_filter_then_rollup() -> None:
    """A leftover filter on a dropped dimension is applied before the roll-up."""
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        new_filters={where(COL_B, Operator.GREATER_THAN, 1)},
    )
    cached_df = pd.DataFrame(
        {"a": ["x", "x", "y"], "b": [1, 2, 2], "sum_x": [10, 20, 30]}
    )
    cached = SemanticResult(
        requests=[], results=pa.Table.from_pandas(cached_df, preserve_index=False)
    )
    ok, leftovers, projection = can_satisfy(entry, new_q)
    assert ok is True
    assert projection is True
    # Feed can_satisfy's own leftovers/projection into the post-processing step.
    out = _apply_post_processing(cached, new_q, leftovers, projection)
    df = out.results.to_pandas().sort_values("a").reset_index(drop=True)
    # b > 1 removes the (x,1) row; x sums to 20, y to 30
    assert list(df["sum_x"]) == [20, 30]


def test_projection_with_order_and_limit() -> None:
    """Order and limit apply to the rolled-up rows, not the cached rows.

    After the roll-up x sums to 3 and y to 100, so descending order with
    ``limit=1`` keeps only y.
    """
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        new_order=[(M_SUM, OrderDirection.DESC)],
        new_limit=1,
    )
    cached_df = pd.DataFrame(
        {"a": ["x", "x", "y"], "b": [1, 2, 1], "sum_x": [1, 2, 100]}
    )
    cached = SemanticResult(
        requests=[], results=pa.Table.from_pandas(cached_df, preserve_index=False)
    )
    out = _apply_post_processing(cached, new_q, set(), True)
    df = out.results.to_pandas()
    assert len(df) == 1
    assert df["a"].tolist() == ["y"]
    assert df["sum_x"].tolist() == [100]


def test_apply_post_processing_sorts_before_limit_for_non_projection() -> None:
    """Without projection, rows are sorted before the limit is applied.

    The cached rows are stored as 1.0, 100.0, 50.0; descending order with
    ``limit=2`` must give ``[100.0, 50.0]``, not the first two stored rows.
    """
    cached_df = pd.DataFrame({"a": ["x", "y", "z"], "x": [1.0, 100.0, 50.0]})
    cached = SemanticResult(
        requests=[],
        results=pa.Table.from_pandas(cached_df, preserve_index=False),
    )
    new_q = SemanticQuery(
        metrics=[M_X],
        dimensions=[COL_A],
        order=[(M_X, OrderDirection.DESC)],
        limit=2,
    )
    out = _apply_post_processing(cached, new_q, set(), False)
    df = out.results.to_pandas()
    assert df["x"].tolist() == [100.0, 50.0]


def test_projection_rejected_when_metric_aggregation_unknown() -> None:
    """A metric with ``aggregation=None`` cannot be rolled up."""
    entry, new_q = _projection_query(
        metrics=[M_UNKNOWN],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_for_avg() -> None:
    """An AVG metric cannot be rolled up."""
    entry, new_q = _projection_query(
        metrics=[M_AVG],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_cached_has_limit() -> None:
    """A cached entry with a limit cannot be used for projection."""
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        cached_limit=10,
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_cached_has_having() -> None:
    """Projection is rejected with a HAVING filter, even an identical one."""
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        cached_filters={having(M_SUM, Operator.GREATER_THAN, 10)},
        new_filters={having(M_SUM, Operator.GREATER_THAN, 10)},
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_new_query_has_group_limit() -> None:
    """Projection is rejected when the new query carries a ``GroupLimit``."""
    group_limit = GroupLimit(
        dimensions=[COL_A],
        top=2,
        metric=M_SUM,
        direction=OrderDirection.DESC,
    )
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        new_group_limit=group_limit,
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_order_references_dropped_dim() -> None:
    """Projection is rejected when the new order uses a dropped dimension."""
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        new_order=[(COL_B, OrderDirection.ASC)],
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_cached_has_filter_on_dropped_dim() -> None:
    """A cached filter on a dropped dimension blocks the roll-up."""
    # cached restricts b (the dimension being dropped); rolling up to [a]
    # would miss rows we'd need
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A],
        cached_dimensions=[COL_A, COL_B],
        cached_filters={where(COL_B, Operator.GREATER_THAN, 5)},
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False


def test_projection_rejected_when_cached_dims_subset_not_superset() -> None:
    """A coarser cached entry cannot serve a query with more dimensions."""
    # cached has just [a]; new wants [a, b] — finer-grained data unavailable
    entry, new_q = _projection_query(
        metrics=[M_SUM],
        new_dimensions=[COL_A, COL_B],
        cached_dimensions=[COL_A],
    )
    ok, _, _ = can_satisfy(entry, new_q)
    assert ok is False