Files
superset2/tests/unit_tests/common/test_query_context_processor.py

1382 lines
50 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import Any
from unittest.mock import MagicMock, patch
import numpy as np
import pandas as pd
import pytest
from superset.common.chart_data import ChartDataResultFormat, ChartDataResultType
from superset.common.db_query_status import QueryStatus
from superset.common.query_context_processor import QueryContextProcessor
from superset.utils.core import GenericDataType
@pytest.fixture
def mock_query_context():
with patch(
"superset.common.query_context_processor.QueryContextProcessor"
) as mock_query_context_processor:
yield mock_query_context_processor
@pytest.fixture
def processor(mock_query_context):
from superset.models.helpers import ExploreMixin
mock_query_context.datasource.data = MagicMock()
mock_query_context.datasource.data.get.return_value = {
"col1": "Column 1",
"col2": "Column 2",
}
# Create a processor instance
processor = QueryContextProcessor(mock_query_context)
# Setup datasource methods from ExploreMixin to be real methods
# by binding them to the mock datasource
processor._qc_datasource.is_valid_date_range = (
ExploreMixin.is_valid_date_range.__get__(processor._qc_datasource)
)
processor._qc_datasource.is_valid_date = ExploreMixin.is_valid_date.__get__(
processor._qc_datasource
)
processor._qc_datasource.get_offset_custom_or_inherit = (
ExploreMixin.get_offset_custom_or_inherit.__get__(processor._qc_datasource)
)
processor._qc_datasource._get_temporal_column_for_filter = (
ExploreMixin._get_temporal_column_for_filter.__get__(processor._qc_datasource)
)
processor._qc_datasource.join_offset_dfs = ExploreMixin.join_offset_dfs.__get__(
processor._qc_datasource
)
processor._qc_datasource._determine_join_keys = (
ExploreMixin._determine_join_keys.__get__(processor._qc_datasource)
)
processor._qc_datasource._process_date_range_offset = (
ExploreMixin._process_date_range_offset.__get__(processor._qc_datasource)
)
processor._qc_datasource._perform_join = ExploreMixin._perform_join.__get__(
processor._qc_datasource
)
processor._qc_datasource._apply_cleanup_logic = (
ExploreMixin._apply_cleanup_logic.__get__(processor._qc_datasource)
)
processor._qc_datasource.add_offset_join_column = (
ExploreMixin.add_offset_join_column.__get__(processor._qc_datasource)
)
return processor
def test_get_data_table_like(processor, mock_query_context):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
result = processor.get_data(df, coltypes)
expected = [
{"col1": 1, "col2": "a"},
{"col1": 2, "col2": "b"},
{"col1": 3, "col2": "c"},
]
assert result == expected
@patch("superset.common.query_context_processor.csv.df_to_escaped_csv")
def test_get_data_csv(mock_df_to_escaped_csv, processor, mock_query_context):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.CSV
mock_df_to_escaped_csv.return_value = "col1,col2\n1,a\n2,b\n3,c\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n1,a\n2,b\n3,c\n"
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")
@patch("superset.common.query_context_processor.excel.apply_column_types")
def test_get_data_xlsx(
mock_apply_column_types, mock_df_to_excel, processor, mock_query_context
):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.XLSX
mock_df_to_excel.return_value = b"binary data"
result = processor.get_data(df, coltypes)
assert result == b"binary data"
mock_apply_column_types.assert_called_once_with(df, coltypes)
mock_df_to_excel.assert_called_once_with(df)
def test_get_data_json(processor, mock_query_context):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
result = processor.get_data(df, coltypes)
expected = [
{"col1": 1, "col2": "a"},
{"col1": 2, "col2": "b"},
{"col1": 3, "col2": "c"},
]
assert result == expected
def test_get_data_invalid_dataframe(processor, mock_query_context):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
with patch.object(df, "to_dict", side_effect=ValueError("Invalid DataFrame")):
with pytest.raises(ValueError, match="Invalid DataFrame"):
processor.get_data(df, coltypes)
def test_get_data_non_unique_columns(processor, mock_query_context):
data = [[1, "a"], [2, "b"], [3, "c"]]
df = pd.DataFrame(data, columns=["col1", "col1"])
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
with pytest.warns(
UserWarning,
match="DataFrame columns are not unique, some columns will be omitted",
):
processor.get_data(df, coltypes)
def test_get_data_empty_dataframe_json(processor, mock_query_context):
df = pd.DataFrame(columns=["col1", "col2"])
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
result = processor.get_data(df, coltypes)
assert result == []
@patch("superset.common.query_context_processor.csv.df_to_escaped_csv")
def test_get_data_empty_dataframe_csv(
mock_df_to_escaped_csv, processor, mock_query_context
):
df = pd.DataFrame(columns=["col1", "col2"])
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.CSV
mock_df_to_escaped_csv.return_value = "col1,col2\n"
result = processor.get_data(df, coltypes)
assert result == "col1,col2\n"
mock_df_to_escaped_csv.assert_called_once_with(
df, index=False, encoding="utf-8-sig"
)
@patch("superset.common.query_context_processor.excel.df_to_excel")
@patch("superset.common.query_context_processor.excel.apply_column_types")
def test_get_data_empty_dataframe_xlsx(
mock_apply_column_types, mock_df_to_excel, processor, mock_query_context
):
df = pd.DataFrame(columns=["col1", "col2"])
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.XLSX
mock_df_to_excel.return_value = b"binary data empty"
result = processor.get_data(df, coltypes)
assert result == b"binary data empty"
mock_apply_column_types.assert_called_once_with(df, coltypes)
mock_df_to_excel.assert_called_once_with(df)
def test_get_data_nan_values_json(processor, mock_query_context):
df = pd.DataFrame({"col1": [1, np.nan, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
result = processor.get_data(df, coltypes)
assert result[0]["col1"] == 1
assert pd.isna(result[1]["col1"])
assert result[2]["col1"] == 3
def test_get_data_invalid_input(processor, mock_query_context):
df = "not a dataframe"
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
with pytest.raises(AttributeError):
processor.get_data(df, coltypes)
def test_get_data_default_format_when_result_format_is_none(
processor, mock_query_context
):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = None
result = processor.get_data(df, coltypes)
expected = [
{"col1": 1, "col2": "a"},
{"col1": 2, "col2": "b"},
{"col1": 3, "col2": "c"},
]
assert result == expected
def fake_apply_column_types(df, coltypes):
if len(coltypes) != len(df.columns):
raise ValueError("Mismatch between column types and dataframe columns")
return df
@patch("superset.common.query_context_processor.excel.df_to_excel")
@patch(
"superset.common.query_context_processor.excel.apply_column_types",
side_effect=fake_apply_column_types,
)
def test_get_data_invalid_coltypes_length_xlsx(
mock_apply_column_types, mock_df_to_excel, processor, mock_query_context
):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC] # Mismatched length
mock_query_context.result_format = ChartDataResultFormat.XLSX
with pytest.raises(
ValueError, match="Mismatch between column types and dataframe columns"
):
processor.get_data(df, coltypes)
def test_get_data_does_not_mutate_dataframe(processor, mock_query_context):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
original_df = df.copy(deep=True)
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.JSON
_ = processor.get_data(df, coltypes)
pd.testing.assert_frame_equal(df, original_df)
@patch(
"superset.common.query_context_processor.excel.apply_column_types",
side_effect=ValueError("Conversion error"),
)
def test_get_data_xlsx_apply_column_types_error(
mock_apply_column_types, processor, mock_query_context
):
df = pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
coltypes = [GenericDataType.NUMERIC, GenericDataType.STRING]
mock_query_context.result_format = ChartDataResultFormat.XLSX
with pytest.raises(ValueError, match="Conversion error"):
processor.get_data(df, coltypes)
def test_is_valid_date_range_format(processor):
"""Test that date range format validation works correctly."""
# Should return True for valid date range format
assert (
processor._qc_datasource.is_valid_date_range("2023-01-01 : 2023-01-31") is True
)
assert (
processor._qc_datasource.is_valid_date_range("2020-12-25 : 2020-12-31") is True
)
# Should return False for invalid format
assert processor._qc_datasource.is_valid_date_range("1 day ago") is False
assert processor._qc_datasource.is_valid_date_range("2023-01-01") is False
assert processor._qc_datasource.is_valid_date_range("invalid") is False
def test_is_valid_date_range_static_format():
"""Test that static date range format validation works correctly."""
from superset.models.helpers import ExploreMixin
# Should return True for valid date range format
assert ExploreMixin.is_valid_date_range_static("2023-01-01 : 2023-01-31") is True
assert ExploreMixin.is_valid_date_range_static("2020-12-25 : 2020-12-31") is True
# Should return False for invalid format
assert ExploreMixin.is_valid_date_range_static("1 day ago") is False
assert ExploreMixin.is_valid_date_range_static("2023-01-01") is False
assert ExploreMixin.is_valid_date_range_static("invalid") is False
def test_processing_time_offsets_date_range_logic(processor):
"""Test that date range timeshift logic works correctly with feature flag checks."""
from superset.models.helpers import ExploreMixin
# Test that the date range validation works
assert (
processor._qc_datasource.is_valid_date_range("2023-01-01 : 2023-01-31") is True
)
assert processor._qc_datasource.is_valid_date_range("1 year ago") is False
# Test that static method also works
assert ExploreMixin.is_valid_date_range_static("2023-01-01 : 2023-01-31") is True
assert ExploreMixin.is_valid_date_range_static("1 year ago") is False
def test_feature_flag_validation_logic():
"""Test that feature flag validation logic works as expected."""
from superset.extensions import feature_flag_manager
# This tests the concept - actual feature flag value depends on config
# The important thing is that the code checks for DATE_RANGE_TIMESHIFTS_ENABLED
flag_name = "DATE_RANGE_TIMESHIFTS_ENABLED"
# Test that the feature flag is being checked
# (This will vary based on actual config but tests the mechanism)
result = feature_flag_manager.is_feature_enabled(flag_name)
assert isinstance(result, bool) # Should return a boolean
def test_join_offset_dfs_date_range_basic(processor):
"""Test basic join logic for date range offsets."""
# Create simple test data
main_df = pd.DataFrame({"dim1": ["A", "B", "C"], "metric1": [10, 20, 30]})
offset_df = pd.DataFrame({"dim1": ["A", "B", "C"], "metric1": [5, 10, 15]})
# Mock query context
mock_query = MagicMock()
mock_query.granularity = "date_col"
processor._query_context.queries = [mock_query]
# Test basic join with date range offset
offset_dfs = {"2023-01-01 : 2023-01-31": offset_df}
join_keys = ["dim1"]
with patch("superset.models.helpers.feature_flag_manager") as mock_ff:
mock_ff.is_feature_enabled.return_value = True
with patch("superset.common.utils.dataframe_utils.left_join_df") as mock_join:
mock_join.return_value = pd.DataFrame(
{
"dim1": ["A", "B", "C"],
"metric1": [10, 20, 30],
"metric1 2023-01-01 : 2023-01-31": [5, 10, 15],
}
)
result_df = processor._qc_datasource.join_offset_dfs(
main_df, offset_dfs, time_grain=None, join_keys=join_keys
)
# Verify join was called
mock_join.assert_called_once()
assert len(result_df) == 3
def test_get_offset_custom_or_inherit_with_inherit(processor):
"""Test get_offset_custom_or_inherit with 'inherit' option."""
from_dttm = pd.Timestamp("2024-01-01")
to_dttm = pd.Timestamp("2024-01-10")
result = processor._qc_datasource.get_offset_custom_or_inherit(
"inherit", from_dttm, to_dttm
)
# Should return the difference in days
assert result == "9 days ago"
def test_get_offset_custom_or_inherit_with_date(processor):
"""Test get_offset_custom_or_inherit with specific date."""
from_dttm = pd.Timestamp("2024-01-10")
to_dttm = pd.Timestamp("2024-01-20")
result = processor._qc_datasource.get_offset_custom_or_inherit(
"2024-01-05", from_dttm, to_dttm
)
# Should return difference between from_dttm and the specified date
assert result == "5 days ago"
def test_get_offset_custom_or_inherit_with_invalid_date(processor):
"""Test get_offset_custom_or_inherit with invalid date format."""
from_dttm = pd.Timestamp("2024-01-10")
to_dttm = pd.Timestamp("2024-01-20")
result = processor._qc_datasource.get_offset_custom_or_inherit(
"invalid-date", from_dttm, to_dttm
)
# Should return empty string for invalid format
assert result == ""
def test_get_temporal_column_for_filter_with_granularity(processor):
"""Test _get_temporal_column_for_filter returns granularity when available."""
query_object = MagicMock()
query_object.granularity = "date_column"
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, "x_axis_col"
)
assert result == "date_column"
def test_get_temporal_column_for_filter_with_x_axis_fallback(processor):
"""Test _get_temporal_column_for_filter falls back to x_axis_label."""
query_object = MagicMock()
query_object.granularity = None
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, "x_axis_col"
)
assert result == "x_axis_col"
def test_get_temporal_column_for_filter_with_datasource_columns(processor):
"""Test _get_temporal_column_for_filter
returns None when no clear temporal column."""
query_object = MagicMock()
query_object.granularity = None
query_object.filter = []
mock_datetime_col = MagicMock()
mock_datetime_col.is_dttm = True
mock_datetime_col.column_name = "created_at"
mock_regular_col = MagicMock()
mock_regular_col.is_dttm = False
mock_regular_col.column_name = "name"
processor._qc_datasource.columns = [mock_regular_col, mock_datetime_col]
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, None
)
assert result is None
def test_get_temporal_column_for_filter_prefers_granularity(processor):
"""Test _get_temporal_column_for_filter uses granularity when available."""
query_object = MagicMock()
query_object.granularity = "timestamp_col"
query_object.filter = []
mock_datetime_col = MagicMock()
mock_datetime_col.is_dttm = True
mock_datetime_col.name = "other_col"
processor._qc_datasource.columns = [mock_datetime_col]
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, None
)
assert result == "timestamp_col"
def test_get_temporal_column_for_filter_no_columns_found(processor):
"""Test _get_temporal_column_for_filter
returns None when no temporal column found."""
query_object = MagicMock()
query_object.granularity = None
# Mock datasource with no datetime columns
mock_regular_col = MagicMock()
mock_regular_col.is_dttm = False
mock_regular_col.column_name = "name"
processor._qc_datasource.columns = [mock_regular_col]
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, None
)
assert result is None
def test_get_temporal_column_for_filter_no_datasource_columns(processor):
"""Test _get_temporal_column_for_filter handles datasource
without columns attribute."""
query_object = MagicMock()
query_object.granularity = None
# Remove columns attribute from datasource
if hasattr(processor._qc_datasource, "columns"):
delattr(processor._qc_datasource, "columns")
result = processor._qc_datasource._get_temporal_column_for_filter(
query_object, None
)
assert result is None
def test_processing_time_offsets_temporal_column_error(processor):
"""Test processing_time_offsets raises QueryObjectValidationError
when temporal column can't be determined."""
from superset.common.query_object import QueryObject
from superset.exceptions import QueryObjectValidationError
# Create a dataframe for testing
df = pd.DataFrame({"dim1": ["A", "B", "C"], "metric1": [10, 20, 30]})
# Create query object with date range offset and proper time range
query_object = QueryObject(
datasource=MagicMock(),
granularity=None, # No granularity set
columns=[],
is_timeseries=True,
time_offsets=["2023-01-01 : 2023-01-31"],
filter=[
{
"col": "some_date_col",
"op": "TEMPORAL_RANGE",
"val": "2024-01-01 : 2024-01-31",
}
],
)
# Mock get_since_until_from_query_object to return valid dates
with patch(
"superset.common.utils.time_range_utils.get_since_until_from_query_object"
) as mock_dates:
mock_dates.return_value = (
pd.Timestamp("2024-01-01"),
pd.Timestamp("2024-01-31"),
)
# Mock feature flag to be enabled
with patch("superset.models.helpers.feature_flag_manager") as mock_ff:
mock_ff.is_feature_enabled.return_value = True
# Mock _get_temporal_column_for_filter to return None
# (no temporal column found)
with patch.object(
processor._qc_datasource,
"_get_temporal_column_for_filter",
return_value=None,
):
# Mock the datasource's processing_time_offsets to raise the error
def raise_error(*args, **kwargs):
raise QueryObjectValidationError(
"Unable to identify temporal column for date "
"range time comparison."
)
with patch.object(
processor._qc_datasource,
"processing_time_offsets",
side_effect=raise_error,
):
with pytest.raises(
QueryObjectValidationError,
match="Unable to identify temporal column",
):
processor._qc_datasource.processing_time_offsets(
df, query_object, None, None, False
)
def test_processing_time_offsets_date_range_enabled(processor):
"""Test processing_time_offsets correctly handles
date range offsets when enabled."""
from superset.common.query_object import QueryObject
# Create a dataframe for testing
df = pd.DataFrame(
{
"dim1": ["A", "B", "C"],
"metric1": [10, 20, 30],
"__timestamp": pd.date_range("2023-01-01", periods=3, freq="D"),
}
)
# Create a properly mocked datasource
mock_datasource = MagicMock()
mock_datasource.id = 123
mock_datasource.uid = "abc123"
mock_datasource.cache_timeout = None
mock_datasource.changed_on = pd.Timestamp("2023-01-01")
mock_datasource.get_extra_cache_keys.return_value = {}
# Create query object with date range offset
query_object = QueryObject(
datasource=mock_datasource,
granularity="date_col",
columns=[],
is_timeseries=True,
time_offsets=["2022-01-01 : 2022-01-31"],
filter=[],
)
# Mock the query context and its methods
processor._query_context.queries = [query_object]
with patch("superset.models.helpers.feature_flag_manager") as mock_ff:
mock_ff.is_feature_enabled.return_value = True
with patch(
"superset.utils.core.get_base_axis_labels",
return_value=["__timestamp"],
):
with patch(
"superset.common.utils.time_range_utils.get_since_until_from_query_object"
) as mock_dates:
mock_dates.return_value = (
pd.Timestamp("2023-01-01"),
pd.Timestamp("2023-01-03"),
)
with patch(
"superset.common.utils.time_range_utils.get_since_until_from_time_range"
) as mock_time_range:
mock_time_range.return_value = (
pd.Timestamp("2022-01-01"),
pd.Timestamp("2022-01-31"),
)
with patch.object(
processor, "get_query_result"
) as mock_query_result:
mock_result = MagicMock()
mock_result.df = pd.DataFrame(
{
"dim1": ["A", "B"],
"metric1": [5, 10],
"__timestamp": pd.date_range(
"2022-01-01", periods=2, freq="D"
),
}
)
mock_result.query = "SELECT * FROM table"
mock_result.cache_key = "offset_cache_key"
mock_query_result.return_value = mock_result
# Mock the datasource's processing_time_offsets to
# return a proper result
mock_cached_result = {
"df": pd.DataFrame(
{
"dim1": ["A", "B", "C"],
"metric1": [10, 20, 30],
"metric1 2022-01-01 : 2022-01-31": [5, 10, 15],
"__timestamp": pd.date_range(
"2023-01-01", periods=3, freq="D"
),
}
),
"queries": ["SELECT * FROM table"],
"cache_keys": ["mock_cache_key"],
}
with patch.object(
processor._qc_datasource,
"processing_time_offsets",
return_value=mock_cached_result,
):
# Test the method (call datasource method directly)
result = processor._qc_datasource.processing_time_offsets(
df, query_object, None, None, False
)
# Verify that the method completes successfully
assert "df" in result
assert "queries" in result
assert "cache_keys" in result
# Verify the result has the expected structure
assert isinstance(result["df"], pd.DataFrame)
assert isinstance(result["queries"], list)
assert isinstance(result["cache_keys"], list)
def test_ensure_totals_available_updates_cache_values():
"""
Test that ensure_totals_available() updates the query objects AND
cache_values to keep them in sync.
The issue was that ensure_totals_available() modified QueryObject instances
(e.g., setting row_limit=None on totals queries and adding contribution_totals
to post_processing), but cache_values still contained the original queries.
This caused cache key mismatches between worker execution and cache fetch.
"""
import pandas as pd
from superset.common.query_object import QueryObject
# Create a mock datasource
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource"
mock_datasource.database.db_engine_spec.engine = "postgresql"
mock_datasource.cache_timeout = None
mock_datasource.changed_on = None
# Create QueryObjects that would trigger ensure_totals_available logic
# Query 1: Main query with contribution post-processing (needs totals)
main_query = QueryObject(
datasource=mock_datasource,
columns=["brokerage"],
metrics=["Net Amount In", "Amount Out", "Amount In"],
row_limit=50000,
orderby=[["Net Amount In", False]],
post_processing=[
{
"operation": "contribution",
"options": {
"columns": ["Amount In", "Amount Out"],
"rename_columns": ["%Amount In", "%Amount Out"],
},
}
],
)
# Query 2: Totals query (no columns, has metrics, no post-processing)
totals_query = QueryObject(
datasource=mock_datasource,
columns=[], # No columns = totals query
metrics=["Net Amount In", "Amount Out", "Amount In"],
row_limit=50000,
post_processing=[], # No post-processing
)
# Create mock query context
mock_query_context = MagicMock()
mock_query_context.force = False
mock_query_context.datasource = mock_datasource
mock_query_context.queries = [main_query, totals_query]
mock_query_context.result_type = "full"
mock_query_context.cache_values = {
"datasource": {"type": "table", "id": 1},
"queries": [
# These are the original queries as they would be stored in cache_values
{
"columns": ["brokerage"],
"metrics": ["Net Amount In", "Amount Out", "Amount In"],
"row_limit": 50000,
"orderby": [("Net Amount In", False)],
"post_processing": [
{
"operation": "contribution",
"options": {
"columns": ["Amount In", "Amount Out"],
"rename_columns": ["%Amount In", "%Amount Out"],
},
}
],
},
{
"columns": [],
"metrics": ["Net Amount In", "Amount Out", "Amount In"],
"row_limit": 50000,
"post_processing": [],
},
],
"result_type": "full",
"result_format": "json",
}
# Create processor
processor = QueryContextProcessor(mock_query_context)
processor._qc_datasource = mock_datasource
# Mock the query execution result for totals query
mock_query_result = MagicMock()
mock_df = pd.DataFrame(
{
"Net Amount In": [20228060486.838825],
"Amount Out": [-20543489614.980007],
"Amount In": [40771550101.81883],
}
)
mock_query_result.df = mock_df
with patch.object(
mock_query_context, "get_query_result", return_value=mock_query_result
):
# Call ensure_totals_available
processor.ensure_totals_available()
# Now call get_payload which should update cache_values
with patch(
"superset.common.query_context_processor.get_query_results"
) as mock_get_query_results:
# Mock the query results
mock_query_results_response = [
{
"data": [{"brokerage": "Test", "Net Amount In": 100}],
"query": "SELECT ...",
}
]
mock_get_query_results.return_value = mock_query_results_response
# Mock cache manager to avoid actual caching
with patch(
"superset.common.query_context_processor.QueryCacheManager"
) as mock_cache_manager:
mock_cache = MagicMock()
mock_cache.is_loaded = True
mock_cache.df = pd.DataFrame(
{"brokerage": ["Test"], "Net Amount In": [100]}
)
mock_cache.query = "SELECT ..."
mock_cache.error_message = None
mock_cache.status = "success"
mock_cache_manager.get.return_value = mock_cache
# This should update cache_values to match the modified queries
processor.get_payload(cache_query_context=False)
# Verify that cache_values has been updated to reflect the modifications
updated_cache_queries = mock_query_context.cache_values["queries"]
# Check that totals query has row_limit=None (modified by ensure_totals_available)
assert updated_cache_queries[1]["row_limit"] is None, (
"Expected totals query to have row_limit=None after ensure_totals_available, "
f"but got: {updated_cache_queries[1]['row_limit']}"
)
# Check that the main query has contribution_totals in post_processing
assert (
"contribution_totals"
in updated_cache_queries[0]["post_processing"][0]["options"]
), "Expected main query post_processing to have contribution_totals added"
# Verify the contribution_totals match what we mocked
expected_totals = {
"Net Amount In": 20228060486.838825,
"Amount Out": -20543489614.980007,
"Amount In": 40771550101.81883,
}
assert (
updated_cache_queries[0]["post_processing"][0]["options"]["contribution_totals"]
== expected_totals
)
def test_get_df_payload_validates_before_cache_key_generation():
"""
Test that get_df_payload calls validate() before generating cache key.
"""
from superset.common.query_object import QueryObject
# Create a mock query context
mock_query_context = MagicMock()
mock_query_context.force = False
mock_query_context.result_type = "full"
# Create a mock datasource
mock_datasource = MagicMock()
mock_datasource.id = 123
mock_datasource.uid = "test_datasource"
mock_datasource.cache_timeout = None
mock_datasource.database.db_engine_spec.engine = "postgresql"
mock_datasource.database.extra = "{}"
mock_datasource.get_extra_cache_keys.return_value = []
mock_datasource.changed_on = None
# Create processor
processor = QueryContextProcessor(mock_query_context)
processor._qc_datasource = mock_datasource
# Create a query object with unsanitized where clause
query_obj = QueryObject(
datasource=mock_datasource,
columns=["col1"],
metrics=[],
extras={"where": "(\n col1 > 0\n)"}, # Unsanitized with newlines
)
# Track the order of calls
call_order = []
original_validate = query_obj.validate
def mock_validate(*args, **kwargs):
call_order.append("validate")
# Update extras to simulate sanitization
query_obj.extras["where"] = "(col1 > 0)" # Sanitized, compact format
return original_validate(*args, **kwargs)
original_cache_key = query_obj.cache_key
def mock_cache_key(*args, **kwargs):
call_order.append("cache_key")
# Verify that extras have been sanitized at this point
assert query_obj.extras["where"] == "(col1 > 0)", (
f"Expected sanitized clause in cache_key, got: {query_obj.extras['where']}"
)
return original_cache_key(*args, **kwargs)
with patch.object(query_obj, "validate", side_effect=mock_validate):
with patch.object(query_obj, "cache_key", side_effect=mock_cache_key):
with patch(
"superset.common.query_context_processor.QueryCacheManager"
) as mock_cache_manager:
mock_cache = MagicMock()
mock_cache.is_loaded = True
mock_cache.df = pd.DataFrame({"col1": [1, 2, 3]})
mock_cache.query = "SELECT * FROM table"
mock_cache.error_message = None
mock_cache.status = "success"
mock_cache_manager.get.return_value = mock_cache
# Call get_df_payload
processor.get_df_payload(query_obj, force_cached=False)
# Verify validate was called before cache_key
assert call_order == ["validate", "cache_key"], (
f"Expected validate to be called before cache_key, "
f"but got call order: {call_order}"
)
def test_cache_values_sync_after_ensure_totals_available():
"""
Test that cache_values is synchronized with QueryObject modifications
after ensure_totals_available() runs.
This is a focused regression test for the cache key mismatch issue.
It verifies that when ensure_totals_available() modifies QueryObject
instances, those changes are reflected in cache_values before the
QueryContext cache key is generated.
"""
import pandas as pd
from superset.common.query_object import QueryObject
# Create a mock datasource
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource_456"
mock_datasource.database.db_engine_spec.engine = "pinot"
mock_datasource.cache_timeout = None
mock_datasource.changed_on = None
# Create two queries: one totals query and one main query with contribution
totals_query = QueryObject(
datasource=mock_datasource,
columns=[],
metrics=["sales"],
row_limit=1000,
post_processing=[],
)
main_query = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
row_limit=1000,
post_processing=[{"operation": "contribution", "options": {}}],
)
# Create mock query context with initial cache_values
mock_query_context = MagicMock()
mock_query_context.force = False
mock_query_context.datasource = mock_datasource
mock_query_context.queries = [main_query, totals_query]
mock_query_context.result_type = "full"
mock_query_context.cache_values = {
"datasource": {"type": "table", "id": 20},
"queries": [
{
"columns": ["region"],
"metrics": ["sales"],
"row_limit": 1000,
"post_processing": [{"operation": "contribution", "options": {}}],
},
{
"columns": [],
"metrics": ["sales"],
"row_limit": 1000,
"post_processing": [],
},
],
"result_type": "full",
"result_format": "json",
}
# Create processor
processor = QueryContextProcessor(mock_query_context)
processor._qc_datasource = mock_datasource
# Mock query execution result (totals query execution)
mock_query_result = MagicMock()
mock_df = pd.DataFrame({"sales": [1000.0]})
mock_query_result.df = mock_df
# Patch methods to isolate the test
with patch.object(
mock_query_context, "get_query_result", return_value=mock_query_result
):
# Mock cache management to prevent actual caching
with patch(
"superset.common.query_context_processor.QueryCacheManager"
) as mock_cache_manager:
mock_cache = MagicMock()
mock_cache.is_loaded = True
mock_cache.df = pd.DataFrame({"region": ["North"], "sales": [100]})
mock_cache.query = "SELECT region, SUM(sales) FROM table GROUP BY region"
mock_cache.error_message = None
mock_cache.status = "success"
mock_cache_manager.get.return_value = mock_cache
# Mock the query results
with patch(
"superset.common.query_context_processor.get_query_results"
) as mock_get_query_results:
mock_query_results_response = [
{
"data": [{"region": "North", "sales": 100}],
"query": "SELECT region, SUM(sales) FROM table GROUP BY region",
}
]
mock_get_query_results.return_value = mock_query_results_response
# Call get_payload - this internally calls ensure_totals_available()
# and then should update cache_values
processor.get_payload(cache_query_context=False)
# Verify the fix: cache_values should now reflect the modifications
updated_cache_queries = mock_query_context.cache_values["queries"]
updated_totals_row_limit = updated_cache_queries[1]["row_limit"]
# Before the fix: row_limit would remain 1000 in cache_values
# After the fix: row_limit should be None (modified by
# ensure_totals_available)
assert updated_totals_row_limit is None, (
"Expected row_limit to be None after ensure_totals_available, "
f"but got: {updated_totals_row_limit}"
)
# Verify that contribution_totals was added to the main query
assert (
"contribution_totals"
in updated_cache_queries[0]["post_processing"][0]["options"]
)
# Verify that the main query row_limit is still 1000 (only totals query
# should be modified)
assert updated_cache_queries[0]["row_limit"] == 1000
def test_cache_key_excludes_contribution_totals():
"""
Test that cache_key() excludes contribution_totals from post_processing.
contribution_totals is computed at runtime by ensure_totals_available() and
varies per request. Including it in the cache key would cause mismatches
between workers that compute different totals for the same query.
"""
from superset.common.query_object import QueryObject
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource"
mock_datasource.database.extra = "{}"
mock_datasource.get_extra_cache_keys.return_value = []
# Create query with contribution post-processing that includes contribution_totals
query_with_totals = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales", "profit"],
post_processing=[
{
"operation": "contribution",
"options": {
"columns": ["sales", "profit"],
"rename_columns": ["%sales", "%profit"],
"contribution_totals": {"sales": 1000.0, "profit": 200.0},
},
}
],
)
# Create identical query without contribution_totals
query_without_totals = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales", "profit"],
post_processing=[
{
"operation": "contribution",
"options": {
"columns": ["sales", "profit"],
"rename_columns": ["%sales", "%profit"],
},
}
],
)
# Cache keys should be identical since contribution_totals is excluded
cache_key_with = query_with_totals.cache_key()
cache_key_without = query_without_totals.cache_key()
assert cache_key_with == cache_key_without, (
"Cache keys should match regardless of contribution_totals. "
f"With totals: {cache_key_with}, Without totals: {cache_key_without}"
)
def test_cache_key_preserves_other_post_processing_options():
"""
Test that cache_key() only excludes contribution_totals, not other options.
"""
from superset.common.query_object import QueryObject
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource"
mock_datasource.database.extra = "{}"
mock_datasource.get_extra_cache_keys.return_value = []
# Create query with contribution post-processing
query1 = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
post_processing=[
{
"operation": "contribution",
"options": {
"columns": ["sales"],
"rename_columns": ["%sales"],
"contribution_totals": {"sales": 1000.0},
},
}
],
)
# Create query with different rename_columns
query2 = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
post_processing=[
{
"operation": "contribution",
"options": {
"columns": ["sales"],
"rename_columns": ["%sales_pct"], # Different!
"contribution_totals": {"sales": 1000.0},
},
}
],
)
# Cache keys should differ because rename_columns is different
assert query1.cache_key() != query2.cache_key(), (
"Cache keys should differ when other post_processing options differ"
)
def test_cache_key_non_contribution_post_processing_unchanged():
"""
Test that non-contribution post_processing operations are unchanged in cache key.
"""
from superset.common.query_object import QueryObject
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource"
mock_datasource.database.extra = "{}"
mock_datasource.get_extra_cache_keys.return_value = []
# Create query with non-contribution post-processing
query1 = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
post_processing=[
{
"operation": "pivot",
"options": {"columns": ["region"], "aggregates": {"sales": "sum"}},
}
],
)
query2 = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
post_processing=[
{
"operation": "pivot",
"options": {"columns": ["region"], "aggregates": {"sales": "mean"}},
}
],
)
# Cache keys should differ because aggregates option is different
assert query1.cache_key() != query2.cache_key(), (
"Cache keys should differ for different non-contribution post_processing"
)
def test_force_cached_normalizes_totals_query_row_limit():
"""
When fetching from cache (force_cached=True), the totals query should still be
normalized so its cache key matches the cached entry, but the totals query should
not be executed.
"""
from superset.common.query_object import QueryObject
mock_datasource = MagicMock()
mock_datasource.uid = "test_datasource"
mock_datasource.column_names = ["region", "sales"]
mock_datasource.cache_timeout = None
mock_datasource.changed_on = None
mock_datasource.get_extra_cache_keys.return_value = []
mock_datasource.database.extra = "{}"
mock_datasource.database.impersonate_user = False
mock_datasource.database.db_engine_spec.get_impersonation_key.return_value = None
totals_query = QueryObject(
datasource=mock_datasource,
columns=[],
metrics=["sales"],
row_limit=1000,
)
main_query = QueryObject(
datasource=mock_datasource,
columns=["region"],
metrics=["sales"],
row_limit=1000,
post_processing=[{"operation": "contribution", "options": {}}],
)
totals_query.validate = MagicMock()
main_query.validate = MagicMock()
captured_limits: list[int | None] = []
def totals_cache_key(**kwargs: Any) -> str:
captured_limits.append(totals_query.row_limit)
return "totals-cache-key"
totals_query.cache_key = totals_cache_key
main_query.cache_key = lambda **kwargs: "main-cache-key"
mock_query_context = MagicMock()
mock_query_context.force = False
mock_query_context.datasource = mock_datasource
mock_query_context.queries = [main_query, totals_query]
mock_query_context.result_type = ChartDataResultType.FULL
mock_query_context.result_format = ChartDataResultFormat.JSON
mock_query_context.cache_values = {
"queries": [main_query.to_dict(), totals_query.to_dict()]
}
mock_query_context.get_query_result = MagicMock()
processor = QueryContextProcessor(mock_query_context)
processor._qc_datasource = mock_datasource
mock_query_context.get_df_payload = processor.get_df_payload
mock_query_context.get_data = processor.get_data
with patch(
"superset.common.query_context_processor.security_manager"
) as mock_security_manager:
mock_security_manager.get_rls_cache_key.return_value = None
with patch(
"superset.common.query_context_processor.QueryCacheManager"
) as mock_cache_manager:
def cache_get(*args: Any, **kwargs: Any) -> Any:
df = pd.DataFrame({"region": ["North"], "sales": [100]})
cache = MagicMock()
cache.is_loaded = True
cache.df = df
cache.query = "SELECT 1"
cache.error_message = None
cache.status = QueryStatus.SUCCESS
cache.applied_template_filters = []
cache.applied_filter_columns = []
cache.rejected_filter_columns = []
cache.annotation_data = {}
cache.is_cached = True
cache.sql_rowcount = len(df)
cache.cache_dttm = "2024-01-01T00:00:00"
return cache
mock_cache_manager.get.side_effect = cache_get
processor.get_payload(cache_query_context=False, force_cached=True)
assert captured_limits == [None], "Totals query should be normalized before caching"
mock_query_context.get_query_result.assert_not_called()
def test_get_df_payload_invalidates_cache_missing_applied_filter_columns():
"""
Test that get_df_payload invalidates cache when cache is loaded but missing
applied_filter_columns and query has filters.
This ensures that old cache entries without applied_filter_columns are
invalidated and fresh queries are executed to populate the field correctly.
"""
from superset.common.query_object import QueryObject
# Minimal setup
mock_query_context = MagicMock()
mock_query_context.force = False
mock_datasource = MagicMock()
mock_datasource.column_names = ["col1"]
processor = QueryContextProcessor(mock_query_context)
processor._qc_datasource = mock_datasource
# Create query object with filters (note: `filters` kwarg, not `filter`)
query_obj = QueryObject(
datasource=mock_datasource,
columns=["col1"],
filters=[{"col": "col1", "op": "IN", "val": ["value1"]}],
)
# Simple cache class that tracks is_loaded changes
class MockCache:
def __init__(self):
self.is_loaded = True
self.applied_filter_columns = [] # Empty = missing
self.df = pd.DataFrame()
self.query = ""
self.status = "success"
self.cache_dttm = "2024-01-01T00:00:00"
self.queried_dttm = "2024-01-01T00:00:00"
self.stacktrace = None
self.error_message = None
self.is_cached = True
self.sql_rowcount = 0
self.cache_value = None
self.cache_timeout = 3600
self.datasource_uid = "test_datasource"
self.applied_template_filters = []
self.rejected_filter_columns = []
self.annotation_data = {}
self.set_query_result = MagicMock()
mock_cache = MockCache()
with patch(
"superset.common.query_context_processor.QueryCacheManager"
) as mock_cache_manager:
mock_cache_manager.get.return_value = mock_cache
# Prevent validate from doing any heavy work; it shouldn't modify filters
with patch.object(query_obj, "validate", return_value=None):
with patch.object(processor, "query_cache_key", return_value="key"):
with patch.object(processor, "get_cache_timeout", return_value=3600):
# Call get_df_payload - should invalidate cache
processor.get_df_payload(query_obj, force_cached=False)
# Verify cache was invalidated
assert mock_cache.is_loaded is False, (
"Cache should be inv when no applied_filter_columns and query has filters"
)