From 836dddafc623aa4efc65e82c43686a9f37dca4dc Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 29 Oct 2025 15:21:31 -0400 Subject: [PATCH] Add tests --- superset/semantic_layers/mapper.py | 58 ++- .../unit_tests/semantic_layers/test_mapper.py | 391 ++++++++++++++++++ 2 files changed, 414 insertions(+), 35 deletions(-) diff --git a/superset/semantic_layers/mapper.py b/superset/semantic_layers/mapper.py index 0fbf3f6ec9a..3cf0e724d5a 100644 --- a/superset/semantic_layers/mapper.py +++ b/superset/semantic_layers/mapper.py @@ -18,7 +18,6 @@ from datetime import datetime import numpy as np -import pandas as pd from superset.common.query_object import QueryObject from superset.common.utils.time_range_utils import get_since_until_from_query_object @@ -614,12 +613,6 @@ def get_results(query_object: QueryObject) -> SemanticResult: """ Run a query based on the `QueryObject` and return the results as a SemanticResult. - This function handles the complete flow: - 1. Converts QueryObject to SemanticQuery objects (one per time offset) - 2. Executes all queries via the semantic view - 3. Joins the results into a single DataFrame - 4. Collects all requests from each query for troubleshooting - :param query_object: The QueryObject containing query specifications :return: SemanticResult with combined DataFrame and all requests """ @@ -688,18 +681,13 @@ def get_results(query_object: QueryObject) -> SemanticResult: offset_df = result.results - # Handle empty results - create a DataFrame with NaN values - # This ensures the join doesn't fail and produces NULL values for missing data + # Handle empty results - add NaN columns directly instead of merging + # This avoids dtype mismatch issues with empty DataFrames if offset_df.empty: - offset_df = pd.DataFrame( - { - **{col: [np.nan] for col in join_keys}, - **{ - TIME_COMPARISON.join([metric, time_offset]): [np.nan] - for metric in metric_names - }, - } - ) + # Add offset metric columns with NaN values directly to main_df + for metric in metric_names: + offset_col_name = TIME_COMPARISON.join([metric, time_offset]) + main_df[offset_col_name] = np.nan else: # Rename metric columns with time offset suffix # Format: "{metric_name}__{time_offset}" @@ -711,22 +699,22 @@ def get_results(query_object: QueryObject) -> SemanticResult: } ) - # Step 5: Perform left join on dimension columns - # This preserves all rows from main_df and adds offset metrics where they match - main_df = main_df.merge( - offset_df, - on=join_keys, - how="left", - suffixes=("", "__duplicate"), - ) + # Step 5: Perform left join on dimension columns + # This preserves all rows from main_df and adds offset metrics + # where they match + main_df = main_df.merge( + offset_df, + on=join_keys, + how="left", + suffixes=("", "__duplicate"), + ) - # Clean up any duplicate columns that might have been created - # (shouldn't happen with proper join keys, but defensive programming) - duplicate_cols = [col for col in main_df.columns if col.endswith("__duplicate")] - if duplicate_cols: - main_df = main_df.drop(columns=duplicate_cols) + # Clean up any duplicate columns that might have been created + # (shouldn't happen with proper join keys, but defensive programming) + duplicate_cols = [ + col for col in main_df.columns if col.endswith("__duplicate") + ] + if duplicate_cols: + main_df = main_df.drop(columns=duplicate_cols) - return SemanticResult( - requests=all_requests, - results=main_df, - ) + return SemanticResult(requests=all_requests, results=main_df) diff --git a/tests/unit_tests/semantic_layers/test_mapper.py b/tests/unit_tests/semantic_layers/test_mapper.py index f9f8b0d17a3..26bd98ae6fd 100644 --- a/tests/unit_tests/semantic_layers/test_mapper.py +++ b/tests/unit_tests/semantic_layers/test_mapper.py @@ -18,6 +18,7 @@ from datetime import datetime from unittest.mock import Mock +import pandas as pd import pytest from superset.common.query_object import QueryObject @@ -31,6 +32,7 @@ from superset.semantic_layers.mapper import ( _get_order_from_query_object, _get_time_bounds, _get_time_filter, + get_results, map_query_object, validate_query_object, ) @@ -48,6 +50,8 @@ from superset.semantic_layers.types import ( OrderDirection, PredicateType, SemanticQuery, + SemanticRequest, + SemanticResult, SemanticViewFeature, STRING, TimeGrain, @@ -1168,3 +1172,390 @@ def test_convert_query_object_filter_like(): operator=Operator.LIKE, value="%test%", ) + + +def test_get_results_without_time_offsets(mock_datasource): + """ + Test get_results without time offsets returns main query result. + """ + # Create mock dataframe for main query + main_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [1000.0, 500.0, 750.0], + } + ) + + # Mock the semantic view's get_dataframe method + mock_result = SemanticResult( + requests=[ + SemanticRequest( + type="SQL", + definition="SELECT category, SUM(amount) FROM orders GROUP BY category", + ) + ], + results=main_df, + ) + + mock_datasource.implementation.get_dataframe = Mock(return_value=mock_result) + + # Create query object without time offsets + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["total_sales"], + columns=["category"], + granularity="order_date", + ) + + # Call get_results + result = get_results(query_object) + + # Verify result + assert isinstance(result, SemanticResult) + assert len(result.requests) == 1 + assert result.requests[0].type == "SQL" + + # Verify DataFrame matches main query result + pd.testing.assert_frame_equal(result.results, main_df) + + +def test_get_results_with_single_time_offset(mock_datasource): + """ + Test get_results with a single time offset joins correctly. + """ + # Create mock dataframes + main_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [1000.0, 500.0, 750.0], + } + ) + + offset_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [950.0, 480.0, 700.0], + } + ) + + # Mock the semantic view's get_dataframe method + # It will be called twice: once for main, once for offset + mock_main_result = SemanticResult( + requests=[ + SemanticRequest( + type="SQL", + definition=( + "SELECT category, SUM(amount) FROM orders " + "WHERE date >= '2025-10-15' GROUP BY category" + ), + ) + ], + results=main_df.copy(), + ) + + mock_offset_result = SemanticResult( + requests=[ + SemanticRequest( + type="SQL", + definition=( + "SELECT category, SUM(amount) FROM orders " + "WHERE date >= '2025-10-08' GROUP BY category" + ), + ) + ], + results=offset_df.copy(), + ) + + mock_datasource.implementation.get_dataframe = Mock( + side_effect=[mock_main_result, mock_offset_result] + ) + + # Create query object with time offset + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["total_sales"], + columns=["category"], + granularity="order_date", + time_offsets=["1 week ago"], + ) + + # Call get_results + result = get_results(query_object) + + # Verify result structure + assert isinstance(result, SemanticResult) + assert len(result.requests) == 2 # Main + offset query + + # Verify DataFrame has both main and offset metrics + expected_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [1000.0, 500.0, 750.0], + "total_sales__1 week ago": [950.0, 480.0, 700.0], + } + ) + + pd.testing.assert_frame_equal(result.results, expected_df) + + +def test_get_results_with_multiple_time_offsets(mock_datasource): + """ + Test get_results with multiple time offsets joins all correctly. + """ + # Create mock dataframes + main_df = pd.DataFrame( + { + "region": ["US", "UK", "JP"], + "order_count": [100, 50, 75], + } + ) + + offset_1w_df = pd.DataFrame( + { + "region": ["US", "UK", "JP"], + "order_count": [95, 48, 70], + } + ) + + offset_1m_df = pd.DataFrame( + { + "region": ["US", "UK", "JP"], + "order_count": [80, 40, 60], + } + ) + + # Mock results + mock_main_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], + results=main_df.copy(), + ) + + mock_offset_1w_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="OFFSET 1W QUERY")], + results=offset_1w_df.copy(), + ) + + mock_offset_1m_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="OFFSET 1M QUERY")], + results=offset_1m_df.copy(), + ) + + mock_datasource.implementation.get_dataframe = Mock( + side_effect=[mock_main_result, mock_offset_1w_result, mock_offset_1m_result] + ) + + # Create query object with multiple time offsets + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["order_count"], + columns=["region"], + granularity="order_date", + time_offsets=["1 week ago", "1 month ago"], + ) + + # Call get_results + result = get_results(query_object) + + # Verify result structure + assert isinstance(result, SemanticResult) + assert len(result.requests) == 3 # Main + 2 offset queries + + # Verify all requests are collected + assert result.requests[0].definition == "MAIN QUERY" + assert result.requests[1].definition == "OFFSET 1W QUERY" + assert result.requests[2].definition == "OFFSET 1M QUERY" + + # Verify DataFrame has all metrics + expected_df = pd.DataFrame( + { + "region": ["US", "UK", "JP"], + "order_count": [100, 50, 75], + "order_count__1 week ago": [95, 48, 70], + "order_count__1 month ago": [80, 40, 60], + } + ) + + pd.testing.assert_frame_equal(result.results, expected_df) + + +def test_get_results_with_empty_offset_result(mock_datasource): + """ + Test get_results handles empty offset results gracefully. + """ + # Create mock dataframes + main_df = pd.DataFrame( + { + "category": ["Electronics", "Books"], + "total_sales": [1000.0, 500.0], + } + ) + + # Empty offset result + offset_df = pd.DataFrame() + + # Mock results + mock_main_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], + results=main_df.copy(), + ) + + mock_offset_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], + results=offset_df, + ) + + mock_datasource.implementation.get_dataframe = Mock( + side_effect=[mock_main_result, mock_offset_result] + ) + + # Create query object with time offset + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["total_sales"], + columns=["category"], + granularity="order_date", + time_offsets=["1 week ago"], + ) + + # Call get_results + result = get_results(query_object) + + # Verify result structure + assert isinstance(result, SemanticResult) + assert len(result.requests) == 2 + + # Verify DataFrame has NaN for missing offset data + assert "total_sales__1 week ago" in result.results.columns + assert result.results["total_sales__1 week ago"].isna().all() + + +def test_get_results_with_partial_offset_match(mock_datasource): + """ + Test get_results with partial matches in offset data (left join behavior). + """ + # Main query has 3 categories + main_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [1000.0, 500.0, 750.0], + } + ) + + # Offset query only has 2 categories (Books missing) + offset_df = pd.DataFrame( + { + "category": ["Electronics", "Clothing"], + "total_sales": [950.0, 700.0], + } + ) + + # Mock results + mock_main_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], + results=main_df.copy(), + ) + + mock_offset_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], + results=offset_df.copy(), + ) + + mock_datasource.implementation.get_dataframe = Mock( + side_effect=[mock_main_result, mock_offset_result] + ) + + # Create query object + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["total_sales"], + columns=["category"], + granularity="order_date", + time_offsets=["1 week ago"], + ) + + # Call get_results + result = get_results(query_object) + + # Verify DataFrame structure + expected_df = pd.DataFrame( + { + "category": ["Electronics", "Books", "Clothing"], + "total_sales": [1000.0, 500.0, 750.0], + "total_sales__1 week ago": [950.0, None, 700.0], + } + ) + + pd.testing.assert_frame_equal(result.results, expected_df) + + +def test_get_results_with_multiple_dimensions(mock_datasource): + """ + Test get_results with multiple dimension columns in join. + """ + # Create mock dataframes with multiple dimensions + main_df = pd.DataFrame( + { + "category": ["Electronics", "Electronics", "Books"], + "region": ["US", "UK", "US"], + "total_sales": [1000.0, 800.0, 500.0], + } + ) + + offset_df = pd.DataFrame( + { + "category": ["Electronics", "Electronics", "Books"], + "region": ["US", "UK", "US"], + "total_sales": [950.0, 780.0, 480.0], + } + ) + + # Mock results + mock_main_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], + results=main_df.copy(), + ) + + mock_offset_result = SemanticResult( + requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], + results=offset_df.copy(), + ) + + mock_datasource.implementation.get_dataframe = Mock( + side_effect=[mock_main_result, mock_offset_result] + ) + + # Create query object with multiple dimensions + query_object = QueryObject( + datasource=mock_datasource, + from_dttm=datetime(2025, 10, 15), + to_dttm=datetime(2025, 10, 22), + metrics=["total_sales"], + columns=["category", "region"], + granularity="order_date", + time_offsets=["1 week ago"], + ) + + # Call get_results + result = get_results(query_object) + + # Verify DataFrame structure - join should be on both category and region + expected_df = pd.DataFrame( + { + "category": ["Electronics", "Electronics", "Books"], + "region": ["US", "UK", "US"], + "total_sales": [1000.0, 800.0, 500.0], + "total_sales__1 week ago": [950.0, 780.0, 480.0], + } + ) + + pd.testing.assert_frame_equal(result.results, expected_df)