From 755aa2e32f380de505fd68b192a28cc1c86f182c Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Mon, 2 Mar 2026 18:07:16 -0500 Subject: [PATCH] Address TODOs --- superset-core/pyproject.toml | 1 + .../semantic_layers/semantic_layer.py | 16 +++++---- .../semantic_layers/semantic_view.py | 16 +++++---- .../superset_core/semantic_layers/types.py | 5 ++- superset/semantic_layers/mapper.py | 18 +++++----- .../unit_tests/semantic_layers/mapper_test.py | 33 ++++++++++--------- 6 files changed, 49 insertions(+), 40 deletions(-) diff --git a/superset-core/pyproject.toml b/superset-core/pyproject.toml index 57dfb231ade..7027df9dde1 100644 --- a/superset-core/pyproject.toml +++ b/superset-core/pyproject.toml @@ -43,6 +43,7 @@ classifiers = [ ] dependencies = [ "flask-appbuilder>=5.0.2,<6", + "pyarrow>=16.0.0", "pydantic>=2.8.0", "sqlalchemy>=1.4.0,<2.0", "sqlalchemy-utils>=0.38.0, <0.43", # expanding lowerbound to work with pydoris diff --git a/superset-core/src/superset_core/semantic_layers/semantic_layer.py b/superset-core/src/superset_core/semantic_layers/semantic_layer.py index 615014f8c1b..1fc421cf359 100644 --- a/superset-core/src/superset_core/semantic_layers/semantic_layer.py +++ b/superset-core/src/superset_core/semantic_layers/semantic_layer.py @@ -17,23 +17,23 @@ from __future__ import annotations -from typing import Any, Protocol, runtime_checkable, TypeVar +from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar from pydantic import BaseModel from superset_core.semantic_layers.semantic_view import SemanticView -ConfigT = TypeVar("ConfigT", bound=BaseModel, contravariant=True) +ConfigT = TypeVar("ConfigT", bound=BaseModel) SemanticViewT = TypeVar("SemanticViewT", bound="SemanticView") -# TODO (betodealmeida): convert to ABC -@runtime_checkable -class SemanticLayer(Protocol[ConfigT, SemanticViewT]): +class SemanticLayer(ABC, Generic[ConfigT, SemanticViewT]): """ - A protocol for semantic layers. + Abstract base class for semantic layers. """ @classmethod + @abstractmethod def from_configuration( cls, configuration: dict[str, Any], @@ -43,6 +43,7 @@ class SemanticLayer(Protocol[ConfigT, SemanticViewT]): """ @classmethod + @abstractmethod def get_configuration_schema( cls, configuration: ConfigT | None = None, @@ -67,6 +68,7 @@ class SemanticLayer(Protocol[ConfigT, SemanticViewT]): """ @classmethod + @abstractmethod def get_runtime_schema( cls, configuration: ConfigT, @@ -93,6 +95,7 @@ class SemanticLayer(Protocol[ConfigT, SemanticViewT]): configuration. """ + @abstractmethod def get_semantic_views( self, runtime_configuration: dict[str, Any], @@ -104,6 +107,7 @@ class SemanticLayer(Protocol[ConfigT, SemanticViewT]): schema, used to restrict the semantic views returned. """ + @abstractmethod def get_semantic_view( self, name: str, diff --git a/superset-core/src/superset_core/semantic_layers/semantic_view.py b/superset-core/src/superset_core/semantic_layers/semantic_view.py index a67050f5b14..7f7c97fcc12 100644 --- a/superset-core/src/superset_core/semantic_layers/semantic_view.py +++ b/superset-core/src/superset_core/semantic_layers/semantic_view.py @@ -18,7 +18,7 @@ from __future__ import annotations import enum -from typing import Protocol, runtime_checkable +from abc import ABC, abstractmethod from superset_core.semantic_layers.types import ( Dimension, @@ -41,30 +41,32 @@ class SemanticViewFeature(enum.Enum): GROUP_OTHERS = "GROUP_OTHERS" -# TODO (betodealmeida): convert to ABC -@runtime_checkable -class SemanticView(Protocol): +class SemanticView(ABC): """ - A protocol for semantic views. + Abstract base class for semantic views. """ features: frozenset[SemanticViewFeature] + @abstractmethod def uid(self) -> str: """ Returns a unique identifier for the semantic view. """ + @abstractmethod def get_dimensions(self) -> set[Dimension]: """ Get the dimensions defined in the semantic view. """ + @abstractmethod def get_metrics(self) -> set[Metric]: """ Get the metrics defined in the semantic view. """ + @abstractmethod def get_values( self, dimension: Dimension, @@ -74,6 +76,7 @@ class SemanticView(Protocol): Return distinct values for a dimension. """ + @abstractmethod def get_dataframe( self, metrics: list[Metric], @@ -86,9 +89,10 @@ class SemanticView(Protocol): group_limit: GroupLimit | None = None, ) -> SemanticResult: """ - Execute a semantic query and return the results as a DataFrame. + Execute a semantic query and return the results. """ + @abstractmethod def get_row_count( self, metrics: list[Metric], diff --git a/superset-core/src/superset_core/semantic_layers/types.py b/superset-core/src/superset_core/semantic_layers/types.py index 353fd0355bb..d62de2c6cb8 100644 --- a/superset-core/src/superset_core/semantic_layers/types.py +++ b/superset-core/src/superset_core/semantic_layers/types.py @@ -23,7 +23,7 @@ from datetime import date, datetime, time, timedelta from functools import total_ordering from typing import Type as TypeOf -from pandas import DataFrame +import pyarrow as pa __all__ = [ "BINARY", @@ -309,8 +309,7 @@ class SemanticResult: """ requests: list[SemanticRequest] - # TODO (betodealmeida): convert to PyArrow Table - results: DataFrame + results: pa.Table @dataclass(frozen=True) diff --git a/superset/semantic_layers/mapper.py b/superset/semantic_layers/mapper.py index 441e48cf941..f137b8fa572 100644 --- a/superset/semantic_layers/mapper.py +++ b/superset/semantic_layers/mapper.py @@ -29,6 +29,7 @@ from time import time from typing import Any, cast, Sequence, TypeGuard import numpy as np +import pyarrow as pa from superset_core.semantic_layers.semantic_view import SemanticViewFeature from superset_core.semantic_layers.types import ( AdhocExpression, @@ -133,20 +134,16 @@ def get_results(query_object: QueryObject) -> QueryResult: group_limit=main_query.group_limit, ) - main_df = main_result.results + main_df = main_result.results.to_pandas() # Collect all requests (SQL queries, HTTP requests, etc.) for troubleshooting all_requests = list(main_result.requests) # If no time offsets, return the main result as-is if not query_object.time_offsets or len(queries) <= 1: - semantic_result = SemanticResult( - requests=all_requests, - results=main_df, - ) duration = timedelta(seconds=time() - start_time) return map_semantic_result_to_query_result( - semantic_result, + main_result, query_object, duration, ) @@ -179,7 +176,7 @@ def get_results(query_object: QueryObject) -> QueryResult: # Add this query's requests to the collection all_requests.extend(result.requests) - offset_df = result.results + offset_df = result.results.to_pandas() # Handle empty results - add NaN columns directly instead of merging # This avoids dtype mismatch issues with empty DataFrames @@ -218,7 +215,10 @@ def get_results(query_object: QueryObject) -> QueryResult: main_df = main_df.drop(columns=duplicate_cols) # Convert final result to QueryResult - semantic_result = SemanticResult(requests=all_requests, results=main_df) + semantic_result = SemanticResult( + requests=all_requests, + results=pa.Table.from_pandas(main_df), + ) duration = timedelta(seconds=time() - start_time) return map_semantic_result_to_query_result( semantic_result, @@ -250,7 +250,7 @@ def map_semantic_result_to_query_result( return QueryResult( # Core data - df=semantic_result.results, + df=semantic_result.results.to_pandas(), query=query_str, duration=duration, # Template filters - not applicable to semantic layers diff --git a/tests/unit_tests/semantic_layers/mapper_test.py b/tests/unit_tests/semantic_layers/mapper_test.py index 06cf6871ad5..ab6586d0f20 100644 --- a/tests/unit_tests/semantic_layers/mapper_test.py +++ b/tests/unit_tests/semantic_layers/mapper_test.py @@ -19,6 +19,7 @@ from datetime import datetime from unittest.mock import MagicMock import pandas as pd +import pyarrow as pa import pytest from pytest_mock import MockerFixture from superset_core.semantic_layers.semantic_view import SemanticViewFeature @@ -1229,7 +1230,7 @@ def test_get_results_without_time_offsets( definition="SELECT category, SUM(amount) FROM orders GROUP BY category", ) ], - results=main_df, + results=pa.Table.from_pandas(main_df), ) mock_datasource.implementation.get_dataframe = mocker.Mock(return_value=mock_result) @@ -1289,7 +1290,7 @@ def test_get_results_with_single_time_offset( ), ) ], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_result = SemanticResult( @@ -1302,7 +1303,7 @@ def test_get_results_with_single_time_offset( ), ) ], - results=offset_df.copy(), + results=pa.Table.from_pandas(offset_df.copy()), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1371,17 +1372,17 @@ def test_get_results_with_multiple_time_offsets( # Mock results mock_main_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_1w_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET 1W QUERY")], - results=offset_1w_df.copy(), + results=pa.Table.from_pandas(offset_1w_df.copy()), ) mock_offset_1m_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET 1M QUERY")], - results=offset_1m_df.copy(), + results=pa.Table.from_pandas(offset_1m_df.copy()), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1442,12 +1443,12 @@ def test_get_results_with_empty_offset_result( # Mock results mock_main_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], - results=offset_df, + results=pa.Table.from_pandas(offset_df), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1504,12 +1505,12 @@ def test_get_results_with_partial_offset_match( # Mock results mock_main_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], - results=offset_df.copy(), + results=pa.Table.from_pandas(offset_df.copy()), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1569,12 +1570,12 @@ def test_get_results_with_multiple_dimensions( # Mock results mock_main_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="MAIN QUERY")], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET QUERY")], - results=offset_df.copy(), + results=pa.Table.from_pandas(offset_df.copy()), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1648,12 +1649,12 @@ def test_get_results_with_duplicate_columns( mock_main_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="MAIN")], - results=main_df.copy(), + results=pa.Table.from_pandas(main_df.copy()), ) mock_offset_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="OFFSET")], - results=offset_df.copy(), + results=pa.Table.from_pandas(offset_df.copy()), ) mock_datasource.implementation.get_dataframe = mocker.Mock( @@ -1692,7 +1693,7 @@ def test_get_results_empty_requests( mock_result = SemanticResult( requests=[], # Empty requests - results=main_df, + results=pa.Table.from_pandas(main_df), ) mock_datasource.implementation.get_dataframe = mocker.Mock(return_value=mock_result) @@ -2328,7 +2329,7 @@ def test_get_results_with_is_rowcount( mock_result = SemanticResult( requests=[SemanticRequest(type="SQL", definition="SELECT COUNT(*)")], - results=main_df, + results=pa.Table.from_pandas(main_df), ) mock_datasource.implementation.get_row_count = mocker.Mock(return_value=mock_result)