mirror of
https://github.com/apache/superset.git
synced 2026-05-07 17:04:58 +00:00
829 lines
28 KiB
Python
829 lines
28 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# ruff: noqa: S608
|
|
|
|
from __future__ import annotations
|
|
|
|
import itertools
|
|
import re
|
|
from collections import defaultdict
|
|
from textwrap import dedent
|
|
|
|
from pandas import DataFrame
|
|
from snowflake.connector import connect, DictCursor
|
|
from snowflake.sqlalchemy.snowdialect import SnowflakeDialect
|
|
|
|
from superset.semantic_layers.snowflake.schemas import SnowflakeConfiguration
|
|
from superset.semantic_layers.snowflake.utils import (
|
|
get_connection_parameters,
|
|
substitute_parameters,
|
|
validate_order_by,
|
|
)
|
|
from superset.semantic_layers.types import (
|
|
AdhocExpression,
|
|
AdhocFilter,
|
|
BINARY,
|
|
BOOLEAN,
|
|
DATE,
|
|
DATETIME,
|
|
DECIMAL,
|
|
Dimension,
|
|
Filter,
|
|
FilterValues,
|
|
GroupLimit,
|
|
INTEGER,
|
|
Metric,
|
|
NUMBER,
|
|
OBJECT,
|
|
Operator,
|
|
OrderTuple,
|
|
PredicateType,
|
|
SemanticRequest,
|
|
SemanticResult,
|
|
SemanticViewFeature,
|
|
SemanticViewImplementation,
|
|
STRING,
|
|
TIME,
|
|
Type,
|
|
)
|
|
|
|
# Label passed as the first argument of every `SemanticRequest` built in this module.
REQUEST_TYPE = "snowflake"
|
|
|
|
|
|
class SnowflakeSemanticView(SemanticViewImplementation):
|
|
# Optional capabilities this implementation declares: adhoc expressions in
# ORDER BY, limiting to the top N groups, and folding the rest into "Other".
features = frozenset(
    (
        SemanticViewFeature.ADHOC_EXPRESSIONS_IN_ORDERBY,
        SemanticViewFeature.GROUP_LIMIT,
        SemanticViewFeature.GROUP_OTHERS,
    )
)
|
|
|
|
def __init__(self, name: str, configuration: SnowflakeConfiguration):
    """
    Bind the view to its name and configuration, then load its metadata.

    The dimensions and metrics are fetched eagerly through `get_dimensions()`
    and `get_metrics()`, so constructing an instance connects to Snowflake.
    """
    self.name = name
    self.configuration = configuration

    # identifiers are quoted with the Snowflake SQLAlchemy dialect's preparer
    preparer = SnowflakeDialect().identifier_preparer
    self._quote = preparer.quote

    # these rely on `name`, `configuration` and `_quote` being set already
    self.dimensions = self.get_dimensions()
    self.metrics = self.get_metrics()
|
|
|
|
def uid(self) -> str:
    """
    Return the quoted, dot-separated identifier of the semantic view.

    The parts are the configured database, the configured schema and the view
    name, each passed through `self._quote`.
    """
    configuration = self.configuration
    parts = (configuration.database, configuration.schema_, self.name)
    return ".".join(map(self._quote, parts))
|
|
|
|
def get_dimensions(self) -> set[Dimension]:
    """
    Get the dimensions defined in the semantic view.

    Even though Snowflake supports `SHOW SEMANTIC DIMENSIONS IN my_semantic_view`,
    it doesn't return the expression of dimensions, so we use a slightly more
    complicated query to get all the information we need in one go.

    Raises:
        ValueError: if a dimension has no `TABLE` property, since the table is
            needed to build the dimension ID.
    """
    query = dedent(
        f"""
        DESC SEMANTIC VIEW {self.uid()}
            ->> SELECT "object_name", "property", "property_value"
                FROM $1
                WHERE
                    "object_kind" = 'DIMENSION' AND
                    "property" IN ('COMMENT', 'DATA_TYPE', 'EXPRESSION', 'TABLE');
        """
    ).strip()

    connection_parameters = get_connection_parameters(self.configuration)
    with connect(**connection_parameters) as connection:
        cursor = connection.cursor(DictCursor)
        rows = cursor.execute(query).fetchall()

    # Group properties by object name with a dict instead of
    # `itertools.groupby`: `groupby` only merges adjacent rows and the query
    # has no ORDER BY, so rows of one object are not guaranteed to be contiguous.
    properties: defaultdict[str, dict[str, str | None]] = defaultdict(dict)
    for row in rows:
        properties[row["object_name"]].setdefault(
            row["property"],
            row["property_value"],
        )

    dimensions: set[Dimension] = set()
    for name, attributes in properties.items():
        table = attributes.get("TABLE")
        if table is None:
            raise ValueError(
                f"Dimension {name!r} in semantic view {self.uid()} has no TABLE "
                "property"
            )

        id_ = f"{table}.{name}"
        # `_get_type` accepts `None`, so a missing DATA_TYPE is not an error
        type_ = self._get_type(attributes.get("DATA_TYPE"))
        description = attributes.get("COMMENT")
        definition = attributes.get("EXPRESSION")

        # NOTE(review): `get_metrics` passes (definition, description) to
        # `Metric` while this passes (description, definition) to `Dimension`;
        # confirm the positional order against the dataclass definitions.
        dimensions.add(Dimension(id_, name, type_, description, definition))

    return dimensions
|
|
|
|
def get_metrics(self) -> set[Metric]:
    """
    Get the metrics defined in the semantic view.

    The metadata comes from `DESC SEMANTIC VIEW`, filtered down to the comment,
    data type, expression and table of every object of kind `METRIC`.

    Raises:
        ValueError: if a metric has no `TABLE` property, since the table is
            needed to build the metric ID.
    """
    query = dedent(
        f"""
        DESC SEMANTIC VIEW {self.uid()}
            ->> SELECT "object_name", "property", "property_value"
                FROM $1
                WHERE
                    "object_kind" = 'METRIC' AND
                    "property" IN ('COMMENT', 'DATA_TYPE', 'EXPRESSION', 'TABLE');
        """
    ).strip()

    connection_parameters = get_connection_parameters(self.configuration)
    with connect(**connection_parameters) as connection:
        cursor = connection.cursor(DictCursor)
        rows = cursor.execute(query).fetchall()

    # Group properties by object name with a dict instead of
    # `itertools.groupby`: `groupby` only merges adjacent rows and the query
    # has no ORDER BY, so rows of one object are not guaranteed to be contiguous.
    properties: defaultdict[str, dict[str, str | None]] = defaultdict(dict)
    for row in rows:
        properties[row["object_name"]].setdefault(
            row["property"],
            row["property_value"],
        )

    metrics: set[Metric] = set()
    for name, attributes in properties.items():
        table = attributes.get("TABLE")
        if table is None:
            raise ValueError(
                f"Metric {name!r} in semantic view {self.uid()} has no TABLE "
                "property"
            )

        id_ = f"{table}.{name}"
        # `_get_type` accepts `None`, so a missing DATA_TYPE is not an error
        type_ = self._get_type(attributes.get("DATA_TYPE"))
        description = attributes.get("COMMENT")
        definition = attributes.get("EXPRESSION")

        metrics.add(Metric(id_, name, type_, definition, description))

    return metrics
|
|
|
|
def _get_type(self, snowflake_type: str | None) -> type[Type]:
    """
    Return the semantic type corresponding to a Snowflake type.

    Patterns are matched case-insensitively from the start of the type name
    (`re.match`) and anchored at the end with `$`. Entries are tried in
    declaration order, so the specific `NUMBER(38, 0)` and `NUMBER(10, 2)`
    entries win over the generic `NUMBER(p, s)` one.

    `None` and unrecognized types fall back to `STRING`.
    """
    if snowflake_type is None:
        return STRING

    type_map = {
        STRING: (r"VARCHAR\(\d+\)$", r"STRING$", r"TEXT$", r"CHAR\(\d+\)$"),
        INTEGER: (r"NUMBER\(38,\s?0\)$", r"INT$", r"INTEGER$", r"BIGINT$"),
        DECIMAL: (r"NUMBER\(10,\s?2\)$",),
        NUMBER: (r"NUMBER\(\d+,\s?\d+\)$", r"FLOAT$", r"DOUBLE$"),
        BOOLEAN: (r"BOOLEAN$",),
        DATE: (r"DATE$",),
        # all timestamp variants, with or without a precision such as `(9)`
        DATETIME: (r"TIMESTAMP_(TZ|NTZ|LTZ)(\(\d+\))?$",),
        TIME: (r"TIME$",),
        OBJECT: (r"OBJECT$",),
        BINARY: (r"BINARY\(\d+\)$", r"VARBINARY\(\d+\)$"),
    }
    for semantic_type, patterns in type_map.items():
        if any(
            re.match(pattern, snowflake_type, re.IGNORECASE) for pattern in patterns
        ):
            return semantic_type

    return STRING
|
|
|
|
def _build_predicates(
    self,
    filters: list[Filter | AdhocFilter],
) -> tuple[str, tuple[FilterValues, ...]]:
    """
    Combine filters into one `AND`ed predicate plus its bind parameters.

    The method does not distinguish `WHERE` from `HAVING` predicates, so the
    caller must pass in filters of a single predicate type.

    Returns:
        Tuple of (predicate SQL, parameters in placeholder order). Both are
        empty when there are no filters.
    """
    if not filters:
        return "", ()

    # operators that take no value, and therefore no bind parameter
    unary_operators = {Operator.IS_NULL, Operator.IS_NOT_NULL}

    predicates: list[str] = []
    parameters: list[FilterValues] = []
    for filter_ in filters:
        # adhoc filters already carry their SQL, so they are used verbatim
        if isinstance(filter_, AdhocFilter):
            predicates.append(f"({filter_.definition})")
            continue

        predicates.append(f"({self._build_native_filter(filter_)})")
        if filter_.operator in unary_operators:
            continue

        # a set contributes one parameter per member, anything else just one
        if isinstance(filter_.value, (set, frozenset)):
            parameters.extend(filter_.value)
        else:
            parameters.append(filter_.value)

    return " AND ".join(predicates), tuple(parameters)
|
|
|
|
def get_values(
    self,
    dimension: Dimension,
    filters: set[Filter | AdhocFilter] | None = None,
) -> SemanticResult:
    """
    Return the values of a dimension, optionally restricted by filters.

    Only filters of type `PredicateType.WHERE` are applied; they are sorted
    before being turned into a predicate.
    """
    where_filters = sorted(
        filter_
        for filter_ in (filters or [])
        if filter_.type == PredicateType.WHERE
    )
    where_clause, parameters = self._build_predicates(where_filters)
    where_sql = f"WHERE {where_clause}" if where_clause else ""

    query = dedent(
        f"""
        SELECT {self._quote(dimension.name)}
        FROM SEMANTIC_VIEW(
            {self.uid()}
            DIMENSIONS {dimension.id}
            {where_sql}
        )
        """
    ).strip()

    with connect(**get_connection_parameters(self.configuration)) as connection:
        cursor = connection.cursor()
        df = cursor.execute(query, parameters).fetch_pandas_all()

    # the recorded request carries the SQL with the parameters substituted in
    request = SemanticRequest(
        REQUEST_TYPE,
        substitute_parameters(query, parameters),
    )
    return SemanticResult(requests=[request], results=df)
|
|
|
|
def _build_native_filter(self, filter_: Filter) -> str:
    """
    Render a native `Filter` as a SQL predicate string.

    Values are never inlined: each one is represented by a `?` placeholder.
    `IS NULL` and `IS NOT NULL` take no placeholder, `IN` and `NOT IN` take one
    per member of a set value (or one for a non-set value), and every other
    operator takes exactly one.
    """
    operator = filter_.operator
    value = filter_.value
    column_name = self._quote(filter_.column.name)

    # unary operators: no value needed
    if operator in {Operator.IS_NULL, Operator.IS_NOT_NULL}:
        return f"{column_name} {operator.value}"

    # plain binary operators: a single placeholder
    if operator not in {Operator.IN, Operator.NOT_IN}:
        return f"{column_name} {operator.value} ?"

    # membership operators: one placeholder per value
    parameter_count = len(value) if isinstance(value, (set, frozenset)) else 1
    placeholders = ", ".join(["?"] * parameter_count)
    return f"{column_name} {operator.value} ({placeholders})"
|
|
|
|
def get_dataframe(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    filters: set[Filter | AdhocFilter] | None = None,
    order: list[OrderTuple] | None = None,
    limit: int | None = None,
    offset: int | None = None,
    *,
    group_limit: GroupLimit | None = None,
) -> SemanticResult:
    """
    Execute a query and return the results as a Pandas DataFrame.

    The DataFrame is wrapped in a `SemanticResult` together with the executed
    SQL. Its columns are renamed from metric/dimension IDs to their names.

    When neither metrics nor dimensions are requested no query is run, and the
    result holds an empty DataFrame and no requests.
    """
    if not metrics and not dimensions:
        # wrap the empty frame so the return type matches the annotation
        return SemanticResult(requests=[], results=DataFrame())

    query, parameters = self._get_query(
        metrics,
        dimensions,
        filters,
        order,
        limit,
        offset,
        group_limit,
    )
    connection_parameters = get_connection_parameters(self.configuration)
    with connect(**connection_parameters) as connection:
        df = connection.cursor().execute(query, parameters).fetch_pandas_all()

    # map column names to dimension/metric names instead of IDs
    mapping = {
        **{dimension.id: dimension.name for dimension in dimensions},
        **{metric.id: metric.name for metric in metrics},
    }
    df = df.rename(columns=mapping)

    return SemanticResult(
        requests=[
            SemanticRequest(
                REQUEST_TYPE,
                substitute_parameters(query, parameters),
            )
        ],
        results=df,
    )
|
|
|
|
def get_row_count(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    filters: set[Filter | AdhocFilter] | None = None,
    order: list[OrderTuple] | None = None,
    limit: int | None = None,
    offset: int | None = None,
    *,
    group_limit: GroupLimit | None = None,
) -> SemanticResult:
    """
    Execute a query and return the number of rows the result would have.

    The data query is wrapped in `SELECT COUNT(*) FROM (...)`. The result is a
    one-row DataFrame with a single `COUNT` column; when neither metrics nor
    dimensions are requested no query is run and the count is 0.
    """
    if not metrics and not dimensions:
        return SemanticResult(
            requests=[],
            results=DataFrame([[0]], columns=["COUNT"]),
        )

    query, parameters = self._get_query(
        metrics,
        dimensions,
        filters,
        order,
        limit,
        offset,
        group_limit,
    )
    query = f"SELECT COUNT(*) FROM ({query}) AS subquery"
    connection_parameters = get_connection_parameters(self.configuration)
    with connect(**connection_parameters) as connection:
        row = connection.cursor().execute(query, parameters).fetchone()

    # same shape as the empty-request result above
    count = row[0] if row else 0
    df = DataFrame([[count]], columns=["COUNT"])

    return SemanticResult(
        requests=[
            SemanticRequest(
                REQUEST_TYPE,
                substitute_parameters(query, parameters),
            )
        ],
        results=df,
    )
|
|
|
|
def _get_query(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    filters: set[Filter | AdhocFilter] | None = None,
    order: list[OrderTuple] | None = None,
    limit: int | None = None,
    offset: int | None = None,
    group_limit: GroupLimit | None = None,
) -> tuple[str, tuple[FilterValues, ...]]:
    """
    Build a query to fetch data from the semantic view.

    This also returns the parameters needed to run `cursor.execute()`, passed
    separately to prevent SQL injection. The parameters are ordered like the
    `?` placeholders in the SQL: those of the top-groups CTE (if any) first,
    then those of the main `WHERE` clause.

    Raises:
        ValueError: if `offset` is given without `limit`, or if any filter is a
            `HAVING` filter.
    """
    if limit is None and offset is not None:
        raise ValueError("Offset cannot be set without limit")

    filters = filters or set()
    where_clause, where_parameters = self._build_predicates(
        # TODO: sort to ensure deterministic order for parameters
        [filter_ for filter_ in filters if filter_.type == PredicateType.WHERE]
    )
    # having clauses are not supported, since there's no GROUP BY
    if any(filter_.type == PredicateType.HAVING for filter_ in filters):
        raise ValueError("HAVING filters are not supported")

    if not group_limit:
        query = self._build_simple_query(
            metrics,
            dimensions,
            where_clause,
            order,
            limit,
            offset,
        )
        return query, where_parameters

    query, cte_parameters = self._build_query_with_group_limit(
        metrics,
        dimensions,
        where_clause,
        order,
        limit,
        offset,
        group_limit,
    )
    if group_limit.filters is None:
        # Without dedicated group-limit filters the CTE embeds the main WHERE
        # clause, so its placeholders appear twice in the SQL. Positional
        # binding needs one value per placeholder, hence the values are
        # supplied once for the CTE and once for the main query.
        cte_parameters = where_parameters

    # Combine parameters: CTE params first, then main query params
    return query, cte_parameters + where_parameters
|
|
|
|
def _alias_element(self, element: Metric | Dimension) -> str:
    """
    Return `<id> AS <quoted id>` for a metric or dimension.

    The alias is the element ID passed through `self._quote`.
    """
    identifier = element.id
    alias = self._quote(identifier)
    return f"{identifier} AS {alias}"
|
|
|
|
def _build_order_clause(
    self,
    order: list[OrderTuple] | None = None,
) -> str:
    """
    Build the body of an ORDER BY clause from (element, direction) tuples.

    Metrics and dimensions are referenced by their quoted ID. Adhoc expressions
    are checked with `validate_order_by` and then used verbatim. The direction
    is appended to every entry, adhoc expressions included, so an adhoc
    expression may list several columns only if the last one has no direction.

    This is fine:

        gender ASC, COUNT(*)

    But this is not:

        gender ASC, COUNT(*) DESC

    because the resulting query would end in:

        ... ORDER BY gender ASC, COUNT(*) DESC DESC

    Returns an empty string when there is nothing to order by.
    """
    if not order:
        return ""

    terms: list[str] = []
    for element, direction in order:
        if isinstance(element, AdhocExpression):
            validate_order_by(element.definition)
            expression = element.definition
        else:
            expression = self._quote(element.id)
        terms.append(f"{expression} {direction.value}")

    return ", ".join(terms)
|
|
|
|
def _build_simple_query(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    where_clause: str,
    order: list[OrderTuple] | None,
    limit: int | None,
    offset: int | None,
) -> str:
    """
    Build a plain `SEMANTIC_VIEW` query, i.e. one without group limiting.

    Every clause is optional and is left out when its input is empty (or
    `None`, for `limit` and `offset`).
    """
    dimension_arguments = ", ".join(map(self._alias_element, dimensions))
    metric_arguments = ", ".join(map(self._alias_element, metrics))
    order_clause = self._build_order_clause(order)

    # render each optional clause up front; an empty string drops the clause
    dimensions_sql = f"DIMENSIONS {dimension_arguments}" if dimension_arguments else ""
    metrics_sql = f"METRICS {metric_arguments}" if metric_arguments else ""
    where_sql = f"WHERE {where_clause}" if where_clause else ""
    order_sql = f"ORDER BY {order_clause}" if order_clause else ""
    limit_sql = "" if limit is None else f"LIMIT {limit}"
    offset_sql = "" if offset is None else f"OFFSET {offset}"

    return dedent(
        f"""
        SELECT * FROM SEMANTIC_VIEW(
            {self.uid()}
            {dimensions_sql}
            {metrics_sql}
            {where_sql}
        )
        {order_sql}
        {limit_sql}
        {offset_sql}
        """
    ).strip()
|
|
|
|
def _build_top_groups_cte(
    self,
    group_limit: GroupLimit,
    where_clause: str,
) -> tuple[str, tuple[FilterValues, ...]]:
    """
    Build the `top_groups` CTE selecting the top N limited-dimension values.

    When `group_limit.filters` is set, those filters replace the main query's
    `where_clause` inside the CTE, which allows different bounds (e.g. time)
    for ranking the groups than for showing the data. Otherwise the main
    `where_clause` is reused and no parameters are returned for it.

    Groups are ranked by `group_limit.metric` when given, else by the first
    limited dimension, in `group_limit.direction`.

    Returns:
        Tuple of (CTE SQL, parameters for the CTE)

    Raises:
        ValueError: if `group_limit.filters` contains a `HAVING` filter.
    """
    limited = group_limit.dimensions
    limited_dimension_arguments = ", ".join(map(self._alias_element, limited))
    limited_dimension_names = ", ".join(
        self._quote(dimension.id) for dimension in limited
    )

    if group_limit.filters is None:
        # same filters as the main query; its parameters are not repeated here
        group_where_clause = where_clause
        cte_params: tuple[FilterValues, ...] = ()
    else:
        # dedicated filters for ranking the groups
        group_where_clause, cte_params = self._build_predicates(
            sorted(
                filter_
                for filter_ in group_limit.filters
                if filter_.type == PredicateType.WHERE
            )
        )
        if any(
            filter_.type == PredicateType.HAVING for filter_ in group_limit.filters
        ):
            raise ValueError(
                "HAVING filters are not supported in group limit filters"
            )

    direction = group_limit.direction.value
    if group_limit.metric is None:
        # no metric to rank by, so fall back to the first limited dimension
        metrics_clause = ""
        order_by_clause = f"{self._quote(limited[0].id)} {direction}"
    else:
        quoted_metric = self._quote(group_limit.metric.id)
        metrics_clause = f"METRICS {group_limit.metric.id} AS {quoted_metric}"
        order_by_clause = f"{quoted_metric} {direction}"

    # arguments of SEMANTIC_VIEW(), one per line
    semantic_view_args = [f"DIMENSIONS {limited_dimension_arguments}"]
    if metrics_clause:
        semantic_view_args.append(metrics_clause)
    if group_where_clause:
        semantic_view_args.append(f"WHERE {group_where_clause}")
    semantic_view_args_str = "\n ".join(semantic_view_args)

    # keep a trailing blank line where the WHERE clause would have been
    if not group_where_clause:
        semantic_view_args_str += "\n"

    cte_sql = dedent(
        f"""
        WITH top_groups AS (
            SELECT {limited_dimension_names}
            FROM SEMANTIC_VIEW(
                {self.uid()}
                {semantic_view_args_str}
            )
            ORDER BY
                {order_by_clause}
            LIMIT {group_limit.top}
        )
        """
    ).strip()

    return cte_sql, cte_params
|
|
|
|
def _build_group_filter(self, group_limit: GroupLimit) -> str:
    """
    Build the predicate that keeps only rows belonging to `top_groups`.

    A single limited dimension gives `col IN (SELECT col FROM top_groups)`;
    several give the tuple form `(a, b) IN (SELECT a, b FROM top_groups)`.
    """
    quoted = [self._quote(dimension.id) for dimension in group_limit.dimensions]

    if len(quoted) == 1:
        (dimension_id,) = quoted
        return f"{dimension_id} IN (SELECT {dimension_id} FROM top_groups)"

    # multi-column IN clause
    dimension_tuple = ", ".join(quoted)
    return f"({dimension_tuple}) IN (SELECT {dimension_tuple} FROM top_groups)"
|
|
|
|
def _build_case_expression(
    self,
    dimension: Dimension,
    group_condition: str,
) -> str:
    """
    Build a CASE expression mapping values outside the top groups to 'Other'.

    Args:
        dimension: The dimension whose quoted ID is kept for top groups
        group_condition: SQL condition telling whether a row is in the top
            groups (e.g., "staff_id IN (SELECT staff_id FROM top_groups)")

    Returns:
        SQL CASE expression
    """
    dimension_id = self._quote(dimension.id)
    lines = (
        "CASE",
        f"WHEN {group_condition} THEN {dimension_id}",
        "ELSE CAST('Other' AS VARCHAR)",
        "END",
    )
    return "\n".join(lines)
|
|
|
|
def _build_query_with_others(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    where_clause: str,
    order: list[OrderTuple] | None,
    limit: int | None,
    offset: int | None,
    group_limit: GroupLimit,
) -> tuple[str, tuple[FilterValues, ...]]:
    """
    Build a query that groups non-top N values as 'Other'.

    The query has three stages:

    1. `top_groups` CTE to find the top N groups
    2. `raw_data` CTE with the unmodified rows from the semantic view
    3. Outer query that replaces non-top values with 'Other' via CASE
       expressions and re-aggregates every metric with `SUM`

    Returns:
        Tuple of (SQL query, CTE parameters)
    """
    top_groups_cte, cte_params = self._build_top_groups_cte(
        group_limit,
        where_clause,
    )

    # Determine which dimensions are limited vs non-limited
    limited_dimension_ids = {dim.id for dim in group_limit.dimensions}
    non_limited_dimensions = [
        dim for dim in dimensions if dim.id not in limited_dimension_ids
    ]

    # The membership test is the same predicate used to filter to top groups
    group_condition = self._build_group_filter(group_limit)

    # CASE expressions for limited dimensions, kept both bare (for GROUP BY)
    # and aliased (for SELECT)
    case_expressions_for_groupby = [
        self._build_case_expression(dim, group_condition)
        for dim in group_limit.dimensions
    ]
    case_expressions = [
        f"{case_expr} AS {self._quote(dim.id)}"
        for dim, case_expr in zip(
            group_limit.dimensions,
            case_expressions_for_groupby,
        )
    ]

    # Non-limited dimensions pass through unchanged
    non_limited_columns = [self._quote(dim.id) for dim in non_limited_dimensions]
    non_limited_selects = [f"{column} AS {column}" for column in non_limited_columns]

    # Metrics are re-aggregated over the new grouping
    metric_aggregations = [
        f"SUM({self._quote(metric.id)}) AS {self._quote(metric.id)}"
        for metric in metrics
    ]

    # Build the CTE that gets raw data from SEMANTIC_VIEW; like the other query
    # builders, omit DIMENSIONS/METRICS when there is nothing to list
    dimension_arguments = ", ".join(
        self._alias_element(dimension) for dimension in dimensions
    )
    metric_arguments = ", ".join(self._alias_element(metric) for metric in metrics)

    subquery = dedent(
        f"""
        raw_data AS (
            SELECT * FROM SEMANTIC_VIEW(
                {self.uid()}
                {"DIMENSIONS " + dimension_arguments if dimension_arguments else ""}
                {"METRICS " + metric_arguments if metric_arguments else ""}
                {"WHERE " + where_clause if where_clause else ""}
            )
        )
        """
    ).strip()

    # Build GROUP BY clause (full CASE expressions + non-limited dimensions)
    # We need to repeat the full CASE expressions, not use aliases, because
    # Snowflake may interpret the alias as the original column reference
    group_by_clause = ", ".join(case_expressions_for_groupby + non_limited_columns)

    # Build final SELECT columns
    select_columns = case_expressions + non_limited_selects + metric_aggregations
    select_clause = ",\n ".join(select_columns)

    # Build ORDER BY clause (need to reference the aliased columns)
    order_clause = self._build_order_clause(order)

    query = dedent(
        f"""
        {top_groups_cte},
        {subquery}
        SELECT
            {select_clause}
        FROM raw_data
        GROUP BY {group_by_clause}
        {"ORDER BY " + order_clause if order_clause else ""}
        {"LIMIT " + str(limit) if limit is not None else ""}
        {"OFFSET " + str(offset) if offset is not None else ""}
        """
    ).strip()

    return query, cte_params
|
|
|
|
def _build_query_with_group_limit(
    self,
    metrics: list[Metric],
    dimensions: list[Dimension],
    where_clause: str,
    order: list[OrderTuple] | None,
    limit: int | None,
    offset: int | None,
    group_limit: GroupLimit,
) -> tuple[str, tuple[FilterValues, ...]]:
    """
    Build a query restricted to the top N groups.

    With `group_limit.group_others` set, the work is delegated to
    `_build_query_with_others`, which folds the remaining values into 'Other'.
    Otherwise rows outside the top N groups are filtered out.

    Returns:
        Tuple of (SQL query, CTE parameters)
    """
    if group_limit.group_others:
        return self._build_query_with_others(
            metrics,
            dimensions,
            where_clause,
            order,
            limit,
            offset,
            group_limit,
        )

    # Plain group limiting. CTE references can't be used inside
    # SEMANTIC_VIEW(), so the view is wrapped and filtered from the outside.
    dimension_arguments = ", ".join(map(self._alias_element, dimensions))
    metric_arguments = ", ".join(map(self._alias_element, metrics))
    order_clause = self._build_order_clause(order)

    top_groups_cte, cte_params = self._build_top_groups_cte(
        group_limit,
        where_clause,
    )
    group_filter = self._build_group_filter(group_limit)

    # render each optional clause up front; an empty string drops the clause
    dimensions_sql = f"DIMENSIONS {dimension_arguments}" if dimension_arguments else ""
    metrics_sql = f"METRICS {metric_arguments}" if metric_arguments else ""
    where_sql = f"WHERE {where_clause}" if where_clause else ""
    order_sql = f"ORDER BY {order_clause}" if order_clause else ""
    limit_sql = "" if limit is None else f"LIMIT {limit}"
    offset_sql = "" if offset is None else f"OFFSET {offset}"

    query = dedent(
        f"""
        {top_groups_cte}
        SELECT * FROM SEMANTIC_VIEW(
            {self.uid()}
            {dimensions_sql}
            {metrics_sql}
            {where_sql}
        ) AS subquery
        WHERE {group_filter}
        {order_sql}
        {limit_sql}
        {offset_sql}
        """
    ).strip()

    return query, cte_params
|
|
|
|
# Reuse `uid` as the repr, so an instance prints as its quoted qualified name.
__repr__ = uid
|