mirror of
https://github.com/apache/superset.git
synced 2026-04-20 00:24:38 +00:00
feat(mcp): MCP service implementation (PRs 3-9 consolidated) (#35877)
This commit is contained in:
@@ -0,0 +1,195 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
Cardinality validation to prevent unusable visualizations from high-cardinality data.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CardinalityValidator:
|
||||
"""
|
||||
Validates cardinality of dimensions to prevent charts with too many categories
|
||||
that become unreadable or cause performance issues.
|
||||
"""
|
||||
|
||||
# Thresholds for different chart types
|
||||
CARDINALITY_THRESHOLDS = {
|
||||
"bar": 50, # Bar charts become unreadable with >50 bars
|
||||
"line": 100, # Line charts can handle more points
|
||||
"scatter": 500, # Scatter plots can show many points
|
||||
"area": 30, # Area charts need fewer categories
|
||||
"table": 1000, # Tables can handle many rows with pagination
|
||||
"default": 50, # Conservative default
|
||||
}
|
||||
|
||||
# Known high-cardinality column patterns
|
||||
HIGH_CARDINALITY_PATTERNS = [
|
||||
"id",
|
||||
"uuid",
|
||||
"guid",
|
||||
"email",
|
||||
"phone",
|
||||
"address",
|
||||
"session",
|
||||
"transaction",
|
||||
"order_number",
|
||||
"invoice",
|
||||
"timestamp",
|
||||
"datetime",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def check_cardinality(
|
||||
dataset_id: int | str,
|
||||
x_column: str,
|
||||
chart_type: str = "default",
|
||||
group_by_column: str | None = None,
|
||||
) -> Tuple[bool, Dict[str, Any] | None]:
|
||||
"""
|
||||
Check cardinality of X-axis and group_by columns.
|
||||
|
||||
Returns:
|
||||
Tuple of (is_ok, warning_info)
|
||||
"""
|
||||
try:
|
||||
# Quick pattern check first (no DB query needed)
|
||||
pattern_warnings = CardinalityValidator._check_column_patterns(
|
||||
x_column, group_by_column
|
||||
)
|
||||
|
||||
if pattern_warnings:
|
||||
return False, {
|
||||
"warnings": pattern_warnings,
|
||||
"suggestions": CardinalityValidator._get_suggestions(
|
||||
x_column, chart_type, pattern_based=True
|
||||
),
|
||||
}
|
||||
|
||||
# For non-pattern columns, we could do actual cardinality check
|
||||
# but that requires DB access - for now just return OK
|
||||
# In production, you'd want to cache cardinality stats
|
||||
|
||||
return True, None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Cardinality check failed: %s", e)
|
||||
# Don't block on validation failures
|
||||
return True, None
|
||||
|
||||
@staticmethod
|
||||
def _check_column_patterns(
|
||||
x_column: str, group_by_column: str | None = None
|
||||
) -> List[str]:
|
||||
"""Check for known high-cardinality column patterns."""
|
||||
warnings = []
|
||||
|
||||
x_lower = x_column.lower()
|
||||
|
||||
# Check X-axis column
|
||||
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
||||
if pattern in x_lower:
|
||||
warnings.append(
|
||||
f"Column '{x_column}' appears to be a high-cardinality field "
|
||||
f"(contains '{pattern}'). This may create an unreadable chart "
|
||||
f"with too many categories on the X-axis."
|
||||
)
|
||||
break
|
||||
|
||||
# Check group_by column if present
|
||||
if group_by_column:
|
||||
group_lower = group_by_column.lower()
|
||||
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
||||
if pattern in group_lower:
|
||||
warnings.append(
|
||||
f"Group by column '{group_by_column}' appears to be a "
|
||||
f"high-cardinality field (contains '{pattern}'). This may "
|
||||
f"create too many series to visualize effectively."
|
||||
)
|
||||
break
|
||||
|
||||
return warnings
|
||||
|
||||
@staticmethod
|
||||
def _get_suggestions(
|
||||
column: str, chart_type: str, pattern_based: bool = False
|
||||
) -> List[str]:
|
||||
"""Get suggestions for handling high cardinality."""
|
||||
suggestions = []
|
||||
|
||||
if pattern_based:
|
||||
# Suggestions when we detected high-cardinality patterns
|
||||
if any(p in column.lower() for p in ["id", "uuid", "guid"]):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider using a different column for the X-axis",
|
||||
f"If you need to analyze by {column}, use filters to limit "
|
||||
f"the data",
|
||||
"A table chart might be more appropriate for ID-based data",
|
||||
]
|
||||
)
|
||||
elif any(p in column.lower() for p in ["email", "phone", "address"]):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider grouping by a higher-level category (e.g., "
|
||||
"domain for emails)",
|
||||
f"Use filters to focus on specific {column} values",
|
||||
"Aggregate the data before visualization",
|
||||
]
|
||||
)
|
||||
elif any(
|
||||
p in column.lower() for p in ["timestamp", "datetime", "created_at"]
|
||||
):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider truncating timestamps to date or hour level",
|
||||
"Use time-based grouping (daily, weekly, monthly)",
|
||||
"Apply date range filters to limit the data",
|
||||
]
|
||||
)
|
||||
else:
|
||||
# General high-cardinality suggestions
|
||||
threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
|
||||
suggestions.extend(
|
||||
[
|
||||
f"This chart type works best with fewer than {threshold} "
|
||||
f"categories",
|
||||
"Consider using filters to reduce the number of values",
|
||||
"Try grouping or categorizing the data at a higher level",
|
||||
"A table or pivot table might better display high-cardinality data",
|
||||
]
|
||||
)
|
||||
|
||||
return suggestions
|
||||
|
||||
@staticmethod
|
||||
def suggest_chart_type(cardinality: int) -> List[str]:
|
||||
"""Suggest appropriate chart types based on cardinality."""
|
||||
if cardinality <= 10:
|
||||
return ["bar", "pie", "donut", "area"]
|
||||
elif cardinality <= 30:
|
||||
return ["bar", "line", "area"]
|
||||
elif cardinality <= 100:
|
||||
return ["line", "scatter"]
|
||||
else:
|
||||
return ["table", "pivot_table", "heatmap"]
|
||||
Reference in New Issue
Block a user