Files
superset2/superset/mcp_service/chart/validation/runtime/cardinality_validator.py

196 lines
7.1 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Cardinality validation to prevent unusable visualizations from high-cardinality data.
"""
import logging
from typing import Any, Dict, List, Tuple
logger = logging.getLogger(__name__)
class CardinalityValidator:
"""
Validates cardinality of dimensions to prevent charts with too many categories
that become unreadable or cause performance issues.
"""
# Thresholds for different chart types
CARDINALITY_THRESHOLDS = {
"bar": 50, # Bar charts become unreadable with >50 bars
"line": 100, # Line charts can handle more points
"scatter": 500, # Scatter plots can show many points
"area": 30, # Area charts need fewer categories
"table": 1000, # Tables can handle many rows with pagination
"default": 50, # Conservative default
}
# Known high-cardinality column patterns
HIGH_CARDINALITY_PATTERNS = [
"id",
"uuid",
"guid",
"email",
"phone",
"address",
"session",
"transaction",
"order_number",
"invoice",
"timestamp",
"datetime",
"created_at",
"updated_at",
]
@staticmethod
def check_cardinality(
dataset_id: int | str,
x_column: str,
chart_type: str = "default",
group_by_column: str | None = None,
) -> Tuple[bool, Dict[str, Any] | None]:
"""
Check cardinality of X-axis and group_by columns.
Returns:
Tuple of (is_ok, warning_info)
"""
try:
# Quick pattern check first (no DB query needed)
pattern_warnings = CardinalityValidator._check_column_patterns(
x_column, group_by_column
)
if pattern_warnings:
return False, {
"warnings": pattern_warnings,
"suggestions": CardinalityValidator._get_suggestions(
x_column, chart_type, pattern_based=True
),
}
# For non-pattern columns, we could do actual cardinality check
# but that requires DB access - for now just return OK
# In production, you'd want to cache cardinality stats
return True, None
except Exception as e:
logger.warning("Cardinality check failed: %s", e)
# Don't block on validation failures
return True, None
@staticmethod
def _check_column_patterns(
x_column: str, group_by_column: str | None = None
) -> List[str]:
"""Check for known high-cardinality column patterns."""
warnings = []
x_lower = x_column.lower()
# Check X-axis column
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
if pattern in x_lower:
warnings.append(
f"Column '{x_column}' appears to be a high-cardinality field "
f"(contains '{pattern}'). This may create an unreadable chart "
f"with too many categories on the X-axis."
)
break
# Check group_by column if present
if group_by_column:
group_lower = group_by_column.lower()
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
if pattern in group_lower:
warnings.append(
f"Group by column '{group_by_column}' appears to be a "
f"high-cardinality field (contains '{pattern}'). This may "
f"create too many series to visualize effectively."
)
break
return warnings
@staticmethod
def _get_suggestions(
column: str, chart_type: str, pattern_based: bool = False
) -> List[str]:
"""Get suggestions for handling high cardinality."""
suggestions = []
if pattern_based:
# Suggestions when we detected high-cardinality patterns
if any(p in column.lower() for p in ["id", "uuid", "guid"]):
suggestions.extend(
[
"Consider using a different column for the X-axis",
f"If you need to analyze by {column}, use filters to limit "
f"the data",
"A table chart might be more appropriate for ID-based data",
]
)
elif any(p in column.lower() for p in ["email", "phone", "address"]):
suggestions.extend(
[
"Consider grouping by a higher-level category (e.g., "
"domain for emails)",
f"Use filters to focus on specific {column} values",
"Aggregate the data before visualization",
]
)
elif any(
p in column.lower() for p in ["timestamp", "datetime", "created_at"]
):
suggestions.extend(
[
"Consider truncating timestamps to date or hour level",
"Use time-based grouping (daily, weekly, monthly)",
"Apply date range filters to limit the data",
]
)
else:
# General high-cardinality suggestions
threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
suggestions.extend(
[
f"This chart type works best with fewer than {threshold} "
f"categories",
"Consider using filters to reduce the number of values",
"Try grouping or categorizing the data at a higher level",
"A table or pivot table might better display high-cardinality data",
]
)
return suggestions
@staticmethod
def suggest_chart_type(cardinality: int) -> List[str]:
"""Suggest appropriate chart types based on cardinality."""
if cardinality <= 10:
return ["bar", "pie", "donut", "area"]
elif cardinality <= 30:
return ["bar", "line", "area"]
elif cardinality <= 100:
return ["line", "scatter"]
else:
return ["table", "pivot_table", "heatmap"]