mirror of
https://github.com/apache/superset.git
synced 2026-04-11 20:37:16 +00:00
196 lines
7.1 KiB
Python
196 lines
7.1 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
|
|
|
|
"""
Cardinality validation to prevent unusable visualizations from high-cardinality data.
"""
|
|
|
|
import logging
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
# Module-level logger; check_cardinality uses it to report validation failures.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CardinalityValidator:
|
|
"""
|
|
Validates cardinality of dimensions to prevent charts with too many categories
|
|
that become unreadable or cause performance issues.
|
|
"""
|
|
|
|
# Thresholds for different chart types
|
|
CARDINALITY_THRESHOLDS = {
|
|
"bar": 50, # Bar charts become unreadable with >50 bars
|
|
"line": 100, # Line charts can handle more points
|
|
"scatter": 500, # Scatter plots can show many points
|
|
"area": 30, # Area charts need fewer categories
|
|
"table": 1000, # Tables can handle many rows with pagination
|
|
"default": 50, # Conservative default
|
|
}
|
|
|
|
# Known high-cardinality column patterns
|
|
HIGH_CARDINALITY_PATTERNS = [
|
|
"id",
|
|
"uuid",
|
|
"guid",
|
|
"email",
|
|
"phone",
|
|
"address",
|
|
"session",
|
|
"transaction",
|
|
"order_number",
|
|
"invoice",
|
|
"timestamp",
|
|
"datetime",
|
|
"created_at",
|
|
"updated_at",
|
|
]
|
|
|
|
@staticmethod
|
|
def check_cardinality(
|
|
dataset_id: int | str,
|
|
x_column: str,
|
|
chart_type: str = "default",
|
|
group_by_column: str | None = None,
|
|
) -> Tuple[bool, Dict[str, Any] | None]:
|
|
"""
|
|
Check cardinality of X-axis and group_by columns.
|
|
|
|
Returns:
|
|
Tuple of (is_ok, warning_info)
|
|
"""
|
|
try:
|
|
# Quick pattern check first (no DB query needed)
|
|
pattern_warnings = CardinalityValidator._check_column_patterns(
|
|
x_column, group_by_column
|
|
)
|
|
|
|
if pattern_warnings:
|
|
return False, {
|
|
"warnings": pattern_warnings,
|
|
"suggestions": CardinalityValidator._get_suggestions(
|
|
x_column, chart_type, pattern_based=True
|
|
),
|
|
}
|
|
|
|
# For non-pattern columns, we could do actual cardinality check
|
|
# but that requires DB access - for now just return OK
|
|
# In production, you'd want to cache cardinality stats
|
|
|
|
return True, None
|
|
|
|
except Exception as e:
|
|
logger.warning("Cardinality check failed: %s", e)
|
|
# Don't block on validation failures
|
|
return True, None
|
|
|
|
@staticmethod
|
|
def _check_column_patterns(
|
|
x_column: str, group_by_column: str | None = None
|
|
) -> List[str]:
|
|
"""Check for known high-cardinality column patterns."""
|
|
warnings = []
|
|
|
|
x_lower = x_column.lower()
|
|
|
|
# Check X-axis column
|
|
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
|
if pattern in x_lower:
|
|
warnings.append(
|
|
f"Column '{x_column}' appears to be a high-cardinality field "
|
|
f"(contains '{pattern}'). This may create an unreadable chart "
|
|
f"with too many categories on the X-axis."
|
|
)
|
|
break
|
|
|
|
# Check group_by column if present
|
|
if group_by_column:
|
|
group_lower = group_by_column.lower()
|
|
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
|
if pattern in group_lower:
|
|
warnings.append(
|
|
f"Group by column '{group_by_column}' appears to be a "
|
|
f"high-cardinality field (contains '{pattern}'). This may "
|
|
f"create too many series to visualize effectively."
|
|
)
|
|
break
|
|
|
|
return warnings
|
|
|
|
@staticmethod
|
|
def _get_suggestions(
|
|
column: str, chart_type: str, pattern_based: bool = False
|
|
) -> List[str]:
|
|
"""Get suggestions for handling high cardinality."""
|
|
suggestions = []
|
|
|
|
if pattern_based:
|
|
# Suggestions when we detected high-cardinality patterns
|
|
if any(p in column.lower() for p in ["id", "uuid", "guid"]):
|
|
suggestions.extend(
|
|
[
|
|
"Consider using a different column for the X-axis",
|
|
f"If you need to analyze by {column}, use filters to limit "
|
|
f"the data",
|
|
"A table chart might be more appropriate for ID-based data",
|
|
]
|
|
)
|
|
elif any(p in column.lower() for p in ["email", "phone", "address"]):
|
|
suggestions.extend(
|
|
[
|
|
"Consider grouping by a higher-level category (e.g., "
|
|
"domain for emails)",
|
|
f"Use filters to focus on specific {column} values",
|
|
"Aggregate the data before visualization",
|
|
]
|
|
)
|
|
elif any(
|
|
p in column.lower() for p in ["timestamp", "datetime", "created_at"]
|
|
):
|
|
suggestions.extend(
|
|
[
|
|
"Consider truncating timestamps to date or hour level",
|
|
"Use time-based grouping (daily, weekly, monthly)",
|
|
"Apply date range filters to limit the data",
|
|
]
|
|
)
|
|
else:
|
|
# General high-cardinality suggestions
|
|
threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
|
|
suggestions.extend(
|
|
[
|
|
f"This chart type works best with fewer than {threshold} "
|
|
f"categories",
|
|
"Consider using filters to reduce the number of values",
|
|
"Try grouping or categorizing the data at a higher level",
|
|
"A table or pivot table might better display high-cardinality data",
|
|
]
|
|
)
|
|
|
|
return suggestions
|
|
|
|
@staticmethod
|
|
def suggest_chart_type(cardinality: int) -> List[str]:
|
|
"""Suggest appropriate chart types based on cardinality."""
|
|
if cardinality <= 10:
|
|
return ["bar", "pie", "donut", "area"]
|
|
elif cardinality <= 30:
|
|
return ["bar", "line", "area"]
|
|
elif cardinality <= 100:
|
|
return ["line", "scatter"]
|
|
else:
|
|
return ["table", "pivot_table", "heatmap"]
|