mirror of
https://github.com/apache/superset.git
synced 2026-04-24 18:44:53 +00:00
feat(mcp): MCP service implementation (PRs 3-9 consolidated) (#35877)
This commit is contained in:
185
superset/mcp_service/chart/validation/runtime/__init__.py
Normal file
185
superset/mcp_service/chart/validation/runtime/__init__.py
Normal file
@@ -0,0 +1,185 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
Runtime validation module for chart configurations.
|
||||
Validates performance, compatibility, and user experience issues.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import List, Tuple
|
||||
|
||||
from superset.mcp_service.chart.schemas import (
|
||||
ChartConfig,
|
||||
XYChartConfig,
|
||||
)
|
||||
from superset.mcp_service.common.error_schemas import ChartGenerationError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RuntimeValidator:
    """Runs the individual runtime checks and merges their findings."""

    @staticmethod
    def validate_runtime_issues(
        config: ChartConfig, dataset_id: int | str
    ) -> Tuple[bool, ChartGenerationError | None]:
        """
        Run every runtime check and fold the findings into a single error.

        Args:
            config: Chart configuration to validate
            dataset_id: Dataset identifier, forwarded to the individual checks

        Returns:
            ``(True, None)`` when no check reported a warning; otherwise
            ``(False, error)`` where ``error`` is built by ``ChartErrorBuilder``
            from the first three warnings and the first five suggestions.
        """
        # Each entry is the (warnings, suggestions) pair reported by one check.
        findings: List[Tuple[List[str], List[str]]] = []

        if isinstance(config, XYChartConfig):
            # Format and cardinality checks are restricted to XY charts.
            findings.append(
                (RuntimeValidator._validate_format_compatibility(config), [])
            )
            findings.append(RuntimeValidator._validate_cardinality(config, dataset_id))

        # The chart type check runs for every configuration.
        findings.append(RuntimeValidator._validate_chart_type(config, dataset_id))

        problems: List[str] = []
        advice: List[str] = []
        for reported, hints in findings:
            # Suggestions are only kept when the same check also warned.
            if reported:
                problems.extend(reported)
                advice.extend(hints)

        if not problems:
            return True, None

        # Imported lazily: only needed when there is something to report.
        from superset.mcp_service.utils.error_builder import ChartErrorBuilder

        reason = "; ".join(problems[:3])
        if len(problems) > 3:
            reason += "..."

        error = ChartErrorBuilder.build_error(
            error_type="runtime_semantic_warning",
            template_key="performance_warning",
            template_vars={"reason": reason},
            custom_suggestions=advice[:5],  # Limit suggestions
            error_code="RUNTIME_SEMANTIC_WARNING",
        )
        return False, error

    @staticmethod
    def _validate_format_compatibility(config: XYChartConfig) -> List[str]:
        """
        Collect warnings from ``FormatTypeValidator`` for an XY chart.

        Any failure (including a failed import) is logged and results in an
        empty list, so this check never raises.
        """
        found: List[str] = []

        try:
            # Import here to avoid circular imports
            from .format_validator import FormatTypeValidator

            _, reported = FormatTypeValidator.validate_format_compatibility(config)
            if reported:
                found.extend(reported)
        except ImportError:
            logger.warning("Format validator not available")
        except Exception as exc:
            logger.warning("Format validation failed: %s", exc)

        return found

    @staticmethod
    def _validate_cardinality(
        config: XYChartConfig, dataset_id: int | str
    ) -> Tuple[List[str], List[str]]:
        """
        Collect cardinality warnings and suggestions for an XY chart.

        Delegates to ``CardinalityValidator.check_cardinality`` with the chart's
        X column and optional group-by column. Any failure is logged and yields
        two empty lists.

        Returns:
            Tuple of (warnings, suggestions)
        """
        problems: List[str] = []
        advice: List[str] = []

        try:
            # Import here to avoid circular imports
            from .cardinality_validator import CardinalityValidator

            # The chart kind selects the cardinality threshold; fall back to
            # "default" when the config exposes no ``kind`` attribute.
            chart_kind = getattr(config, "kind", "default")

            check = CardinalityValidator.check_cardinality
            x_name = config.x.name
            group_name = None
            if config.group_by:
                group_name = config.group_by.name

            within_limits, details = check(
                dataset_id=dataset_id,
                x_column=x_name,
                chart_type=chart_kind,
                group_by_column=group_name,
            )

            if details and not within_limits:
                problems.extend(details.get("warnings", []))
                advice.extend(details.get("suggestions", []))
        except ImportError:
            logger.warning("Cardinality validator not available")
        except Exception as exc:
            logger.warning("Cardinality validation failed: %s", exc)

        return problems, advice

    @staticmethod
    def _validate_chart_type(
        config: ChartConfig, dataset_id: int | str
    ) -> Tuple[List[str], List[str]]:
        """
        Collect chart-type appropriateness warnings and suggestions.

        Delegates to ``ChartTypeSuggester.analyze_and_suggest``; when it reports
        recommended chart types they are appended as one extra suggestion. Any
        failure is logged and yields whatever was collected so far.

        Returns:
            Tuple of (warnings, suggestions)
        """
        problems: List[str] = []
        advice: List[str] = []

        try:
            # Import here to avoid circular imports
            from .chart_type_suggester import ChartTypeSuggester

            fits, details = ChartTypeSuggester.analyze_and_suggest(config, dataset_id)

            if details and not fits:
                problems.extend(details.get("issues", []))
                advice.extend(details.get("suggestions", []))

                # Add recommended chart types
                recommended = details.get("recommended_types", [])
                if recommended:
                    advice.append(
                        "Recommended chart types for this data: "
                        + ", ".join(recommended)
                    )
        except ImportError:
            logger.warning("Chart type suggester not available")
        except Exception as exc:
            logger.warning("Chart type validation failed: %s", exc)

        return problems, advice
|
||||
@@ -0,0 +1,195 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
Cardinality validation to prevent unusable visualizations from high-cardinality data.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CardinalityValidator:
|
||||
"""
|
||||
Validates cardinality of dimensions to prevent charts with too many categories
|
||||
that become unreadable or cause performance issues.
|
||||
"""
|
||||
|
||||
# Thresholds for different chart types
|
||||
CARDINALITY_THRESHOLDS = {
|
||||
"bar": 50, # Bar charts become unreadable with >50 bars
|
||||
"line": 100, # Line charts can handle more points
|
||||
"scatter": 500, # Scatter plots can show many points
|
||||
"area": 30, # Area charts need fewer categories
|
||||
"table": 1000, # Tables can handle many rows with pagination
|
||||
"default": 50, # Conservative default
|
||||
}
|
||||
|
||||
# Known high-cardinality column patterns
|
||||
HIGH_CARDINALITY_PATTERNS = [
|
||||
"id",
|
||||
"uuid",
|
||||
"guid",
|
||||
"email",
|
||||
"phone",
|
||||
"address",
|
||||
"session",
|
||||
"transaction",
|
||||
"order_number",
|
||||
"invoice",
|
||||
"timestamp",
|
||||
"datetime",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def check_cardinality(
|
||||
dataset_id: int | str,
|
||||
x_column: str,
|
||||
chart_type: str = "default",
|
||||
group_by_column: str | None = None,
|
||||
) -> Tuple[bool, Dict[str, Any] | None]:
|
||||
"""
|
||||
Check cardinality of X-axis and group_by columns.
|
||||
|
||||
Returns:
|
||||
Tuple of (is_ok, warning_info)
|
||||
"""
|
||||
try:
|
||||
# Quick pattern check first (no DB query needed)
|
||||
pattern_warnings = CardinalityValidator._check_column_patterns(
|
||||
x_column, group_by_column
|
||||
)
|
||||
|
||||
if pattern_warnings:
|
||||
return False, {
|
||||
"warnings": pattern_warnings,
|
||||
"suggestions": CardinalityValidator._get_suggestions(
|
||||
x_column, chart_type, pattern_based=True
|
||||
),
|
||||
}
|
||||
|
||||
# For non-pattern columns, we could do actual cardinality check
|
||||
# but that requires DB access - for now just return OK
|
||||
# In production, you'd want to cache cardinality stats
|
||||
|
||||
return True, None
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Cardinality check failed: %s", e)
|
||||
# Don't block on validation failures
|
||||
return True, None
|
||||
|
||||
@staticmethod
|
||||
def _check_column_patterns(
|
||||
x_column: str, group_by_column: str | None = None
|
||||
) -> List[str]:
|
||||
"""Check for known high-cardinality column patterns."""
|
||||
warnings = []
|
||||
|
||||
x_lower = x_column.lower()
|
||||
|
||||
# Check X-axis column
|
||||
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
||||
if pattern in x_lower:
|
||||
warnings.append(
|
||||
f"Column '{x_column}' appears to be a high-cardinality field "
|
||||
f"(contains '{pattern}'). This may create an unreadable chart "
|
||||
f"with too many categories on the X-axis."
|
||||
)
|
||||
break
|
||||
|
||||
# Check group_by column if present
|
||||
if group_by_column:
|
||||
group_lower = group_by_column.lower()
|
||||
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
|
||||
if pattern in group_lower:
|
||||
warnings.append(
|
||||
f"Group by column '{group_by_column}' appears to be a "
|
||||
f"high-cardinality field (contains '{pattern}'). This may "
|
||||
f"create too many series to visualize effectively."
|
||||
)
|
||||
break
|
||||
|
||||
return warnings
|
||||
|
||||
@staticmethod
|
||||
def _get_suggestions(
|
||||
column: str, chart_type: str, pattern_based: bool = False
|
||||
) -> List[str]:
|
||||
"""Get suggestions for handling high cardinality."""
|
||||
suggestions = []
|
||||
|
||||
if pattern_based:
|
||||
# Suggestions when we detected high-cardinality patterns
|
||||
if any(p in column.lower() for p in ["id", "uuid", "guid"]):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider using a different column for the X-axis",
|
||||
f"If you need to analyze by {column}, use filters to limit "
|
||||
f"the data",
|
||||
"A table chart might be more appropriate for ID-based data",
|
||||
]
|
||||
)
|
||||
elif any(p in column.lower() for p in ["email", "phone", "address"]):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider grouping by a higher-level category (e.g., "
|
||||
"domain for emails)",
|
||||
f"Use filters to focus on specific {column} values",
|
||||
"Aggregate the data before visualization",
|
||||
]
|
||||
)
|
||||
elif any(
|
||||
p in column.lower() for p in ["timestamp", "datetime", "created_at"]
|
||||
):
|
||||
suggestions.extend(
|
||||
[
|
||||
"Consider truncating timestamps to date or hour level",
|
||||
"Use time-based grouping (daily, weekly, monthly)",
|
||||
"Apply date range filters to limit the data",
|
||||
]
|
||||
)
|
||||
else:
|
||||
# General high-cardinality suggestions
|
||||
threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
|
||||
suggestions.extend(
|
||||
[
|
||||
f"This chart type works best with fewer than {threshold} "
|
||||
f"categories",
|
||||
"Consider using filters to reduce the number of values",
|
||||
"Try grouping or categorizing the data at a higher level",
|
||||
"A table or pivot table might better display high-cardinality data",
|
||||
]
|
||||
)
|
||||
|
||||
return suggestions
|
||||
|
||||
@staticmethod
|
||||
def suggest_chart_type(cardinality: int) -> List[str]:
|
||||
"""Suggest appropriate chart types based on cardinality."""
|
||||
if cardinality <= 10:
|
||||
return ["bar", "pie", "donut", "area"]
|
||||
elif cardinality <= 30:
|
||||
return ["bar", "line", "area"]
|
||||
elif cardinality <= 100:
|
||||
return ["line", "scatter"]
|
||||
else:
|
||||
return ["table", "pivot_table", "heatmap"]
|
||||
@@ -0,0 +1,437 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
Chart type suggestions based on data characteristics and user intent.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from superset.mcp_service.chart.schemas import (
|
||||
ChartConfig,
|
||||
ColumnRef,
|
||||
TableChartConfig,
|
||||
XYChartConfig,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChartTypeSuggester:
    """
    Suggests appropriate chart types based on data characteristics
    and identifies potential mismatches between chart type and data.

    All analysis is based on column names and aggregates found in the chart
    configuration; the dataset itself is not queried.
    """

    # Identifier markers that must match a whole word of the column name, so
    # that names such as "width" or "monkey" are not mistaken for IDs.
    _ID_WORDS = frozenset({"id", "key"})
    # Identifier markers distinctive enough to be matched as substrings.
    _ID_SUBSTRINGS = ("uuid", "guid")

    @staticmethod
    def analyze_and_suggest(
        config: ChartConfig,
        dataset_id: int | str,  # noqa: ARG002
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """
        Analyze chart configuration and suggest better chart types if needed.

        Args:
            config: Chart configuration to analyze
            dataset_id: Dataset identifier (unused)

        Returns:
            Tuple of (is_appropriate, suggestion_info). ``suggestion_info`` is
            ``None`` when the chart looks appropriate, otherwise a dict with
            ``"issues"``, ``"suggestions"`` and ``"recommended_types"``.
            Configurations that are neither XY nor table charts, and any
            unexpected failure, are reported as ``(True, None)``.
        """
        try:
            if isinstance(config, XYChartConfig):
                return ChartTypeSuggester._analyze_xy_chart(config)
            elif isinstance(config, TableChartConfig):
                return ChartTypeSuggester._analyze_table_chart(config)
            else:
                return True, None
        except Exception as e:
            logger.warning("Chart type analysis failed: %s", e)
            return True, None  # Don't block on suggestion failures

    @staticmethod
    def _name_tokens(name: str) -> List[str]:
        """Split a column name into lowercase words.

        Words are separated by any non-alphanumeric character and by
        lower-to-upper case transitions, so ``user_id`` and ``userId`` both
        yield ``["user", "id"]``.
        """
        pieces: List[str] = []
        previous = ""
        for char in name:
            if not char.isalnum():
                pieces.append(" ")
            else:
                if char.isupper() and previous.islower():
                    pieces.append(" ")  # camelCase boundary
                pieces.append(char.lower())
            previous = char
        return "".join(pieces).split()

    @staticmethod
    def _looks_like_id(name: str) -> bool:
        """Return whether a column name looks like an identifier column."""
        lowered = name.lower()
        if any(marker in lowered for marker in ChartTypeSuggester._ID_SUBSTRINGS):
            return True
        tokens = ChartTypeSuggester._name_tokens(name)
        return not ChartTypeSuggester._ID_WORDS.isdisjoint(tokens)

    @staticmethod
    def _analyze_xy_chart(
        config: XYChartConfig,
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """Analyze XY chart appropriateness.

        Returns:
            ``(True, None)`` when no chart-type specific issue was found
            (general suggestions alone do not make a chart inappropriate),
            otherwise ``(False, info)``.
        """
        issues: List[str] = []
        suggestions: List[str] = []

        x_analysis = ChartTypeSuggester._analyze_x_axis(config.x.name)
        y_analysis = ChartTypeSuggester._analyze_y_axis(config.y)

        # Check chart type specific issues
        chart_issues, chart_suggestions = ChartTypeSuggester._check_chart_type_issues(
            config, x_analysis, y_analysis
        )
        issues.extend(chart_issues)
        suggestions.extend(chart_suggestions)

        # Add general suggestions
        suggestions.extend(
            ChartTypeSuggester._get_general_suggestions(x_analysis, y_analysis)
        )

        if issues:
            return False, {
                "issues": issues,
                "suggestions": suggestions,
                "recommended_types": ChartTypeSuggester._get_recommended_types(
                    x_analysis["is_temporal"],
                    x_analysis["is_categorical"],
                    y_analysis["has_count"],
                    y_analysis["num_metrics"],
                ),
            }

        return True, None

    @staticmethod
    def _analyze_x_axis(x_name: str) -> Dict[str, Any]:
        """Analyze X-axis characteristics.

        Returns:
            Dict with boolean flags ``is_temporal``, ``is_categorical`` and
            ``is_id`` derived from the column name, plus the original ``name``.
        """
        x_name_lower = x_name.lower()
        temporal_markers = (
            "date",
            "time",
            "year",
            "month",
            "day",
            "hour",
            "created",
            "updated",
        )
        categorical_markers = (
            "category",
            "type",
            "status",
            "department",
            "region",
            "country",
            "state",
        )
        return {
            "is_temporal": any(t in x_name_lower for t in temporal_markers),
            "is_categorical": any(c in x_name_lower for c in categorical_markers),
            "is_id": ChartTypeSuggester._looks_like_id(x_name),
            "name": x_name,
        }

    @staticmethod
    def _analyze_y_axis(y_columns: List[ColumnRef]) -> Dict[str, Any]:
        """Analyze Y-axis characteristics.

        Returns:
            Dict with ``has_count`` (any metric aggregated with COUNT or
            COUNT_DISTINCT) and ``num_metrics`` (number of Y columns).
        """
        return {
            "has_count": any(
                col.aggregate in ["COUNT", "COUNT_DISTINCT"] for col in y_columns
            ),
            "num_metrics": len(y_columns),
        }

    @staticmethod
    def _check_chart_type_issues(
        config: XYChartConfig, x_analysis: Dict[str, Any], y_analysis: Dict[str, Any]
    ) -> Tuple[List[str], List[str]]:
        """Check for chart type specific issues.

        Dispatches on ``config.kind`` to the matching helper; kinds other than
        line, scatter, area and bar produce no issues.

        Returns:
            Tuple of (issues, suggestions)
        """
        # Extract analysis values
        x_is_temporal = x_analysis["is_temporal"]
        x_is_categorical = x_analysis["is_categorical"]
        x_is_id = x_analysis["is_id"]
        num_metrics = y_analysis["num_metrics"]

        if config.kind == "line":
            return ChartTypeSuggester._check_line_chart_issues(
                config, x_is_temporal, x_is_categorical, x_is_id
            )
        if config.kind == "scatter":
            return ChartTypeSuggester._check_scatter_chart_issues(
                config, x_is_categorical, num_metrics
            )
        if config.kind == "area":
            return ChartTypeSuggester._check_area_chart_issues(config, x_is_temporal)
        if config.kind == "bar":
            return ChartTypeSuggester._check_bar_chart_issues(config, x_is_id)

        return [], []

    @staticmethod
    def _check_line_chart_issues(
        config: XYChartConfig,
        x_is_temporal: bool,
        x_is_categorical: bool,
        x_is_id: bool,
    ) -> Tuple[List[str], List[str]]:
        """Check line chart specific issues.

        Reports at most one issue: a categorical (non-temporal) X-axis takes
        precedence over an ID-like X-axis.
        """
        issues: List[str] = []
        suggestions: List[str] = []

        if not x_is_temporal and x_is_categorical:
            issues.append(
                f"Line chart with categorical X-axis '{config.x.name}' may not "
                f"show meaningful trends"
            )
            suggestions.extend(
                [
                    "Consider using a bar chart for categorical comparisons",
                    "Line charts work best with temporal or continuous data",
                ]
            )
        elif x_is_id:
            issues.append(
                f"Line chart with ID field '{config.x.name}' on X-axis will not "
                f"show meaningful patterns"
            )
            suggestions.extend(
                [
                    "Use a table to display individual records",
                    "Or aggregate the data by a meaningful dimension",
                ]
            )

        return issues, suggestions

    @staticmethod
    def _check_scatter_chart_issues(
        config: XYChartConfig, x_is_categorical: bool, num_metrics: int
    ) -> Tuple[List[str], List[str]]:
        """Check scatter chart specific issues.

        The categorical X-axis check and the multiple-metrics check are
        independent; both may be reported.
        """
        issues: List[str] = []
        suggestions: List[str] = []

        if x_is_categorical:
            issues.append(
                f"Scatter plot with categorical X-axis '{config.x.name}' may not "
                f"effectively show correlations"
            )
            suggestions.extend(
                [
                    "Scatter plots work best with two continuous variables",
                    "Consider a bar chart for categorical vs numeric data",
                ]
            )
        if num_metrics > 1:
            issues.append("Scatter plots with multiple Y metrics can be confusing")
            suggestions.extend(
                [
                    "Consider using only one Y metric for clarity",
                    "Or use a line/bar chart to compare multiple metrics",
                ]
            )

        return issues, suggestions

    @staticmethod
    def _check_area_chart_issues(
        config: XYChartConfig, x_is_temporal: bool
    ) -> Tuple[List[str], List[str]]:
        """Check area chart specific issues.

        Reports a non-temporal X-axis, plus one issue per Y column whose name
        hints at negative values. The advice for negative values is added only
        once, however many columns match.
        """
        issues: List[str] = []
        suggestions: List[str] = []

        if not x_is_temporal:
            issues.append(
                f"Area chart with non-temporal X-axis '{config.x.name}' may be "
                f"misleading"
            )
            suggestions.extend(
                [
                    "Area charts imply cumulative or part-to-whole relationships over "
                    "time",
                    "Consider a stacked bar chart for categorical data",
                ]
            )

        # Check for potential negative values
        negative_advice_given = False
        for col in config.y:
            if any(term in col.name.lower() for term in ["loss", "debt", "negative"]):
                issues.append(
                    f"Area chart with potentially negative values in '{col.name}' "
                    f"can create visual confusion"
                )
                if not negative_advice_given:
                    # Same advice applies to every matching column; repeating
                    # it would only crowd out other suggestions.
                    suggestions.extend(
                        [
                            "Use a line chart for data that can go negative",
                            "Or ensure all values are positive before using area chart",
                        ]
                    )
                    negative_advice_given = True

        return issues, suggestions

    @staticmethod
    def _check_bar_chart_issues(
        config: XYChartConfig, x_is_id: bool
    ) -> Tuple[List[str], List[str]]:
        """Check bar chart specific issues (an ID-like X-axis)."""
        issues: List[str] = []
        suggestions: List[str] = []

        if x_is_id:
            issues.append(
                f"Bar chart with ID field '{config.x.name}' may create too many bars"
            )
            suggestions.extend(
                [
                    "Consider aggregating by a higher-level category",
                    "Or use filters to limit the number of bars displayed",
                ]
            )

        return issues, suggestions

    @staticmethod
    def _get_general_suggestions(
        x_analysis: Dict[str, Any], y_analysis: Dict[str, Any]
    ) -> List[str]:
        """Get general suggestions based on data patterns.

        Returns:
            A list holding at most one suggestion; the first matching rule
            wins.
        """
        suggestions: List[str] = []
        x_is_temporal = x_analysis["is_temporal"]
        x_is_categorical = x_analysis["is_categorical"]
        has_count = y_analysis["has_count"]
        num_metrics = y_analysis["num_metrics"]

        if has_count and x_is_categorical:
            suggestions.append(
                "This looks like frequency analysis - bar charts work well for counts "
                "by category"
            )
        elif x_is_temporal and num_metrics == 1:
            suggestions.append(
                "Single metric over time - line charts are ideal for showing trends"
            )
        elif x_is_temporal and num_metrics > 3:
            suggestions.append(
                "Many metrics over time - consider focusing on 2-3 key metrics for "
                "clarity"
            )

        return suggestions

    @staticmethod
    def _analyze_table_chart(
        config: TableChartConfig,
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """Analyze table chart appropriateness.

        Issues are raised for tables that are mostly aggregated metrics and
        for tables with more than ten columns. An ID-heavy table only adds
        suggestions, which are returned only if another rule raised an issue.
        """
        issues: List[str] = []
        suggestions: List[str] = []

        # Count different column types
        raw_columns = sum(1 for col in config.columns if not col.aggregate)
        metric_columns = sum(1 for col in config.columns if col.aggregate)
        total_columns = len(config.columns)

        # Check if data might be better visualized
        if metric_columns > 0 and raw_columns <= 2:
            # Mostly metrics with few dimensions - could be visualized
            issues.append(
                "Table with mostly aggregated metrics could be visualized as a chart"
            )
            suggestions.append("Consider a bar chart to compare metric values visually")
            suggestions.append("Or use a line chart if there's a time dimension")

        # Check for ID-heavy tables
        id_columns = sum(
            1 for col in config.columns if ChartTypeSuggester._looks_like_id(col.name)
        )
        if id_columns > total_columns / 2:
            suggestions.append(
                "Table appears to be ID-heavy - ensure this is for detailed record "
                "inspection"
            )
            suggestions.append(
                "For analysis, consider aggregating by meaningful dimensions instead"
            )

        # Very wide tables
        if total_columns > 10:
            issues.append(
                f"Table with {total_columns} columns may be difficult to read"
            )
            suggestions.append("Consider showing only the most important columns")
            suggestions.append("Or break into multiple focused views")

        if issues:
            return False, {
                "issues": issues,
                "suggestions": suggestions,
                "recommended_types": ["table", "pivot_table"]
                if metric_columns > 0
                else ["table"],
            }

        return True, None

    @staticmethod
    def _get_recommended_types(
        x_is_temporal: bool, x_is_categorical: bool, has_count: bool, num_metrics: int
    ) -> List[str]:
        """Get recommended chart types based on data characteristics.

        Returns:
            Chart type names in order of preference; ``"table"`` is always
            included.
        """
        recommendations: List[str] = []

        if x_is_temporal:
            recommendations.extend(["line", "area", "bar"])
            if num_metrics == 1:
                recommendations.append("scatter")  # For trend analysis
        elif x_is_categorical:
            recommendations.extend(["bar", "table"])
            if has_count and num_metrics == 1:
                recommendations.append("pie")  # For proportion analysis
        else:
            # Continuous or unclear X-axis
            recommendations.extend(["scatter", "line", "table"])

        # Always include table as fallback
        if "table" not in recommendations:
            recommendations.append("table")

        return recommendations

    @staticmethod
    def get_chart_type_description(chart_type: str) -> str:
        """Get a description of when to use each chart type.

        Unknown chart types get a generic description that names the type.
        """
        descriptions = {
            "line": "Best for showing trends over time or continuous data",
            "bar": "Ideal for comparing values across categories",
            "area": "Shows cumulative totals and part-to-whole relationships over time",
            "scatter": "Reveals correlations between two continuous variables",
            "table": "Displays detailed data or many dimensions at once",
            "pie": "Shows proportions of a whole (use sparingly, max 5-7 slices)",
            "pivot_table": "Summarizes data across multiple dimensions",
        }
        return descriptions.get(
            chart_type, f"Visualizes data using {chart_type} format"
        )
|
||||
@@ -0,0 +1,225 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""
|
||||
Format-type compatibility validation to prevent misleading data presentation.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import List, Tuple
|
||||
|
||||
from superset.mcp_service.chart.schemas import ColumnRef, XYChartConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FormatTypeValidator:
|
||||
"""
|
||||
Validates that format strings are appropriate for the data type and aggregation.
|
||||
Prevents issues like currency formatting on COUNT data or percentage on absolute
|
||||
values.
|
||||
"""
|
||||
|
||||
# Format patterns and their appropriate uses
|
||||
CURRENCY_PATTERNS = [
|
||||
r"\$", # Dollar sign
|
||||
r"€", # Euro
|
||||
r"£", # Pound
|
||||
r"¥", # Yen
|
||||
r"[,.]2f", # Two decimal places (common for currency)
|
||||
r"\$[,.]", # Dollar with thousands separator
|
||||
]
|
||||
|
||||
PERCENTAGE_PATTERNS = [
|
||||
r"%", # Percentage sign
|
||||
r"\.0%", # Percentage with no decimals
|
||||
r"\.1%", # Percentage with 1 decimal
|
||||
r"\.2%", # Percentage with 2 decimals
|
||||
]
|
||||
|
||||
INTEGER_PATTERNS = [
|
||||
r"\.0f", # No decimals
|
||||
r",d", # Integer with thousands separator
|
||||
r"[,.]0f", # Integer format variations
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def validate_format_compatibility(
|
||||
config: XYChartConfig,
|
||||
) -> Tuple[bool, List[str] | None]:
|
||||
"""
|
||||
Validate that axis formats are appropriate for the data types.
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, warnings_list)
|
||||
"""
|
||||
warnings = []
|
||||
|
||||
# Validate Y-axis format against metrics
|
||||
if config.y_axis and config.y_axis.format:
|
||||
y_warnings = FormatTypeValidator._validate_y_axis_format(
|
||||
config.y_axis.format, config.y
|
||||
)
|
||||
warnings.extend(y_warnings)
|
||||
|
||||
# Validate X-axis format (usually temporal or categorical)
|
||||
if config.x_axis and config.x_axis.format:
|
||||
x_warnings = FormatTypeValidator._validate_x_axis_format(
|
||||
config.x_axis.format, config.x
|
||||
)
|
||||
warnings.extend(x_warnings)
|
||||
|
||||
return len(warnings) == 0, warnings if warnings else None
|
||||
|
||||
@staticmethod
|
||||
def _validate_y_axis_format(
|
||||
format_string: str, y_columns: List[ColumnRef]
|
||||
) -> List[str]:
|
||||
"""Validate Y-axis format against the metrics."""
|
||||
warnings = []
|
||||
|
||||
warnings.extend(
|
||||
FormatTypeValidator._check_currency_format_issues(format_string, y_columns)
|
||||
)
|
||||
warnings.extend(
|
||||
FormatTypeValidator._check_percentage_format_issues(
|
||||
format_string, y_columns
|
||||
)
|
||||
)
|
||||
warnings.extend(
|
||||
FormatTypeValidator._check_decimal_format_issues(format_string, y_columns)
|
||||
)
|
||||
|
||||
return warnings
|
||||
|
||||
@staticmethod
def _check_currency_format_issues(
    format_string: str, y_columns: List[ColumnRef]
) -> List[str]:
    """
    Warn about a currency-style format applied to count aggregates.

    Args:
        format_string: Format configured for the Y axis.
        y_columns: Columns plotted on the Y axis.

    Returns:
        One warning per column whose aggregate is ``COUNT`` or
        ``COUNT_DISTINCT``; empty if the format is not currency-like.
    """
    if not FormatTypeValidator._is_currency_format(format_string):
        return []

    return [
        f"Currency format '{format_string}' applied to {col.aggregate} "
        f"of '{col.name}'. COUNT operations return whole numbers, not "
        f"currency values. Consider using integer format like ',d' instead."
        for col in y_columns
        if col.aggregate in ["COUNT", "COUNT_DISTINCT"]
    ]
|
||||
|
||||
@staticmethod
def _check_percentage_format_issues(
    format_string: str, y_columns: List[ColumnRef]
) -> List[str]:
    """
    Warn about a percentage format applied to sum/count aggregates.

    Args:
        format_string: Format configured for the Y axis.
        y_columns: Columns plotted on the Y axis.

    Returns:
        One warning per column whose aggregate is ``SUM``, ``COUNT`` or
        ``COUNT_DISTINCT``; empty if the format is not a percentage format.
    """
    if not FormatTypeValidator._is_percentage_format(format_string):
        return []

    messages: List[str] = []
    for col in y_columns:
        if col.aggregate not in ["SUM", "COUNT", "COUNT_DISTINCT"]:
            continue

        # Prefer the user-supplied label; otherwise show AGG(name).
        display_name = col.label or f"{col.aggregate}({col.name})"
        messages.append(
            f"Percentage format '{format_string}' applied to "
            f"{col.aggregate} of '{col.name}'. This will multiply values "
            f"by 100 and add %. "
            f"If '{display_name}' contains absolute values (not ratios 0-1), "
            f"consider using a numeric format instead."
        )

    return messages
|
||||
|
||||
@staticmethod
def _check_decimal_format_issues(
    format_string: str, y_columns: List[ColumnRef]
) -> List[str]:
    """
    Warn about a decimal format applied to count aggregates.

    Args:
        format_string: Format configured for the Y axis.
        y_columns: Columns plotted on the Y axis.

    Returns:
        One warning per ``COUNT`` / ``COUNT_DISTINCT`` column when the
        format requests at least one decimal place; otherwise empty.
    """
    # Cheap pre-filter: a decimal format needs both a dot and a digit.
    if "." not in format_string:
        return []
    if not any(ch.isdigit() for ch in format_string):
        return []

    places = FormatTypeValidator._get_decimal_places(format_string)
    if not places or places <= 0:
        return []

    return [
        f"Decimal format '{format_string}' applied to "
        f"{col.aggregate} of '{col.name}'. COUNT operations "
        f"always return integers. "
        f"Consider using integer format like ',d' or '.0f' instead."
        for col in y_columns
        if col.aggregate in ["COUNT", "COUNT_DISTINCT"]
    ]
|
||||
|
||||
@staticmethod
def _validate_x_axis_format(format_string: str, x_column: ColumnRef) -> List[str]:
    """
    Flag X-axis formats that look like currency or percentage formats.

    The currency check runs first; the percentage check is only consulted
    when the format is not currency-like, so at most one warning results.

    Args:
        format_string: Format configured for the X axis.
        x_column: Column plotted on the X axis.

    Returns:
        A list holding a single warning, or an empty list.
    """
    # Currency on the X axis is almost always a mistake.
    if FormatTypeValidator._is_currency_format(format_string):
        return [
            f"Currency format '{format_string}' applied to X-axis "
            f"'{x_column.name}'. "
            f"X-axis typically shows categories, time, or dimensions, "
            f"not currency. "
            f"Consider removing the format or using a date/category format."
        ]

    # Percentages on the X axis are unusual, though possibly intentional.
    if FormatTypeValidator._is_percentage_format(format_string):
        return [
            f"Percentage format '{format_string}' applied to X-axis "
            f"'{x_column.name}'. "
            f"This is unusual for axis labels. Consider if this is intentional."
        ]

    return []
|
||||
|
||||
@staticmethod
def _is_currency_format(format_string: str) -> bool:
    """
    Tell whether the format string looks like a currency format.

    Returns:
        True as soon as any entry of ``CURRENCY_PATTERNS`` is found in
        ``format_string`` (case-insensitive search), else False.
    """
    for pattern in FormatTypeValidator.CURRENCY_PATTERNS:
        if re.search(pattern, format_string, re.IGNORECASE):
            return True
    return False
|
||||
|
||||
@staticmethod
def _is_percentage_format(format_string: str) -> bool:
    """
    Tell whether the format string looks like a percentage format.

    Returns:
        True as soon as any entry of ``PERCENTAGE_PATTERNS`` is found in
        ``format_string``, else False.
    """
    for pattern in FormatTypeValidator.PERCENTAGE_PATTERNS:
        if re.search(pattern, format_string):
            return True
    return False
|
||||
|
||||
@staticmethod
|
||||
def _get_decimal_places(format_string: str) -> int | None:
|
||||
"""Extract number of decimal places from format string."""
|
||||
if match := re.search(r"\.(\d+)f", format_string):
|
||||
return int(match.group(1))
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def suggest_format(column: ColumnRef) -> str:
|
||||
"""Suggest appropriate format based on column and aggregation."""
|
||||
if column.aggregate in ["COUNT", "COUNT_DISTINCT"]:
|
||||
return ",d" # Integer with thousands separator
|
||||
elif column.aggregate in ["AVG", "STDDEV", "VAR"]:
|
||||
return ",.2f" # Two decimals for statistical measures
|
||||
elif column.aggregate in ["SUM", "MIN", "MAX"]:
|
||||
# Could be currency or regular number, default to flexible
|
||||
return ",.2f" # Two decimals with thousands separator
|
||||
else:
|
||||
return "" # Let Superset decide
|
||||
Reference in New Issue
Block a user