feat(mcp): MCP service implementation (PRs 3-9 consolidated) (#35877)

This commit is contained in:
Amin Ghadersohi
2025-11-01 02:33:21 +11:00
committed by GitHub
parent 30d584afd1
commit fee4e7d8e2
106 changed files with 21826 additions and 223 deletions

View File

@@ -0,0 +1,185 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Runtime validation module for chart configurations.
Validates performance, compatibility, and user experience issues.
"""
import logging
from typing import List, Tuple
from superset.mcp_service.chart.schemas import (
ChartConfig,
XYChartConfig,
)
from superset.mcp_service.common.error_schemas import ChartGenerationError
logger = logging.getLogger(__name__)
class RuntimeValidator:
    """Run every runtime check for a chart configuration and merge the findings."""

    @staticmethod
    def validate_runtime_issues(
        config: ChartConfig, dataset_id: int | str
    ) -> Tuple[bool, ChartGenerationError | None]:
        """
        Look for runtime problems that may hurt rendering or performance.

        Format and cardinality checks run only for XY charts; the chart type
        appropriateness check runs for every configuration.

        Args:
            config: Chart configuration to validate
            dataset_id: Dataset identifier

        Returns:
            ``(True, None)`` when no check produced a warning, otherwise
            ``(False, error)`` where ``error`` is built by ``ChartErrorBuilder``
            from the first three warnings and at most five suggestions.
        """
        collected_warnings: List[str] = []
        collected_suggestions: List[str] = []

        if isinstance(config, XYChartConfig):
            # Format/type compatibility (produces warnings only).
            collected_warnings += RuntimeValidator._validate_format_compatibility(
                config
            )

            # Cardinality: its suggestions are kept only alongside its warnings.
            card_warnings, card_suggestions = RuntimeValidator._validate_cardinality(
                config, dataset_id
            )
            if card_warnings:
                collected_warnings += card_warnings
                collected_suggestions += card_suggestions

        # Chart type appropriateness applies to all chart types.
        kind_warnings, kind_suggestions = RuntimeValidator._validate_chart_type(
            config, dataset_id
        )
        if kind_warnings:
            collected_warnings += kind_warnings
            collected_suggestions += kind_suggestions

        if not collected_warnings:
            return True, None

        # Imported lazily, only when an error actually has to be built.
        from superset.mcp_service.utils.error_builder import (
            ChartErrorBuilder,
        )

        reason = "; ".join(collected_warnings[:3])
        if len(collected_warnings) > 3:
            reason += "..."

        return False, ChartErrorBuilder.build_error(
            error_type="runtime_semantic_warning",
            template_key="performance_warning",
            template_vars={"reason": reason},
            custom_suggestions=collected_suggestions[:5],  # Limit suggestions
            error_code="RUNTIME_SEMANTIC_WARNING",
        )

    @staticmethod
    def _validate_format_compatibility(config: XYChartConfig) -> List[str]:
        """
        Collect format/type compatibility warnings for an XY chart.

        Best-effort: a missing validator module or any failure inside it is
        logged and results in whatever warnings were gathered so far.
        """
        found: List[str] = []
        try:
            # Import here to avoid circular imports
            from .format_validator import FormatTypeValidator

            _, reported = FormatTypeValidator.validate_format_compatibility(config)
            if reported:
                found.extend(reported)
        except ImportError:
            logger.warning("Format validator not available")
        except Exception as exc:
            logger.warning("Format validation failed: %s", exc)
        return found

    @staticmethod
    def _validate_cardinality(
        config: XYChartConfig, dataset_id: int | str
    ) -> Tuple[List[str], List[str]]:
        """
        Collect cardinality warnings and suggestions for an XY chart.

        Returns:
            Tuple of (warnings, suggestions); both are empty when the check
            passes or cannot be performed.
        """
        found: List[str] = []
        advice: List[str] = []
        try:
            # Import here to avoid circular imports
            from .cardinality_validator import CardinalityValidator

            # The chart kind selects the cardinality threshold; configs
            # without a ``kind`` attribute use the default threshold.
            acceptable, details = CardinalityValidator.check_cardinality(
                dataset_id=dataset_id,
                x_column=config.x.name,
                chart_type=getattr(config, "kind", "default"),
                group_by_column=config.group_by.name if config.group_by else None,
            )
            if details and not acceptable:
                found.extend(details.get("warnings", []))
                advice.extend(details.get("suggestions", []))
        except ImportError:
            logger.warning("Cardinality validator not available")
        except Exception as exc:
            logger.warning("Cardinality validation failed: %s", exc)
        return found, advice

    @staticmethod
    def _validate_chart_type(
        config: ChartConfig, dataset_id: int | str
    ) -> Tuple[List[str], List[str]]:
        """
        Collect warnings and suggestions about the chosen chart type.

        Returns:
            Tuple of (warnings, suggestions). When the suggester recommends
            chart types, they are appended as one extra suggestion.
        """
        found: List[str] = []
        advice: List[str] = []
        try:
            # Import here to avoid circular imports
            from .chart_type_suggester import ChartTypeSuggester

            appropriate, details = ChartTypeSuggester.analyze_and_suggest(
                config, dataset_id
            )
            if details and not appropriate:
                found.extend(details.get("issues", []))
                advice.extend(details.get("suggestions", []))

                # Add recommended chart types
                recommended = details.get("recommended_types", [])
                if recommended:
                    recommendations = ", ".join(recommended)
                    advice.append(
                        f"Recommended chart types for this data: {recommendations}"
                    )
        except ImportError:
            logger.warning("Chart type suggester not available")
        except Exception as exc:
            logger.warning("Chart type validation failed: %s", exc)
        return found, advice

View File

@@ -0,0 +1,195 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Cardinality validation to prevent unusable visualizations from high-cardinality data.
"""
import logging
from typing import Any, Dict, List, Tuple
logger = logging.getLogger(__name__)
class CardinalityValidator:
"""
Validates cardinality of dimensions to prevent charts with too many categories
that become unreadable or cause performance issues.
"""
# Thresholds for different chart types
CARDINALITY_THRESHOLDS = {
"bar": 50, # Bar charts become unreadable with >50 bars
"line": 100, # Line charts can handle more points
"scatter": 500, # Scatter plots can show many points
"area": 30, # Area charts need fewer categories
"table": 1000, # Tables can handle many rows with pagination
"default": 50, # Conservative default
}
# Known high-cardinality column patterns
HIGH_CARDINALITY_PATTERNS = [
"id",
"uuid",
"guid",
"email",
"phone",
"address",
"session",
"transaction",
"order_number",
"invoice",
"timestamp",
"datetime",
"created_at",
"updated_at",
]
@staticmethod
def check_cardinality(
dataset_id: int | str,
x_column: str,
chart_type: str = "default",
group_by_column: str | None = None,
) -> Tuple[bool, Dict[str, Any] | None]:
"""
Check cardinality of X-axis and group_by columns.
Returns:
Tuple of (is_ok, warning_info)
"""
try:
# Quick pattern check first (no DB query needed)
pattern_warnings = CardinalityValidator._check_column_patterns(
x_column, group_by_column
)
if pattern_warnings:
return False, {
"warnings": pattern_warnings,
"suggestions": CardinalityValidator._get_suggestions(
x_column, chart_type, pattern_based=True
),
}
# For non-pattern columns, we could do actual cardinality check
# but that requires DB access - for now just return OK
# In production, you'd want to cache cardinality stats
return True, None
except Exception as e:
logger.warning("Cardinality check failed: %s", e)
# Don't block on validation failures
return True, None
@staticmethod
def _check_column_patterns(
x_column: str, group_by_column: str | None = None
) -> List[str]:
"""Check for known high-cardinality column patterns."""
warnings = []
x_lower = x_column.lower()
# Check X-axis column
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
if pattern in x_lower:
warnings.append(
f"Column '{x_column}' appears to be a high-cardinality field "
f"(contains '{pattern}'). This may create an unreadable chart "
f"with too many categories on the X-axis."
)
break
# Check group_by column if present
if group_by_column:
group_lower = group_by_column.lower()
for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
if pattern in group_lower:
warnings.append(
f"Group by column '{group_by_column}' appears to be a "
f"high-cardinality field (contains '{pattern}'). This may "
f"create too many series to visualize effectively."
)
break
return warnings
@staticmethod
def _get_suggestions(
column: str, chart_type: str, pattern_based: bool = False
) -> List[str]:
"""Get suggestions for handling high cardinality."""
suggestions = []
if pattern_based:
# Suggestions when we detected high-cardinality patterns
if any(p in column.lower() for p in ["id", "uuid", "guid"]):
suggestions.extend(
[
"Consider using a different column for the X-axis",
f"If you need to analyze by {column}, use filters to limit "
f"the data",
"A table chart might be more appropriate for ID-based data",
]
)
elif any(p in column.lower() for p in ["email", "phone", "address"]):
suggestions.extend(
[
"Consider grouping by a higher-level category (e.g., "
"domain for emails)",
f"Use filters to focus on specific {column} values",
"Aggregate the data before visualization",
]
)
elif any(
p in column.lower() for p in ["timestamp", "datetime", "created_at"]
):
suggestions.extend(
[
"Consider truncating timestamps to date or hour level",
"Use time-based grouping (daily, weekly, monthly)",
"Apply date range filters to limit the data",
]
)
else:
# General high-cardinality suggestions
threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
suggestions.extend(
[
f"This chart type works best with fewer than {threshold} "
f"categories",
"Consider using filters to reduce the number of values",
"Try grouping or categorizing the data at a higher level",
"A table or pivot table might better display high-cardinality data",
]
)
return suggestions
@staticmethod
def suggest_chart_type(cardinality: int) -> List[str]:
"""Suggest appropriate chart types based on cardinality."""
if cardinality <= 10:
return ["bar", "pie", "donut", "area"]
elif cardinality <= 30:
return ["bar", "line", "area"]
elif cardinality <= 100:
return ["line", "scatter"]
else:
return ["table", "pivot_table", "heatmap"]

View File

@@ -0,0 +1,437 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Chart type suggestions based on data characteristics and user intent.
"""
import logging
from typing import Any, Dict, List, Tuple
from superset.mcp_service.chart.schemas import (
ChartConfig,
ColumnRef,
TableChartConfig,
XYChartConfig,
)
logger = logging.getLogger(__name__)
class ChartTypeSuggester:
    """
    Name-based heuristics that judge whether a chart type fits the data
    and propose alternatives when it does not.

    Column roles (temporal, categorical, identifier) are inferred from
    substrings of the column name only; no dataset metadata is consulted.
    """

    @staticmethod
    def analyze_and_suggest(
        config: ChartConfig,
        dataset_id: int | str,  # noqa: ARG002
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """
        Analyze chart configuration and suggest better chart types if needed.

        Returns:
            Tuple of (is_appropriate, suggestion_info). ``suggestion_info`` is
            ``None`` when nothing is flagged; otherwise it holds ``"issues"``,
            ``"suggestions"`` and ``"recommended_types"``. Failures are logged
            and reported as appropriate so suggestions never block a chart.
        """
        try:
            if isinstance(config, XYChartConfig):
                return ChartTypeSuggester._analyze_xy_chart(config)
            if isinstance(config, TableChartConfig):
                return ChartTypeSuggester._analyze_table_chart(config)
            return True, None
        except Exception as exc:
            logger.warning("Chart type analysis failed: %s", exc)
            return True, None  # Don't block on suggestion failures

    @staticmethod
    def _analyze_xy_chart(
        config: XYChartConfig,
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """Analyze XY chart appropriateness."""
        x_traits = ChartTypeSuggester._analyze_x_axis(config.x.name)
        y_traits = ChartTypeSuggester._analyze_y_axis(config.y)

        # Kind-specific findings first, then the general hints.
        problems, kind_hints = ChartTypeSuggester._check_chart_type_issues(
            config, x_traits, y_traits
        )
        hints = list(kind_hints)
        hints.extend(ChartTypeSuggester._get_general_suggestions(x_traits, y_traits))

        if not problems:
            return True, None

        return False, {
            "issues": list(problems),
            "suggestions": hints,
            "recommended_types": ChartTypeSuggester._get_recommended_types(
                x_traits["is_temporal"],
                x_traits["is_categorical"],
                y_traits["has_count"],
                y_traits["num_metrics"],
            ),
        }

    @staticmethod
    def _analyze_x_axis(x_name: str) -> Dict[str, Any]:
        """Classify the X-axis column from substrings of its lower-cased name."""
        lowered = x_name.lower()
        temporal_words = (
            "date",
            "time",
            "year",
            "month",
            "day",
            "hour",
            "created",
            "updated",
        )
        categorical_words = (
            "category",
            "type",
            "status",
            "department",
            "region",
            "country",
            "state",
        )
        id_words = ("id", "uuid", "guid", "key")

        return {
            "is_temporal": any(word in lowered for word in temporal_words),
            "is_categorical": any(word in lowered for word in categorical_words),
            "is_id": any(word in lowered for word in id_words),
            "name": x_name,
        }

    @staticmethod
    def _analyze_y_axis(y_columns: List[ColumnRef]) -> Dict[str, Any]:
        """Summarize the Y metrics: whether any is a count, and how many exist."""
        count_aggregates = ["COUNT", "COUNT_DISTINCT"]
        return {
            "has_count": any(col.aggregate in count_aggregates for col in y_columns),
            "num_metrics": len(y_columns),
        }

    @staticmethod
    def _check_chart_type_issues(
        config: XYChartConfig, x_analysis: Dict[str, Any], y_analysis: Dict[str, Any]
    ) -> Tuple[List[str], List[str]]:
        """
        Dispatch to the checker that matches ``config.kind``.

        Returns:
            Tuple of (issues, suggestions); both empty for kinds without a
            dedicated checker.
        """
        temporal = x_analysis["is_temporal"]
        categorical = x_analysis["is_categorical"]
        identifier = x_analysis["is_id"]
        metric_count = y_analysis["num_metrics"]

        kind = config.kind
        if kind == "line":
            return ChartTypeSuggester._check_line_chart_issues(
                config, temporal, categorical, identifier
            )
        if kind == "scatter":
            return ChartTypeSuggester._check_scatter_chart_issues(
                config, categorical, metric_count
            )
        if kind == "area":
            return ChartTypeSuggester._check_area_chart_issues(config, temporal)
        if kind == "bar":
            return ChartTypeSuggester._check_bar_chart_issues(config, identifier)
        return [], []

    @staticmethod
    def _check_line_chart_issues(
        config: XYChartConfig,
        x_is_temporal: bool,
        x_is_categorical: bool,
        x_is_id: bool,
    ) -> Tuple[List[str], List[str]]:
        """Flag line charts drawn over a categorical or identifier X-axis."""
        if x_is_categorical and not x_is_temporal:
            return (
                [
                    f"Line chart with categorical X-axis '{config.x.name}' may not "
                    f"show meaningful trends"
                ],
                [
                    "Consider using a bar chart for categorical comparisons",
                    "Line charts work best with temporal or continuous data",
                ],
            )
        if x_is_id:
            return (
                [
                    f"Line chart with ID field '{config.x.name}' on X-axis will not "
                    f"show meaningful patterns"
                ],
                [
                    "Use a table to display individual records",
                    "Or aggregate the data by a meaningful dimension",
                ],
            )
        return [], []

    @staticmethod
    def _check_scatter_chart_issues(
        config: XYChartConfig, x_is_categorical: bool, num_metrics: int
    ) -> Tuple[List[str], List[str]]:
        """Flag scatter plots with a categorical X-axis or several Y metrics."""
        problems: List[str] = []
        hints: List[str] = []

        if x_is_categorical:
            problems.append(
                f"Scatter plot with categorical X-axis '{config.x.name}' may not "
                f"effectively show correlations"
            )
            hints += [
                "Scatter plots work best with two continuous variables",
                "Consider a bar chart for categorical vs numeric data",
            ]

        if num_metrics > 1:
            problems.append("Scatter plots with multiple Y metrics can be confusing")
            hints += [
                "Consider using only one Y metric for clarity",
                "Or use a line/bar chart to compare multiple metrics",
            ]

        return problems, hints

    @staticmethod
    def _check_area_chart_issues(
        config: XYChartConfig, x_is_temporal: bool
    ) -> Tuple[List[str], List[str]]:
        """Flag area charts over a non-temporal axis or possibly negative metrics."""
        problems: List[str] = []
        hints: List[str] = []

        if not x_is_temporal:
            problems.append(
                f"Area chart with non-temporal X-axis '{config.x.name}' may be "
                f"misleading"
            )
            hints += [
                "Area charts imply cumulative or part-to-whole relationships over "
                "time",
                "Consider a stacked bar chart for categorical data",
            ]

        # Metric names hinting at values that can drop below zero.
        negative_terms = ("loss", "debt", "negative")
        for metric in config.y:
            metric_lower = metric.name.lower()
            if not any(term in metric_lower for term in negative_terms):
                continue
            problems.append(
                f"Area chart with potentially negative values in '{metric.name}' "
                f"can create visual confusion"
            )
            hints += [
                "Use a line chart for data that can go negative",
                "Or ensure all values are positive before using area chart",
            ]

        return problems, hints

    @staticmethod
    def _check_bar_chart_issues(
        config: XYChartConfig, x_is_id: bool
    ) -> Tuple[List[str], List[str]]:
        """Flag bar charts whose X-axis looks like an identifier column."""
        if not x_is_id:
            return [], []
        return (
            [f"Bar chart with ID field '{config.x.name}' may create too many bars"],
            [
                "Consider aggregating by a higher-level category",
                "Or use filters to limit the number of bars displayed",
            ],
        )

    @staticmethod
    def _get_general_suggestions(
        x_analysis: Dict[str, Any], y_analysis: Dict[str, Any]
    ) -> List[str]:
        """Return at most one general hint based on the axis analyses."""
        temporal = x_analysis["is_temporal"]
        categorical = x_analysis["is_categorical"]
        counted = y_analysis["has_count"]
        metric_count = y_analysis["num_metrics"]

        if counted and categorical:
            return [
                "This looks like frequency analysis - bar charts work well for counts "
                "by category"
            ]
        if temporal and metric_count == 1:
            return [
                "Single metric over time - line charts are ideal for showing trends"
            ]
        if temporal and metric_count > 3:
            return [
                "Many metrics over time - consider focusing on 2-3 key metrics for "
                "clarity"
            ]
        return []

    @staticmethod
    def _analyze_table_chart(
        config: TableChartConfig,
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """
        Analyze table chart appropriateness.

        Issues are raised for metric-dominated tables and for tables wider
        than ten columns; an ID-heavy table only contributes suggestions.
        """
        problems: List[str] = []
        hints: List[str] = []

        columns = config.columns
        column_total = len(columns)
        metric_total = sum(1 for col in columns if col.aggregate)
        # Every column is either aggregated or raw.
        raw_total = column_total - metric_total

        # Mostly metrics with few dimensions - could be visualized
        if metric_total > 0 and raw_total <= 2:
            problems.append(
                "Table with mostly aggregated metrics could be visualized as a chart"
            )
            hints.append("Consider a bar chart to compare metric values visually")
            hints.append("Or use a line chart if there's a time dimension")

        # Check for ID-heavy tables
        id_words = ("id", "uuid", "guid", "key")
        id_total = sum(
            1 for col in columns if any(word in col.name.lower() for word in id_words)
        )
        if id_total > column_total / 2:
            hints.append(
                "Table appears to be ID-heavy - ensure this is for detailed record "
                "inspection"
            )
            hints.append(
                "For analysis, consider aggregating by meaningful dimensions instead"
            )

        # Very wide tables
        if column_total > 10:
            problems.append(
                f"Table with {column_total} columns may be difficult to read"
            )
            hints.append("Consider showing only the most important columns")
            hints.append("Or break into multiple focused views")

        if not problems:
            return True, None

        if metric_total > 0:
            recommended = ["table", "pivot_table"]
        else:
            recommended = ["table"]
        return False, {
            "issues": problems,
            "suggestions": hints,
            "recommended_types": recommended,
        }

    @staticmethod
    def _get_recommended_types(
        x_is_temporal: bool, x_is_categorical: bool, has_count: bool, num_metrics: int
    ) -> List[str]:
        """Get recommended chart types based on data characteristics."""
        if x_is_temporal:
            picks = ["line", "area", "bar"]
            if num_metrics == 1:
                picks.append("scatter")  # For trend analysis
        elif x_is_categorical:
            picks = ["bar", "table"]
            if has_count and num_metrics == 1:
                picks.append("pie")  # For proportion analysis
        else:
            # Continuous or unclear X-axis
            picks = ["scatter", "line", "table"]

        # Always include table as fallback
        if "table" not in picks:
            picks.append("table")
        return picks

    @staticmethod
    def get_chart_type_description(chart_type: str) -> str:
        """Get a description of when to use each chart type."""
        known = {
            "line": "Best for showing trends over time or continuous data",
            "bar": "Ideal for comparing values across categories",
            "area": "Shows cumulative totals and part-to-whole relationships over time",
            "scatter": "Reveals correlations between two continuous variables",
            "table": "Displays detailed data or many dimensions at once",
            "pie": "Shows proportions of a whole (use sparingly, max 5-7 slices)",
            "pivot_table": "Summarizes data across multiple dimensions",
        }
        if chart_type in known:
            return known[chart_type]
        return f"Visualizes data using {chart_type} format"

View File

@@ -0,0 +1,225 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Format-type compatibility validation to prevent misleading data presentation.
"""
import logging
import re
from typing import List, Tuple
from superset.mcp_service.chart.schemas import ColumnRef, XYChartConfig
logger = logging.getLogger(__name__)
class FormatTypeValidator:
    """
    Validates that format strings are appropriate for the data type and aggregation.
    Prevents issues like currency formatting on COUNT data or percentage on absolute
    values.

    Every pattern list below holds regular expressions that are applied with
    ``re.search``. A pattern must therefore never be empty: an empty regex
    matches every string and would classify all formats as that category.
    """

    # Format patterns and their appropriate uses
    CURRENCY_PATTERNS = [
        r"\$",  # Dollar sign
        # Euro sign (U+20AC), written as an escape so the pattern cannot be
        # silently lost to an encoding problem and degrade into an empty regex.
        r"\u20ac",
        r"£",  # Pound
        r"¥",  # Yen
        r"[,.]2f",  # Two decimal places (common for currency)
        r"\$[,.]",  # Dollar with thousands separator
    ]

    PERCENTAGE_PATTERNS = [
        r"%",  # Percentage sign
        r"\.0%",  # Percentage with no decimals
        r"\.1%",  # Percentage with 1 decimal
        r"\.2%",  # Percentage with 2 decimals
    ]

    INTEGER_PATTERNS = [
        r"\.0f",  # No decimals
        r",d",  # Integer with thousands separator
        r"[,.]0f",  # Integer format variations
    ]

    @staticmethod
    def validate_format_compatibility(
        config: XYChartConfig,
    ) -> Tuple[bool, List[str] | None]:
        """
        Validate that axis formats are appropriate for the data types.

        Args:
            config: XY chart configuration; an axis is checked only when its
                axis config exists and carries a non-empty ``format``.

        Returns:
            Tuple of (is_valid, warnings_list); ``warnings_list`` is ``None``
            when there are no warnings.
        """
        warnings: List[str] = []

        # Validate Y-axis format against metrics
        if config.y_axis and config.y_axis.format:
            warnings.extend(
                FormatTypeValidator._validate_y_axis_format(
                    config.y_axis.format, config.y
                )
            )

        # Validate X-axis format (usually temporal or categorical)
        if config.x_axis and config.x_axis.format:
            warnings.extend(
                FormatTypeValidator._validate_x_axis_format(
                    config.x_axis.format, config.x
                )
            )

        return not warnings, warnings or None

    @staticmethod
    def _validate_y_axis_format(
        format_string: str, y_columns: List[ColumnRef]
    ) -> List[str]:
        """Run the currency, percentage and decimal checks against the metrics."""
        return [
            *FormatTypeValidator._check_currency_format_issues(
                format_string, y_columns
            ),
            *FormatTypeValidator._check_percentage_format_issues(
                format_string, y_columns
            ),
            *FormatTypeValidator._check_decimal_format_issues(
                format_string, y_columns
            ),
        ]

    @staticmethod
    def _check_currency_format_issues(
        format_string: str, y_columns: List[ColumnRef]
    ) -> List[str]:
        """Warn for each COUNT-style metric displayed with a currency format."""
        if not FormatTypeValidator._is_currency_format(format_string):
            return []
        return [
            f"Currency format '{format_string}' applied to {col.aggregate} "
            f"of '{col.name}'. COUNT operations return whole numbers, not "
            f"currency values. Consider using integer format like ',"
            f"d' instead."
            for col in y_columns
            if col.aggregate in ["COUNT", "COUNT_DISTINCT"]
        ]

    @staticmethod
    def _check_percentage_format_issues(
        format_string: str, y_columns: List[ColumnRef]
    ) -> List[str]:
        """Warn for each SUM/COUNT-style metric displayed with a percentage format."""
        warnings: List[str] = []
        if not FormatTypeValidator._is_percentage_format(format_string):
            return warnings

        for col in y_columns:
            if col.aggregate not in ["SUM", "COUNT", "COUNT_DISTINCT"]:
                continue
            # Fall back to "AGG(name)" when the column has no explicit label.
            label = col.label or f"{col.aggregate}({col.name})"
            warnings.append(
                f"Percentage format '{format_string}' applied to "
                f"{col.aggregate} of '{col.name}'. This will multiply values "
                f"by 100 and add %. "
                f"If '{label}' contains absolute values (not ratios 0-1), "
                f"consider using a numeric format instead."
            )
        return warnings

    @staticmethod
    def _check_decimal_format_issues(
        format_string: str, y_columns: List[ColumnRef]
    ) -> List[str]:
        """Warn for each COUNT-style metric displayed with one or more decimals."""
        # Cheap pre-filter before running the regex.
        if "." not in format_string or not any(ch.isdigit() for ch in format_string):
            return []

        # None (no ".Nf" found) and 0 decimals both mean there is nothing to flag.
        if not FormatTypeValidator._get_decimal_places(format_string):
            return []

        return [
            f"Decimal format '{format_string}' applied to "
            f"{col.aggregate} of '{col.name}'. COUNT operations "
            f"always return "
            f"integers. Consider using integer format like ',"
            f"d' or '.0f' instead."
            for col in y_columns
            if col.aggregate in ["COUNT", "COUNT_DISTINCT"]
        ]

    @staticmethod
    def _validate_x_axis_format(format_string: str, x_column: ColumnRef) -> List[str]:
        """
        Validate X-axis format appropriateness.

        Returns:
            A single warning for a currency format, otherwise a single warning
            for a percentage format, otherwise an empty list.
        """
        # Currency format on X-axis is almost always wrong
        if FormatTypeValidator._is_currency_format(format_string):
            return [
                f"Currency format '{format_string}' applied to X-axis '"
                f"{x_column.name}'. "
                f"X-axis typically shows categories, time, or dimensions, "
                f"not currency. "
                f"Consider removing the format or using a date/category format."
            ]
        # Percentage format on X-axis is unusual
        if FormatTypeValidator._is_percentage_format(format_string):
            return [
                f"Percentage format '{format_string}' applied to X-axis '"
                f"{x_column.name}'. "
                f"This is unusual for axis labels. Consider if this is intentional."
            ]
        return []

    @staticmethod
    def _is_currency_format(format_string: str) -> bool:
        """Check if format string represents currency."""
        return any(
            re.search(pattern, format_string, re.IGNORECASE)
            for pattern in FormatTypeValidator.CURRENCY_PATTERNS
        )

    @staticmethod
    def _is_percentage_format(format_string: str) -> bool:
        """Check if format string represents percentage."""
        return any(
            re.search(pattern, format_string)
            for pattern in FormatTypeValidator.PERCENTAGE_PATTERNS
        )

    @staticmethod
    def _get_decimal_places(format_string: str) -> int | None:
        """Extract number of decimal places from format string, or None if absent."""
        if match := re.search(r"\.(\d+)f", format_string):
            return int(match.group(1))
        return None

    @staticmethod
    def suggest_format(column: ColumnRef) -> str:
        """Suggest appropriate format based on column and aggregation."""
        if column.aggregate in ["COUNT", "COUNT_DISTINCT"]:
            return ",d"  # Integer with thousands separator
        if column.aggregate in ["AVG", "STDDEV", "VAR"]:
            return ",.2f"  # Two decimals for statistical measures
        if column.aggregate in ["SUM", "MIN", "MAX"]:
            # Could be currency or regular number, default to flexible
            return ",.2f"  # Two decimals with thousands separator
        return ""  # Let Superset decide