# superset2/superset/mcp_service/chart/validation/runtime/cardinality_validator.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Cardinality validation to prevent unusable visualizations from high-cardinality data.
"""

import logging
import re
from typing import Any, Dict, List, Tuple

# Module-level logger; used to report cardinality checks that fail unexpectedly.
logger = logging.getLogger(__name__)


class CardinalityValidator:
    """
    Validates cardinality of dimensions to prevent charts with too many categories
    that become unreadable or cause performance issues.
    """

    # Thresholds for different chart types
    CARDINALITY_THRESHOLDS = {
        "bar": 50,  # Bar charts become unreadable with >50 bars
        "line": 100,  # Line charts can handle more points
        "scatter": 500,  # Scatter plots can show many points
        "area": 30,  # Area charts need fewer categories
        "table": 1000,  # Tables can handle many rows with pagination
        "default": 50,  # Conservative default
    }

    # Known high-cardinality column patterns
    HIGH_CARDINALITY_PATTERNS = [
        "id",
        "uuid",
        "guid",
        "email",
        "phone",
        "address",
        "session",
        "transaction",
        "order_number",
        "invoice",
        "timestamp",
        "datetime",
        "created_at",
        "updated_at",
    ]

    @staticmethod
    def check_cardinality(
        dataset_id: int | str,
        x_column: str,
        chart_type: str = "default",
        group_by_column: str | None = None,
    ) -> Tuple[bool, Dict[str, Any] | None]:
        """
        Check cardinality of X-axis and group_by columns.

        Returns:
            Tuple of (is_ok, warning_info)
        """
        try:
            # Quick pattern check first (no DB query needed)
            pattern_warnings = CardinalityValidator._check_column_patterns(
                x_column, group_by_column
            )

            if pattern_warnings:
                return False, {
                    "warnings": pattern_warnings,
                    "suggestions": CardinalityValidator._get_suggestions(
                        x_column, chart_type, pattern_based=True
                    ),
                }

            # For non-pattern columns, we could do actual cardinality check
            # but that requires DB access - for now just return OK
            # In production, you'd want to cache cardinality stats

            return True, None

        except Exception as e:
            logger.warning("Cardinality check failed: %s", e)
            # Don't block on validation failures
            return True, None

    @staticmethod
    def _check_column_patterns(
        x_column: str, group_by_column: str | None = None
    ) -> List[str]:
        """Check for known high-cardinality column patterns."""
        warnings = []

        x_lower = x_column.lower()

        # Check X-axis column
        for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
            if pattern in x_lower:
                warnings.append(
                    f"Column '{x_column}' appears to be a high-cardinality field "
                    f"(contains '{pattern}'). This may create an unreadable chart "
                    f"with too many categories on the X-axis."
                )
                break

        # Check group_by column if present
        if group_by_column:
            group_lower = group_by_column.lower()
            for pattern in CardinalityValidator.HIGH_CARDINALITY_PATTERNS:
                if pattern in group_lower:
                    warnings.append(
                        f"Group by column '{group_by_column}' appears to be a "
                        f"high-cardinality field (contains '{pattern}'). This may "
                        f"create too many series to visualize effectively."
                    )
                    break

        return warnings

    @staticmethod
    def _get_suggestions(
        column: str, chart_type: str, pattern_based: bool = False
    ) -> List[str]:
        """Get suggestions for handling high cardinality."""
        suggestions = []

        if pattern_based:
            # Suggestions when we detected high-cardinality patterns
            if any(p in column.lower() for p in ["id", "uuid", "guid"]):
                suggestions.extend(
                    [
                        "Consider using a different column for the X-axis",
                        f"If you need to analyze by {column}, use filters to limit "
                        f"the data",
                        "A table chart might be more appropriate for ID-based data",
                    ]
                )
            elif any(p in column.lower() for p in ["email", "phone", "address"]):
                suggestions.extend(
                    [
                        "Consider grouping by a higher-level category (e.g., "
                        "domain for emails)",
                        f"Use filters to focus on specific {column} values",
                        "Aggregate the data before visualization",
                    ]
                )
            elif any(
                p in column.lower() for p in ["timestamp", "datetime", "created_at"]
            ):
                suggestions.extend(
                    [
                        "Consider truncating timestamps to date or hour level",
                        "Use time-based grouping (daily, weekly, monthly)",
                        "Apply date range filters to limit the data",
                    ]
                )
        else:
            # General high-cardinality suggestions
            threshold = CardinalityValidator.CARDINALITY_THRESHOLDS.get(chart_type, 50)
            suggestions.extend(
                [
                    f"This chart type works best with fewer than {threshold} "
                    f"categories",
                    "Consider using filters to reduce the number of values",
                    "Try grouping or categorizing the data at a higher level",
                    "A table or pivot table might better display high-cardinality data",
                ]
            )

        return suggestions

    @staticmethod
    def suggest_chart_type(cardinality: int) -> List[str]:
        """Suggest appropriate chart types based on cardinality."""
        if cardinality <= 10:
            return ["bar", "pie", "donut", "area"]
        elif cardinality <= 30:
            return ["bar", "line", "area"]
        elif cardinality <= 100:
            return ["line", "scatter"]
        else:
            return ["table", "pivot_table", "heatmap"]