Files
superset2/superset/commands/dashboard/export_example.py
Evan Rusackas 87bbd54d0a feat(examples): Transpile virtual dataset SQL on import (#37311)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Beto Dealmeida <roberto@dealmeida.net>
Co-authored-by: bito-code-review[bot] <188872107+bito-code-review[bot]@users.noreply.github.com>
2026-01-22 09:50:05 -08:00

673 lines
25 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Command to export a dashboard as an example bundle.
This creates an example-ready structure that can be committed to
superset/examples/ and loaded via the example loading system.
"""
from __future__ import annotations
import logging
from collections.abc import Iterator
from io import BytesIO
from typing import Any, Callable, TYPE_CHECKING
import yaml
from superset.commands.base import BaseCommand
from superset.commands.dashboard.exceptions import DashboardNotFoundError
from superset.daos.dashboard import DashboardDAO
if TYPE_CHECKING:
from superset.connectors.sqla.models import SqlaTable
from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.sql.parse import SQLStatement, Table
# Module-level logger; all export progress and diagnostics go through this.
logger = logging.getLogger(__name__)
# Canonical UUID for the examples database.  Exported dataset YAML references
# it as ``database_uuid`` so imports bind to the examples database.
EXAMPLES_DATABASE_UUID = "a2dc77af-e654-49bb-b321-40f6b559a1ee"
# ASF license header for generated YAML files; prepended verbatim to every
# YAML artifact so committed example bundles stay license-compliant.
YAML_LICENSE_HEADER = """\
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
def sanitize_filename(name: str) -> str:
    """Turn an arbitrary display name into a filesystem-safe filename.

    Characters outside alphanumerics and ``._-`` become underscores; runs of
    underscores are collapsed and leading/trailing underscores are stripped.
    """
    cleaned_chars = []
    for ch in name:
        cleaned_chars.append(ch if ch.isalnum() or ch in "._-" else "_")
    cleaned = "".join(cleaned_chars)
    # Collapse the runs produced by consecutive disallowed characters.
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")
    return cleaned.strip("_")
def get_referenced_tables(sql: str, engine: str = "base") -> set[Table]:
    """Extract table references from SQL using Superset's SQL parser.

    Args:
        sql: The SQL query to parse
        engine: The database engine/dialect (e.g., "postgresql", "mysql")

    Returns:
        Set of Table objects referenced in the SQL; empty when the SQL
        cannot be parsed (callers treat that as "materialize to be safe").
    """
    try:
        # Runtime import: the module-level import of SQLStatement is guarded
        # by TYPE_CHECKING, so referencing it here without this import would
        # raise NameError — swallowed by the except below, silently forcing
        # every virtual dataset to be materialized.
        from superset.sql.parse import (  # pylint: disable=import-outside-toplevel
            SQLStatement,
        )

        statement = SQLStatement(sql, engine=engine)
        return statement.tables
    except Exception as e:  # pylint: disable=broad-except
        logger.warning("Could not parse SQL to extract tables: %s", e)
        return set()
def is_virtual_dataset(dataset: SqlaTable) -> bool:
    """Return True when *dataset* is virtual (defined by a SQL query) rather
    than physical (backed directly by a database table)."""
    return True if dataset.sql else False
def can_preserve_virtual_dataset(
    dataset: SqlaTable,
    physical_tables: set[str],
    engine: str = "base",
) -> bool:
    """Decide whether a virtual dataset's SQL can be kept intact on export.

    The SQL is preserved only when every table it references is a physical
    table included in this export (and therefore available after import as a
    Parquet-backed table); otherwise the dataset must be materialized.

    Args:
        dataset: The virtual dataset to check
        physical_tables: Set of physical table names being exported
        engine: The database engine/dialect for SQL parsing

    Returns:
        True if the virtual dataset can be preserved with its SQL intact
    """
    sql = dataset.sql
    if not sql:
        # Physical dataset: there is no SQL to preserve.
        return False

    referenced = get_referenced_tables(sql, engine)
    if not referenced:
        # Unparseable SQL or no tables found — materializing is the safe path.
        logger.info(
            "Could not determine dependencies for %s, will materialize",
            dataset.table_name,
        )
        return False

    # Compare by bare table name; schemas are normalized away on export.
    external = [t.table for t in referenced if t.table not in physical_tables]
    if external:
        logger.info(
            "Virtual dataset %s references external table %s, will materialize",
            dataset.table_name,
            external[0],
        )
        return False

    logger.info(
        "Virtual dataset %s can be preserved (references: %s)",
        dataset.table_name,
        ", ".join(t.table for t in referenced),
    )
    return True
def export_dataset_yaml(
    dataset: SqlaTable,
    data_file: str | None = None,
    preserve_virtual: bool = False,
) -> dict[str, Any]:
    """Serialize a dataset into the example-bundle YAML structure.

    Args:
        dataset: The dataset to export
        data_file: Optional explicit parquet filename (for physical datasets)
        preserve_virtual: If True and dataset is virtual, keep the SQL query
            instead of converting to physical with data_file
    """
    # Virtual datasets keep their SQL only when the caller says every
    # dependency is also part of this export.
    keep_sql = bool(preserve_virtual and dataset.sql)

    metric_configs = [
        {
            "metric_name": metric.metric_name,
            "verbose_name": metric.verbose_name,
            "metric_type": metric.metric_type,
            "expression": metric.expression,
            "description": metric.description,
            "d3format": metric.d3format,
            "currency": metric.currency,
            "extra": metric.extra,
            "warning_text": metric.warning_text,
        }
        for metric in dataset.metrics
    ]
    column_configs = [
        {
            "column_name": column.column_name,
            "verbose_name": column.verbose_name,
            "is_dttm": column.is_dttm,
            "is_active": column.is_active,
            "type": column.type,
            "advanced_data_type": column.advanced_data_type,
            "groupby": column.groupby,
            "filterable": column.filterable,
            "expression": column.expression,
            "description": column.description,
            "python_date_format": column.python_date_format,
            "extra": column.extra,
        }
        for column in dataset.columns
    ]

    return {
        "table_name": dataset.table_name,
        # Preserved virtual datasets query other tables - no data file.
        "data_file": None if keep_sql else data_file,
        "main_dttm_col": dataset.main_dttm_col,
        "description": dataset.description,
        "default_endpoint": dataset.default_endpoint,
        "offset": dataset.offset,
        "cache_timeout": dataset.cache_timeout,
        "catalog": dataset.catalog,
        "schema": None,  # Don't export - use target database's default schema
        # SQL is kept only for preserved virtual datasets; physical data
        # lives in the parquet file instead.
        "sql": dataset.sql if keep_sql else None,
        # Source engine lets the importer transpile the SQL dialect.
        "source_db_engine": (
            dataset.database.db_engine_spec.engine if keep_sql else None
        ),
        "params": None,  # Don't export - contains stale import metadata
        "template_params": dataset.template_params,
        "filter_select_enabled": dataset.filter_select_enabled,
        "fetch_values_predicate": dataset.fetch_values_predicate,
        "extra": dataset.extra,
        "normalize_columns": dataset.normalize_columns,
        "always_filter_main_dttm": dataset.always_filter_main_dttm,
        "folders": None,
        "uuid": str(dataset.uuid),
        "metrics": metric_configs,
        "columns": column_configs,
        "version": "1.0.0",
        "database_uuid": EXAMPLES_DATABASE_UUID,
    }
def export_dataset_data(
    dataset: SqlaTable,
    sample_rows: int | None = None,
) -> bytes | None:
    """Export dataset data to Parquet format. Returns bytes or None on failure.

    Args:
        dataset: The dataset whose rows should be exported. For virtual
            datasets its own SQL is executed; for physical datasets a plain
            SELECT over its non-computed columns is built.
        sample_rows: Optional cap; when set and the result is larger, only
            the first ``sample_rows`` rows are exported.

    Returns:
        The Parquet file content as bytes, or None when the dataset has no
        database, no exportable columns, or the query/serialization fails.
    """
    import pandas as pd  # pylint: disable=import-outside-toplevel
    from superset import db  # pylint: disable=import-outside-toplevel
    # Ensure dataset is attached to session and relationships are loaded
    if dataset not in db.session:
        dataset = db.session.merge(dataset)
    # Force load the database and columns relationships by accessing them
    _ = dataset.database
    _ = dataset.columns
    if not dataset.database:
        logger.warning("Dataset %s has no database", dataset.table_name)
        return None
    try:
        logger.info("Exporting data for %s to Parquet...", dataset.table_name)
        # Check if this is a virtual dataset (SQL-based)
        if dataset.sql:
            sql = dataset.sql
        else:
            # For physical tables, build SELECT query from columns.
            # Columns with an expression are computed, not stored, so they
            # are skipped here.
            columns = [col.column_name for col in dataset.columns if not col.expression]
            if not columns:
                logger.warning("No columns to export for %s", dataset.table_name)
                return None
            # Build simple SELECT query (quote identifiers to handle spaces/keywords)
            # NOTE(review): double-quote quoting assumes an ANSI-compliant
            # dialect — confirm against the supported example databases.
            column_list = ", ".join(f'"{c}"' for c in columns)
            quoted_table = f'"{dataset.table_name}"'
            if dataset.schema:
                table_ref = f'"{dataset.schema}".{quoted_table}'
            else:
                table_ref = quoted_table
            sql = f"SELECT {column_list} FROM {table_ref}"  # noqa: S608
        with dataset.database.get_sqla_engine() as engine:
            df = pd.read_sql(sql, engine)
        if sample_rows and len(df) > sample_rows:
            df = df.head(sample_rows)
            logger.info("Sampled to %d rows", sample_rows)
        # Write to bytes buffer
        buf = BytesIO()
        df.to_parquet(buf, index=False)
        buf.seek(0)
        logger.info("Exported %d rows for %s", len(df), dataset.table_name)
        return buf.getvalue()
    except Exception as e:
        # Best-effort export: log with traceback and let the caller skip
        # the data file rather than aborting the whole bundle.
        logger.exception("Could not export data for %s: %s", dataset.table_name, e)
        return None
def export_chart(chart: Slice, dataset_uuid: str) -> dict[str, Any]:
    """Serialize a chart into the example-bundle YAML structure.

    The chart is tied to its dataset via *dataset_uuid*; ``query_context``
    is deliberately dropped because it embeds instance-local IDs.
    """
    chart_params = getattr(chart, "params_dict", {})
    config: dict[str, Any] = {
        "slice_name": chart.slice_name,
        "description": chart.description,
        "certified_by": chart.certified_by,
        "certification_details": chart.certification_details,
        "viz_type": chart.viz_type,
        "params": chart_params,
        "query_context": None,  # Don't include - contains stale IDs
        "cache_timeout": chart.cache_timeout,
        "uuid": str(chart.uuid),
        "version": "1.0.0",
        "dataset_uuid": dataset_uuid,
    }
    return config
def remap_native_filters(
    filters: list[dict[str, Any]],
    chart_id_to_uuid: dict[int, str],
    dataset_id_to_uuid: dict[int, str],
) -> list[dict[str, Any]]:
    """Rewrite native-filter configuration to reference UUIDs, not numeric IDs.

    ``chartsInScope`` entries are mapped through *chart_id_to_uuid* (unmapped
    IDs pass through unchanged) and each target's ``datasetId`` is removed
    and replaced by ``datasetUuid`` when a mapping exists. Input dicts are
    not mutated.
    """
    result: list[dict[str, Any]] = []
    for filter_config in filters:
        updated = dict(filter_config)
        if "chartsInScope" in updated:
            updated["chartsInScope"] = [
                chart_id_to_uuid.get(chart_id, chart_id)
                for chart_id in updated["chartsInScope"]
            ]
        if "targets" in updated:
            remapped_targets = []
            for original_target in updated["targets"]:
                target = dict(original_target)
                if "datasetId" in target:
                    # datasetId is always stripped; the UUID is added only
                    # when we know the mapping.
                    ds_id = target.pop("datasetId")
                    if ds_id in dataset_id_to_uuid:
                        target["datasetUuid"] = dataset_id_to_uuid[ds_id]
                remapped_targets.append(target)
            updated["targets"] = remapped_targets
        result.append(updated)
    return result
def remap_chart_configuration(
    chart_config: dict[str, Any],
    chart_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Re-key cross-filter chart configuration by chart UUID instead of ID.

    Entries whose numeric ID has no UUID mapping are dropped; nested
    ``crossFilters.chartsInScope`` IDs are mapped where possible (unmapped
    IDs pass through). Input dicts are not mutated.
    """
    result: dict[str, Any] = {}
    for raw_id, config in chart_config.items():
        numeric_id = int(raw_id)
        chart_uuid = chart_id_to_uuid.get(numeric_id)
        if chart_uuid is None:
            # Chart not part of this export - drop its configuration.
            continue
        entry = dict(config)
        entry["id"] = chart_uuid
        if "chartsInScope" in entry.get("crossFilters", {}):
            cross = dict(entry["crossFilters"])
            cross["chartsInScope"] = [
                chart_id_to_uuid.get(cid, cid) for cid in cross["chartsInScope"]
            ]
            entry["crossFilters"] = cross
        result[chart_uuid] = entry
    return result
def remap_global_chart_configuration(
    global_config: dict[str, Any],
    chart_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Return a copy of the global chart configuration with any
    ``chartsInScope`` IDs swapped for UUIDs (unmapped IDs pass through)."""
    updated = dict(global_config)
    if "chartsInScope" in updated:
        updated["chartsInScope"] = [
            chart_id_to_uuid.get(cid, cid) for cid in updated["chartsInScope"]
        ]
    return updated
def export_dashboard_yaml(
    dashboard: Dashboard,
    chart_id_to_uuid: dict[int, str],
    dataset_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Serialize a dashboard into the example-bundle YAML structure.

    Numeric chart and dataset IDs embedded in the position layout, native
    filters, and cross-filter configuration are remapped to UUIDs so the
    exported bundle is portable across instances.
    """
    from superset.utils import (
        json as superset_json,  # pylint: disable=import-outside-toplevel
    )

    # Annotate layout entries with chart UUIDs. Copies are shallow, so the
    # nested "meta" dict is updated in place (matching existing behavior).
    remapped_position: dict[str, Any] = {}
    for key, node in (dashboard.position or {}).items():
        if not isinstance(node, dict):
            remapped_position[key] = node
            continue
        node_copy = node.copy()
        meta = node_copy.get("meta", {})
        if "meta" in node_copy and "chartId" in meta:
            chart_id = meta["chartId"]
            if chart_id in chart_id_to_uuid:
                node_copy["meta"]["uuid"] = chart_id_to_uuid[chart_id]
        remapped_position[key] = node_copy

    # Parse json_metadata; fall back to empty on any parse failure.
    raw_metadata: dict[str, Any] = {}
    if dashboard.json_metadata:
        try:
            raw_metadata = superset_json.loads(dashboard.json_metadata)
        except Exception:  # pylint: disable=broad-except
            logger.debug("Could not parse json_metadata")

    metadata: dict[str, Any] = {
        "timed_refresh_immune_slices": raw_metadata.get(
            "timed_refresh_immune_slices", []
        ),
        "expanded_slices": raw_metadata.get("expanded_slices", {}),
        "refresh_frequency": raw_metadata.get("refresh_frequency", 0),
        "default_filters": raw_metadata.get("default_filters", "{}"),
        "color_scheme": raw_metadata.get("color_scheme", ""),
        "label_colors": raw_metadata.get("label_colors", {}),
        "native_filter_configuration": remap_native_filters(
            raw_metadata.get("native_filter_configuration", []),
            chart_id_to_uuid,
            dataset_id_to_uuid,
        ),
        "shared_label_colors": raw_metadata.get("shared_label_colors", []),
        "map_label_colors": raw_metadata.get("map_label_colors", {}),
        "color_scheme_domain": raw_metadata.get("color_scheme_domain", []),
        "cross_filters_enabled": raw_metadata.get("cross_filters_enabled", False),
        "chart_configuration": remap_chart_configuration(
            raw_metadata.get("chart_configuration", {}), chart_id_to_uuid
        ),
        "global_chart_configuration": remap_global_chart_configuration(
            raw_metadata.get("global_chart_configuration", {}), chart_id_to_uuid
        ),
    }

    return {
        "dashboard_title": dashboard.dashboard_title,
        "description": dashboard.description,
        "css": dashboard.css,
        "slug": dashboard.slug,
        "certified_by": dashboard.certified_by,
        "certification_details": dashboard.certification_details,
        "published": dashboard.published,
        "uuid": str(dashboard.uuid),
        "position": remapped_position,
        "metadata": metadata,
        "version": "1.0.0",
    }
def _make_yaml_generator(config: dict[str, Any]) -> Callable[[], bytes]:
    """Build a zero-argument callable producing the ASF-licensed YAML bytes
    for *config* (rendered eagerly, encoded per call)."""
    rendered = YAML_LICENSE_HEADER + yaml.safe_dump(
        config, default_flow_style=False, allow_unicode=True
    )

    def generate() -> bytes:
        return rendered.encode("utf-8")

    return generate
def _make_bytes_generator(data: bytes) -> Callable[[], bytes]:
"""Create a generator function for raw bytes content."""
return lambda: data
class ExportExampleCommand(BaseCommand):
    """Export dashboard as an example bundle with Parquet data and YAML configs.

    Output structure for single dataset:
        data.parquet    - Raw data
        dataset.yaml    - Dataset metadata
        dashboard.yaml  - Dashboard definition
        charts/*.yaml   - Chart definitions

    Output structure for multiple datasets:
        data/*.parquet  - Raw data files
        datasets/*.yaml - Dataset metadata files
        dashboard.yaml  - Dashboard definition
        charts/*.yaml   - Chart definitions
    """

    def __init__(
        self,
        dashboard_id: int,
        export_data: bool = True,
        sample_rows: int | None = None,
    ):
        """
        Args:
            dashboard_id: ID of the dashboard to export
            export_data: When False, only YAML metadata is emitted (no Parquet)
            sample_rows: Optional cap on exported rows per dataset
        """
        self._dashboard_id = dashboard_id
        self._export_data = export_data
        self._sample_rows = sample_rows
        self._dashboard: Dashboard | None = None

    def validate(self) -> None:
        """Load the dashboard; raise DashboardNotFoundError if it is missing."""
        self._dashboard = DashboardDAO.find_by_id(self._dashboard_id)
        if not self._dashboard:
            raise DashboardNotFoundError()

    def run(self) -> Iterator[tuple[str, Callable[[], bytes]]]:  # noqa: C901
        """Yield (filename, content_generator) tuples for ZIP packaging.

        Content generators return bytes (either YAML encoded or raw Parquet).
        """
        self.validate()
        assert self._dashboard is not None

        # Collect all charts and their datasets
        charts = self._dashboard.slices
        datasets: dict[int, SqlaTable] = {}
        chart_id_to_uuid: dict[int, str] = {}
        chart_to_dataset_uuid: dict[int, str] = {}
        for chart in charts:
            chart_id_to_uuid[chart.id] = str(chart.uuid)
            if chart.datasource:
                datasets[chart.datasource.id] = chart.datasource
                chart_to_dataset_uuid[chart.id] = str(chart.datasource.uuid)

        # Build dataset ID to UUID mapping
        dataset_id_to_uuid: dict[int, str] = {
            ds_id: str(ds.uuid) for ds_id, ds in datasets.items()
        }
        logger.info("Found %d charts and %d datasets", len(charts), len(datasets))

        # Classify datasets: physical vs virtual.
        # Physical datasets need Parquet export; virtual datasets with all
        # dependencies in the export can preserve their SQL.
        physical_datasets: dict[int, SqlaTable] = {}
        virtual_datasets: dict[int, SqlaTable] = {}
        for ds_id, dataset in datasets.items():
            if is_virtual_dataset(dataset):
                virtual_datasets[ds_id] = dataset
            else:
                physical_datasets[ds_id] = dataset

        # Get the set of physical table names for dependency checking
        physical_table_names = {ds.table_name for ds in physical_datasets.values()}

        # Determine which virtual datasets can be preserved vs materialized.
        preserved_virtual: dict[int, SqlaTable] = {}
        materialized_virtual: dict[int, SqlaTable] = {}

        # Get database engine for SQL parsing (use first dataset's database)
        db_engine = "base"
        if datasets:
            first_dataset = next(iter(datasets.values()))
            if first_dataset.database:
                db_engine = first_dataset.database.backend or "base"

        for ds_id, dataset in virtual_datasets.items():
            if can_preserve_virtual_dataset(dataset, physical_table_names, db_engine):
                preserved_virtual[ds_id] = dataset
            else:
                materialized_virtual[ds_id] = dataset

        # Log classification summary
        logger.info(
            "Dataset classification: %d physical, %d virtual preserved, "
            "%d virtual materialized",
            len(physical_datasets),
            len(preserved_virtual),
            len(materialized_virtual),
        )

        # Datasets that need Parquet export = physical + materialized virtual
        datasets_needing_data = {**physical_datasets, **materialized_virtual}

        # Build unique filenames for datasets (handle table_name collisions)
        dataset_filenames: dict[int, str] = {}
        seen_table_names: dict[str, int] = {}  # table_name -> first dataset_id
        for ds_id, dataset in datasets.items():
            table_name = dataset.table_name
            if table_name in seen_table_names:
                # Collision! Use UUID suffix for uniqueness
                uuid_suffix = str(dataset.uuid)[:8]
                filename = f"{table_name}-{uuid_suffix}"
                logger.info(
                    "Table name collision for '%s', using '%s'", table_name, filename
                )
            else:
                filename = table_name
                seen_table_names[table_name] = ds_id
            dataset_filenames[ds_id] = filename

        # Export datasets
        multi_dataset = len(datasets) > 1
        if multi_dataset:
            # Multiple datasets: use datasets/ and data/ folders, named by the
            # collision-free filename computed above.
            for ds_id, dataset in datasets.items():
                filename = dataset_filenames[ds_id]
                needs_data = ds_id in datasets_needing_data
                is_preserved = ds_id in preserved_virtual
                data_file = f"{filename}.parquet" if needs_data else None
                # Export YAML
                dataset_config = export_dataset_yaml(
                    dataset,
                    data_file=data_file,
                    preserve_virtual=is_preserved,
                )
                yield (
                    f"datasets/{filename}.yaml",
                    _make_yaml_generator(dataset_config),
                )
                # Export data only for datasets that need it
                if self._export_data and needs_data:
                    data = export_dataset_data(dataset, self._sample_rows)
                    if data:
                        yield (
                            f"data/{data_file}",
                            _make_bytes_generator(data),
                        )
        elif len(datasets) == 1:
            # Single dataset: use dataset.yaml and data.parquet at root
            ds_id = next(iter(datasets.keys()))
            dataset = datasets[ds_id]
            needs_data = ds_id in datasets_needing_data
            is_preserved = ds_id in preserved_virtual
            data_file = "data.parquet" if needs_data else None
            dataset_config = export_dataset_yaml(
                dataset,
                data_file=data_file,
                preserve_virtual=is_preserved,
            )
            yield ("dataset.yaml", _make_yaml_generator(dataset_config))
            if self._export_data and needs_data:
                data = export_dataset_data(dataset, self._sample_rows)
                if data:
                    yield ("data.parquet", _make_bytes_generator(data))

        # Export charts, one YAML per chart named after the sanitized title
        for chart in charts:
            dataset_uuid = chart_to_dataset_uuid.get(chart.id, "")
            chart_config = export_chart(chart, dataset_uuid)
            filename = sanitize_filename(chart.slice_name) + ".yaml"
            yield (f"charts/{filename}", _make_yaml_generator(chart_config))

        # Export dashboard
        dashboard_config = export_dashboard_yaml(
            self._dashboard, chart_id_to_uuid, dataset_id_to_uuid
        )
        yield ("dashboard.yaml", _make_yaml_generator(dashboard_config))