feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)

Co-authored-by: Claude <noreply@anthropic.com>
2026-04-19 08:04:53 +00:00 · 2026-01-21 12:42:15 -08:00
parent ec36791551
commit dee063a4c5
271 changed files with 23340 additions and 12971 deletions
--- a/superset/commands/dashboard/export_example.py
+++ b/superset/commands/dashboard/export_example.py
@@ -0,0 +1,668 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Command to export a dashboard as an example bundle.
+
+This creates an example-ready structure that can be committed to
+superset/examples/ and loaded via the example loading system.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterator
+from io import BytesIO
+from typing import Any, Callable, TYPE_CHECKING
+
+import yaml
+
+from superset.commands.base import BaseCommand
+from superset.commands.dashboard.exceptions import DashboardNotFoundError
+from superset.daos.dashboard import DashboardDAO
+
+if TYPE_CHECKING:
+    from superset.connectors.sqla.models import SqlaTable
+    from superset.models.dashboard import Dashboard
+    from superset.models.slice import Slice
+
+from superset.sql.parse import SQLStatement, Table
+
+# Module-level logger shared by the export helpers in this file.
+logger = logging.getLogger(__name__)
+
+# Canonical UUID for the examples database. export_dataset_yaml() writes it as
+# the "database_uuid" of every exported dataset, whatever database the dataset
+# was read from.
+EXAMPLES_DATABASE_UUID = "a2dc77af-e654-49bb-b321-40f6b559a1ee"
+
+# ASF license header for generated YAML files. _make_yaml_generator() prepends
+# this text verbatim to every YAML document it produces; the backslash after the
+# opening quotes keeps the header from starting with an empty line.
+YAML_LICENSE_HEADER = """\
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+
+
+def sanitize_filename(name: str) -> str:
+    """Convert a name to a safe, non-empty filename stem.
+
+    Letters, digits, ``.``, ``_`` and ``-`` are kept; every other character is
+    replaced by ``_``. Runs of underscores are collapsed into one and leading
+    or trailing underscores are dropped.
+
+    Args:
+        name: Arbitrary display name (e.g. a chart title)
+
+    Returns:
+        The sanitized name, or ``"untitled"`` when nothing usable is left
+        (empty input or only unsafe characters), so callers never build an
+        extension-only path such as ``charts/.yaml``
+    """
+    replaced = "".join(c if c.isalnum() or c in "._-" else "_" for c in name)
+    # Splitting on "_" and dropping the empty pieces both collapses repeated
+    # underscores and trims them from the ends in a single pass
+    safe = "_".join(piece for piece in replaced.split("_") if piece)
+    return safe or "untitled"
+
+
+def get_referenced_tables(sql: str, engine: str = "base") -> set[Table]:
+    """Return the tables a SQL statement refers to, per Superset's SQL parser.
+
+    Parsing is best effort: any error raised while parsing is logged as a
+    warning and reported to the caller as "no tables".
+
+    Args:
+        sql: The SQL query to parse
+        engine: The database engine/dialect (e.g., "postgresql", "mysql")
+
+    Returns:
+        Set of Table objects referenced in the SQL; empty if parsing failed
+    """
+    try:
+        parsed = SQLStatement(sql, engine=engine)
+        tables = parsed.tables
+    except Exception as ex:
+        logger.warning("Could not parse SQL to extract tables: %s", ex)
+        return set()
+    return tables
+
+
+def is_virtual_dataset(dataset: SqlaTable) -> bool:
+    """Tell whether a dataset is virtual, i.e. carries a non-empty SQL query.
+
+    A dataset whose ``sql`` is empty or None is treated as physical.
+    """
+    return True if dataset.sql else False
+
+
+def can_preserve_virtual_dataset(
+    dataset: SqlaTable,
+    physical_tables: set[str],
+    engine: str = "base",
+) -> bool:
+    """Decide whether a virtual dataset may keep its SQL in the export.
+
+    The SQL can be kept only when every table it references is one of the
+    physical tables that are part of the same export; otherwise the dataset
+    has to be materialized.
+
+    Args:
+        dataset: The virtual dataset to check
+        physical_tables: Set of physical table names being exported
+        engine: The database engine/dialect for SQL parsing
+
+    Returns:
+        True if the virtual dataset can be preserved with its SQL intact;
+        False for non-virtual datasets, for SQL whose tables cannot be
+        determined, and for SQL that references a table outside the export
+    """
+    query = dataset.sql
+    if not query:
+        # Nothing to preserve: this is not a virtual dataset
+        return False
+
+    dependencies = get_referenced_tables(query, engine)
+    if not dependencies:
+        # Unparseable SQL or no tables at all: materializing is the safe choice
+        logger.info(
+            "Could not determine dependencies for %s, will materialize",
+            dataset.table_name,
+        )
+        return False
+
+    # Only the table name is compared; the schema is ignored because exported
+    # datasets are normalized to the default schema
+    missing = next(
+        (dep for dep in dependencies if dep.table not in physical_tables),
+        None,
+    )
+    if missing is not None:
+        logger.info(
+            "Virtual dataset %s references external table %s, will materialize",
+            dataset.table_name,
+            missing.table,
+        )
+        return False
+
+    dependency_names = ", ".join(dep.table for dep in dependencies)
+    logger.info(
+        "Virtual dataset %s can be preserved (references: %s)",
+        dataset.table_name,
+        dependency_names,
+    )
+    return True
+
+
+def export_dataset_yaml(
+    dataset: SqlaTable,
+    data_file: str | None = None,
+    preserve_virtual: bool = False,
+) -> dict[str, Any]:
+    """Build the YAML-ready configuration dict for a dataset.
+
+    Args:
+        dataset: The dataset to export
+        data_file: Optional explicit parquet filename (for physical datasets)
+        preserve_virtual: If True and dataset is virtual, preserve the SQL query
+                         instead of converting to physical with data_file
+
+    Returns:
+        Dataset metadata including one entry per metric and per column. The
+        schema, params and folders entries are always None and the database
+        reference is always the examples database UUID.
+    """
+    # The SQL is kept only when the caller asked for it AND there is SQL to keep
+    keep_sql = bool(preserve_virtual and dataset.sql)
+
+    metric_fields = (
+        "metric_name",
+        "verbose_name",
+        "metric_type",
+        "expression",
+        "description",
+        "d3format",
+        "currency",
+        "extra",
+        "warning_text",
+    )
+    column_fields = (
+        "column_name",
+        "verbose_name",
+        "is_dttm",
+        "is_active",
+        "type",
+        "advanced_data_type",
+        "groupby",
+        "filterable",
+        "expression",
+        "description",
+        "python_date_format",
+        "extra",
+    )
+
+    return {
+        "table_name": dataset.table_name,
+        # Preserved virtual datasets query other tables, so they have no file
+        "data_file": None if keep_sql else data_file,
+        "main_dttm_col": dataset.main_dttm_col,
+        "description": dataset.description,
+        "default_endpoint": dataset.default_endpoint,
+        "offset": dataset.offset,
+        "cache_timeout": dataset.cache_timeout,
+        "catalog": dataset.catalog,
+        # Not exported: the target database's default schema is used instead
+        "schema": None,
+        # Physical (and materialized) datasets carry their rows in parquet
+        "sql": dataset.sql if keep_sql else None,
+        # Not exported: contains stale import metadata
+        "params": None,
+        "template_params": dataset.template_params,
+        "filter_select_enabled": dataset.filter_select_enabled,
+        "fetch_values_predicate": dataset.fetch_values_predicate,
+        "extra": dataset.extra,
+        "normalize_columns": dataset.normalize_columns,
+        "always_filter_main_dttm": dataset.always_filter_main_dttm,
+        "folders": None,
+        "uuid": str(dataset.uuid),
+        "metrics": [
+            {field: getattr(metric, field) for field in metric_fields}
+            for metric in dataset.metrics
+        ],
+        "columns": [
+            {field: getattr(column, field) for field in column_fields}
+            for column in dataset.columns
+        ],
+        "version": "1.0.0",
+        "database_uuid": EXAMPLES_DATABASE_UUID,
+    }
+
+
+def _build_physical_select(dataset: SqlaTable, engine: Any) -> str | None:
+    """Build a SELECT over a physical dataset's real columns, None if it has none."""
+    # Expression-based (calculated) columns do not exist in the table itself
+    columns = [col.column_name for col in dataset.columns if not col.expression]
+    if not columns:
+        logger.warning("No columns to export for %s", dataset.table_name)
+        return None
+
+    # Quote with the engine's own dialect rules: a hard-coded double quote is a
+    # string literal on MySQL (without ANSI_QUOTES) and does not escape quotes
+    # embedded in a name
+    preparer = engine.dialect.identifier_preparer
+    column_list = ", ".join(preparer.quote(name) for name in columns)
+    table_ref = preparer.quote(dataset.table_name)
+    if dataset.schema:
+        table_ref = f"{preparer.quote_schema(dataset.schema)}.{table_ref}"
+    return f"SELECT {column_list} FROM {table_ref}"  # noqa: S608
+
+
+def export_dataset_data(
+    dataset: SqlaTable,
+    sample_rows: int | None = None,
+) -> bytes | None:
+    """Export dataset data to Parquet format.
+
+    Virtual datasets are exported by running their SQL as is; physical datasets
+    by selecting every non-expression column of the underlying table, with
+    identifiers quoted according to the database's SQL dialect.
+
+    Args:
+        dataset: The dataset whose rows should be exported
+        sample_rows: If set (and non-zero), keep at most this many rows, taken
+            from the start of the query result
+
+    Returns:
+        The Parquet file content, or None when the dataset has no database,
+        has no exportable columns, or the export fails for any reason (the
+        failure is logged, not raised)
+    """
+    import pandas as pd  # pylint: disable=import-outside-toplevel
+
+    from superset import db  # pylint: disable=import-outside-toplevel
+
+    # Ensure dataset is attached to session and relationships are loaded
+    if dataset not in db.session:
+        dataset = db.session.merge(dataset)
+
+    # Force load the database and columns relationships by accessing them
+    database = dataset.database
+    _ = dataset.columns
+
+    if not database:
+        logger.warning("Dataset %s has no database", dataset.table_name)
+        return None
+
+    try:
+        logger.info("Exporting data for %s to Parquet...", dataset.table_name)
+
+        with database.get_sqla_engine() as engine:
+            # Virtual datasets already carry the query to run
+            sql = dataset.sql or _build_physical_select(dataset, engine)
+            if not sql:
+                return None
+            df = pd.read_sql(sql, engine)
+
+        if sample_rows and len(df) > sample_rows:
+            df = df.head(sample_rows)
+            logger.info("Sampled to %d rows", sample_rows)
+
+        # Write to bytes buffer; getvalue() returns the whole content
+        # regardless of the current stream position
+        buf = BytesIO()
+        df.to_parquet(buf, index=False)
+        logger.info("Exported %d rows for %s", len(df), dataset.table_name)
+        return buf.getvalue()
+
+    except Exception as e:
+        # Best effort by design: a failing dataset must not abort the export
+        logger.exception("Could not export data for %s: %s", dataset.table_name, e)
+        return None
+
+
+def export_chart(chart: Slice, dataset_uuid: str) -> dict[str, Any]:
+    """Build the YAML-ready configuration dict for a chart.
+
+    Args:
+        chart: The chart to export
+        dataset_uuid: UUID of the dataset the chart reads from
+
+    Returns:
+        Chart metadata; ``params`` falls back to an empty dict when the chart
+        object has no ``params_dict`` attribute, and ``query_context`` is
+        always None
+    """
+    params = getattr(chart, "params_dict", {})
+
+    copied_fields = (
+        "slice_name",
+        "description",
+        "certified_by",
+        "certification_details",
+        "viz_type",
+    )
+    exported: dict[str, Any] = {
+        field: getattr(chart, field) for field in copied_fields
+    }
+    exported.update(
+        params=params,
+        # Not exported: the stored query context contains stale IDs
+        query_context=None,
+        cache_timeout=chart.cache_timeout,
+        uuid=str(chart.uuid),
+        version="1.0.0",
+        dataset_uuid=dataset_uuid,
+    )
+    return exported
+
+
+def remap_native_filters(
+    filters: list[dict[str, Any]],
+    chart_id_to_uuid: dict[int, str],
+    dataset_id_to_uuid: dict[int, str],
+) -> list[dict[str, Any]]:
+    """Replace numeric IDs with UUIDs in a native filter configuration.
+
+    Args:
+        filters: Native filter definitions
+        chart_id_to_uuid: Chart ID -> chart UUID
+        dataset_id_to_uuid: Dataset ID -> dataset UUID
+
+    Returns:
+        New filter dicts (shallow copies; the input dicts are not modified).
+        Chart IDs in ``chartsInScope`` without a known UUID are kept as is.
+        Every target loses its ``datasetId`` and gains ``datasetUuid`` only
+        when the dataset ID is known.
+    """
+
+    def _remap_target(target: dict[str, Any]) -> dict[str, Any]:
+        converted = dict(target)
+        if "datasetId" in converted:
+            ds_id = converted.pop("datasetId")
+            if ds_id in dataset_id_to_uuid:
+                converted["datasetUuid"] = dataset_id_to_uuid[ds_id]
+        return converted
+
+    result = []
+    for native_filter in filters:
+        converted_filter = dict(native_filter)
+
+        if "chartsInScope" in converted_filter:
+            scope = converted_filter["chartsInScope"]
+            converted_filter["chartsInScope"] = [
+                chart_id_to_uuid.get(chart_id, chart_id) for chart_id in scope
+            ]
+
+        if "targets" in converted_filter:
+            converted_filter["targets"] = [
+                _remap_target(target) for target in converted_filter["targets"]
+            ]
+
+        result.append(converted_filter)
+    return result
+
+
+def remap_chart_configuration(
+    chart_config: dict[str, Any],
+    chart_id_to_uuid: dict[int, str],
+) -> dict[str, Any]:
+    """Remap chart IDs to UUIDs in chart_configuration (cross-filters).
+
+    Args:
+        chart_config: Mapping of chart ID (as a string key) to that chart's
+            cross-filter configuration
+        chart_id_to_uuid: Chart ID -> chart UUID
+
+    Returns:
+        A new mapping keyed by chart UUID. Entries whose key is not an integer
+        or not a known chart ID are dropped. In each kept entry ``id`` is the
+        chart UUID and ``crossFilters.chartsInScope`` has known chart IDs
+        replaced by UUIDs (unknown IDs are kept as is). The input is not
+        modified.
+    """
+    remapped: dict[str, Any] = {}
+    for chart_id_str, config in chart_config.items():
+        try:
+            chart_id = int(chart_id_str)
+        except (TypeError, ValueError):
+            # A non-numeric key cannot name a chart of this dashboard; skip it
+            # like any other unknown chart instead of aborting the export
+            continue
+        if chart_id not in chart_id_to_uuid:
+            continue
+
+        chart_uuid = chart_id_to_uuid[chart_id]
+        new_config = config.copy()
+        new_config["id"] = chart_uuid
+
+        # crossFilters may be missing or null; only a dict can carry a scope
+        cross_filters = new_config.get("crossFilters")
+        if isinstance(cross_filters, dict) and "chartsInScope" in cross_filters:
+            # Build a fresh dict so the caller's nested config stays untouched
+            new_config["crossFilters"] = {
+                **cross_filters,
+                "chartsInScope": [
+                    chart_id_to_uuid.get(cid, cid)
+                    for cid in cross_filters["chartsInScope"]
+                ],
+            }
+
+        remapped[chart_uuid] = new_config
+
+    return remapped
+
+
+def remap_global_chart_configuration(
+    global_config: dict[str, Any],
+    chart_id_to_uuid: dict[int, str],
+) -> dict[str, Any]:
+    """Replace chart IDs with UUIDs in global_chart_configuration.
+
+    Returns a shallow copy of ``global_config``. When it has a
+    ``chartsInScope`` list, every chart ID with a known UUID is replaced by
+    that UUID and unknown IDs are kept unchanged.
+    """
+    if "chartsInScope" not in global_config:
+        return global_config.copy()
+
+    scope = [
+        chart_id_to_uuid.get(chart_id, chart_id)
+        for chart_id in global_config["chartsInScope"]
+    ]
+    return {**global_config, "chartsInScope": scope}
+
+
+def export_dashboard_yaml(
+    dashboard: Dashboard,
+    chart_id_to_uuid: dict[int, str],
+    dataset_id_to_uuid: dict[int, str],
+) -> dict[str, Any]:
+    """Export dashboard to YAML format with proper ID remapping.
+
+    Args:
+        dashboard: The dashboard to export
+        chart_id_to_uuid: Chart ID -> chart UUID for the dashboard's charts
+        dataset_id_to_uuid: Dataset ID -> dataset UUID for the charts' datasets
+
+    Returns:
+        Dashboard configuration: top-level attributes, the layout ("position")
+        with a ``uuid`` added to the ``meta`` of every component whose
+        ``chartId`` is known, and a "metadata" section built from the
+        dashboard's json_metadata with native filters and cross-filter
+        configuration remapped from IDs to UUIDs. Missing or unparseable
+        json_metadata results in default values.
+    """
+    from superset.utils import (
+        json as superset_json,  # pylint: disable=import-outside-toplevel
+    )
+
+    position = dashboard.position or {}
+
+    # Update position to use UUIDs
+    updated_position: dict[str, Any] = {}
+    for key, value in position.items():
+        if not isinstance(value, dict):
+            updated_position[key] = value
+            continue
+
+        updated_value = value.copy()
+        meta = value.get("meta")
+        if isinstance(meta, dict) and "chartId" in meta:
+            chart_id = meta["chartId"]
+            if chart_id in chart_id_to_uuid:
+                # value.copy() is shallow, so "meta" is still the caller's dict;
+                # replace it with a new one rather than writing into it
+                updated_value["meta"] = {**meta, "uuid": chart_id_to_uuid[chart_id]}
+        updated_position[key] = updated_value
+
+    # Parse json_metadata
+    json_metadata: dict[str, Any] = {}
+    if dashboard.json_metadata:
+        try:
+            parsed = superset_json.loads(dashboard.json_metadata)
+        except Exception:
+            logger.debug("Could not parse json_metadata")
+        else:
+            # Anything but a JSON object cannot be read key by key below
+            if isinstance(parsed, dict):
+                json_metadata = parsed
+
+    # Remap native filters
+    native_filters = json_metadata.get("native_filter_configuration", [])
+    remapped_filters = remap_native_filters(
+        native_filters, chart_id_to_uuid, dataset_id_to_uuid
+    )
+
+    # Remap chart_configuration (cross-filters)
+    chart_configuration = json_metadata.get("chart_configuration", {})
+    remapped_chart_config = remap_chart_configuration(
+        chart_configuration, chart_id_to_uuid
+    )
+
+    # Remap global_chart_configuration
+    global_chart_config = json_metadata.get("global_chart_configuration", {})
+    remapped_global_config = remap_global_chart_configuration(
+        global_chart_config, chart_id_to_uuid
+    )
+
+    # Build metadata section
+    metadata: dict[str, Any] = {
+        "timed_refresh_immune_slices": json_metadata.get(
+            "timed_refresh_immune_slices", []
+        ),
+        "expanded_slices": json_metadata.get("expanded_slices", {}),
+        "refresh_frequency": json_metadata.get("refresh_frequency", 0),
+        "default_filters": json_metadata.get("default_filters", "{}"),
+        "color_scheme": json_metadata.get("color_scheme", ""),
+        "label_colors": json_metadata.get("label_colors", {}),
+        "native_filter_configuration": remapped_filters,
+        "shared_label_colors": json_metadata.get("shared_label_colors", []),
+        "map_label_colors": json_metadata.get("map_label_colors", {}),
+        "color_scheme_domain": json_metadata.get("color_scheme_domain", []),
+        "cross_filters_enabled": json_metadata.get("cross_filters_enabled", False),
+        "chart_configuration": remapped_chart_config,
+        "global_chart_configuration": remapped_global_config,
+    }
+
+    return {
+        "dashboard_title": dashboard.dashboard_title,
+        "description": dashboard.description,
+        "css": dashboard.css,
+        "slug": dashboard.slug,
+        "certified_by": dashboard.certified_by,
+        "certification_details": dashboard.certification_details,
+        "published": dashboard.published,
+        "uuid": str(dashboard.uuid),
+        "position": updated_position,
+        "metadata": metadata,
+        "version": "1.0.0",
+    }
+
+
+def _make_yaml_generator(config: dict[str, Any]) -> Callable[[], bytes]:
+    """Return a callable producing ``config`` as license-prefixed YAML bytes.
+
+    The config is serialized immediately, so later changes to ``config`` do
+    not affect the output; the returned callable only prepends the ASF
+    license header and encodes the text as UTF-8.
+    """
+    document = yaml.safe_dump(config, default_flow_style=False, allow_unicode=True)
+
+    def _render() -> bytes:
+        text = YAML_LICENSE_HEADER + document
+        return text.encode("utf-8")
+
+    return _render
+
+
+def _make_bytes_generator(data: bytes) -> Callable[[], bytes]:
+    """Return a callable that hands back ``data`` unchanged on every call."""
+
+    def _render() -> bytes:
+        return data
+
+    return _render
+
+
+class ExportExampleCommand(BaseCommand):
+    """Export dashboard as an example bundle with Parquet data and YAML configs.
+
+    Output structure for single dataset:
+        data.parquet      - Raw data
+        dataset.yaml      - Dataset metadata
+        dashboard.yaml    - Dashboard definition
+        charts/*.yaml     - Chart definitions
+
+    Output structure for multiple datasets:
+        data/*.parquet    - Raw data files
+        datasets/*.yaml   - Dataset metadata files
+        dashboard.yaml    - Dashboard definition
+        charts/*.yaml     - Chart definitions
+
+    Chart files are named after the sanitized chart title. When that name is
+    empty or already taken by another chart of the dashboard, a short UUID
+    suffix keeps the path unique so no chart overwrites another.
+    """
+
+    def __init__(
+        self,
+        dashboard_id: int,
+        export_data: bool = True,
+        sample_rows: int | None = None,
+    ):
+        """Store the export options.
+
+        Args:
+            dashboard_id: ID of the dashboard to export
+            export_data: Whether to emit Parquet files next to the YAML
+            sample_rows: Optional cap on the rows exported per dataset
+        """
+        self._dashboard_id = dashboard_id
+        self._export_data = export_data
+        self._sample_rows = sample_rows
+        self._dashboard: Dashboard | None = None
+
+    def validate(self) -> None:
+        """Load the dashboard; raise DashboardNotFoundError if it is missing."""
+        self._dashboard = DashboardDAO.find_by_id(self._dashboard_id)
+        if not self._dashboard:
+            raise DashboardNotFoundError()
+
+    @staticmethod
+    def _sql_dialect(dataset: SqlaTable) -> str:
+        """Return the backend name of the dataset's database, "base" if unknown."""
+        database = dataset.database
+        return (database.backend if database else None) or "base"
+
+    def run(self) -> Iterator[tuple[str, Callable[[], bytes]]]:  # noqa: C901
+        """Yield (filename, content_generator) tuples for ZIP packaging.
+
+        Content generators return bytes (either YAML encoded or raw Parquet).
+        Entries are produced in this order: datasets (YAML first, then the
+        data file when one is exported), charts, dashboard.
+
+        Raises:
+            DashboardNotFoundError: If the dashboard does not exist
+        """
+        self.validate()
+        dashboard = self._dashboard
+        if dashboard is None:
+            # validate() already raises for a missing dashboard; unlike an
+            # assert, this check is not stripped when Python runs with -O
+            raise DashboardNotFoundError()
+
+        # Collect all charts and their datasets
+        charts = dashboard.slices
+        datasets: dict[int, SqlaTable] = {}
+        chart_id_to_uuid: dict[int, str] = {}
+        chart_to_dataset_uuid: dict[int, str] = {}
+
+        for chart in charts:
+            chart_id_to_uuid[chart.id] = str(chart.uuid)
+            if chart.datasource:
+                datasets[chart.datasource.id] = chart.datasource
+                chart_to_dataset_uuid[chart.id] = str(chart.datasource.uuid)
+
+        # Build dataset ID to UUID mapping
+        dataset_id_to_uuid: dict[int, str] = {
+            ds_id: str(ds.uuid) for ds_id, ds in datasets.items()
+        }
+
+        logger.info("Found %d charts and %d datasets", len(charts), len(datasets))
+
+        # Classify datasets: physical vs virtual
+        # Physical datasets need Parquet export; virtual datasets with all
+        # dependencies in the export can preserve their SQL
+        physical_datasets: dict[int, SqlaTable] = {}
+        virtual_datasets: dict[int, SqlaTable] = {}
+
+        for ds_id, dataset in datasets.items():
+            if is_virtual_dataset(dataset):
+                virtual_datasets[ds_id] = dataset
+            else:
+                physical_datasets[ds_id] = dataset
+
+        # Get the set of physical table names for dependency checking
+        physical_table_names = {ds.table_name for ds in physical_datasets.values()}
+
+        # Determine which virtual datasets can be preserved vs need materialization
+        # A virtual dataset can be preserved if all its referenced tables are
+        # physical datasets in this export
+        preserved_virtual: dict[int, SqlaTable] = {}
+        materialized_virtual: dict[int, SqlaTable] = {}
+
+        for ds_id, dataset in virtual_datasets.items():
+            # Parse each query with the dialect of its own database; datasets
+            # of one dashboard are not guaranteed to share a database
+            dialect = self._sql_dialect(dataset)
+            if can_preserve_virtual_dataset(dataset, physical_table_names, dialect):
+                preserved_virtual[ds_id] = dataset
+            else:
+                materialized_virtual[ds_id] = dataset
+
+        # Log classification summary
+        logger.info(
+            "Dataset classification: %d physical, %d virtual preserved, "
+            "%d virtual materialized",
+            len(physical_datasets),
+            len(preserved_virtual),
+            len(materialized_virtual),
+        )
+
+        # Datasets that need Parquet export = physical + materialized virtual
+        datasets_needing_data = {**physical_datasets, **materialized_virtual}
+
+        # Build unique filenames for datasets (handle table_name collisions)
+        # NOTE(review): table_name is used verbatim as a path component; a name
+        # containing "/" would create nested paths - confirm whether the loader
+        # tolerates sanitized names before changing this
+        dataset_filenames: dict[int, str] = {}
+        seen_table_names: dict[str, int] = {}  # table_name -> first dataset_id
+
+        for ds_id, dataset in datasets.items():
+            table_name = dataset.table_name
+            if table_name in seen_table_names:
+                # Collision! Use UUID suffix for uniqueness
+                uuid_suffix = str(dataset.uuid)[:8]
+                filename = f"{table_name}-{uuid_suffix}"
+                logger.info(
+                    "Table name collision for '%s', using '%s'", table_name, filename
+                )
+            else:
+                filename = table_name
+                seen_table_names[table_name] = ds_id
+            dataset_filenames[ds_id] = filename
+
+        # Export datasets. A single dataset lives at the bundle root
+        # (dataset.yaml / data.parquet); several datasets go into the
+        # datasets/ and data/ folders
+        multi_dataset = len(datasets) > 1
+
+        for ds_id, dataset in datasets.items():
+            needs_data = ds_id in datasets_needing_data
+            is_preserved = ds_id in preserved_virtual
+
+            if multi_dataset:
+                filename = dataset_filenames[ds_id]
+                yaml_path = f"datasets/{filename}.yaml"
+                data_file = f"{filename}.parquet" if needs_data else None
+                data_path = f"data/{data_file}"
+            else:
+                yaml_path = "dataset.yaml"
+                data_file = "data.parquet" if needs_data else None
+                data_path = "data.parquet"
+
+            # Export YAML
+            dataset_config = export_dataset_yaml(
+                dataset,
+                data_file=data_file,
+                preserve_virtual=is_preserved,
+            )
+            yield (yaml_path, _make_yaml_generator(dataset_config))
+
+            # Export data only for datasets that need it
+            if self._export_data and needs_data:
+                data = export_dataset_data(dataset, self._sample_rows)
+                if data:
+                    yield (data_path, _make_bytes_generator(data))
+
+        # Export charts
+        used_chart_stems: set[str] = set()
+        for chart in charts:
+            dataset_uuid = chart_to_dataset_uuid.get(chart.id, "")
+            chart_config = export_chart(chart, dataset_uuid)
+
+            stem = sanitize_filename(chart.slice_name or "")
+            if not stem or stem in used_chart_stems:
+                # Same (or empty) name as an earlier chart: a duplicate archive
+                # path would make one chart overwrite the other
+                uuid_suffix = str(chart.uuid)[:8]
+                unique_stem = f"{stem or 'chart'}-{uuid_suffix}"
+                logger.info(
+                    "Chart file name collision for '%s', using '%s'",
+                    chart.slice_name,
+                    unique_stem,
+                )
+                stem = unique_stem
+            used_chart_stems.add(stem)
+
+            yield (f"charts/{stem}.yaml", _make_yaml_generator(chart_config))
+
+        # Export dashboard
+        dashboard_config = export_dashboard_yaml(
+            dashboard, chart_id_to_uuid, dataset_id_to_uuid
+        )
+        yield ("dashboard.yaml", _make_yaml_generator(dashboard_config))