Files
superset2/superset/commands/dashboard/export_example.py
Evan Rusackas 87bbd54d0a feat(examples): Transpile virtual dataset SQL on import (#37311)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Beto Dealmeida <roberto@dealmeida.net>
Co-authored-by: bito-code-review[bot] <188872107+bito-code-review[bot]@users.noreply.github.com>
2026-01-22 09:50:05 -08:00

673 lines
25 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Command to export a dashboard as an example bundle.
This creates an example-ready structure that can be committed to
superset/examples/ and loaded via the example loading system.
"""
from __future__ import annotations
import logging
from collections.abc import Iterator
from io import BytesIO
from typing import Any, Callable, TYPE_CHECKING
import yaml
from superset.commands.base import BaseCommand
from superset.commands.dashboard.exceptions import DashboardNotFoundError
from superset.daos.dashboard import DashboardDAO
if TYPE_CHECKING:
from superset.connectors.sqla.models import SqlaTable
from superset.models.dashboard import Dashboard
from superset.models.slice import Slice
from superset.sql.parse import SQLStatement, Table
# Module-level logger; all export progress and diagnostics go through this.
logger = logging.getLogger(__name__)
# Canonical UUID for the examples database.  Exported dataset YAML references
# it as ``database_uuid`` so imports bind to the examples database.
EXAMPLES_DATABASE_UUID = "a2dc77af-e654-49bb-b321-40f6b559a1ee"
# ASF license header for generated YAML files; prepended verbatim to every
# YAML artifact so committed example bundles stay license-compliant.
YAML_LICENSE_HEADER = """\
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
def sanitize_filename(name: str) -> str:
    """Turn an arbitrary display name into a filesystem-safe filename.

    Characters outside alphanumerics and ``._-`` become underscores; runs of
    underscores are collapsed and leading/trailing underscores are stripped.
    """
    cleaned_chars = []
    for ch in name:
        cleaned_chars.append(ch if ch.isalnum() or ch in "._-" else "_")
    cleaned = "".join(cleaned_chars)
    # Collapse the runs produced by consecutive disallowed characters.
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")
    return cleaned.strip("_")
def get_referenced_tables(sql: str, engine: str = "base") -> set[Table]:
    """Extract table references from SQL using Superset's SQL parser.

    Args:
        sql: The SQL query to parse
        engine: The database engine/dialect (e.g., "postgresql", "mysql")

    Returns:
        Set of Table objects referenced in the SQL; empty when the SQL
        cannot be parsed (callers treat that as "materialize to be safe").
    """
    try:
        # Runtime import: the module-level import of SQLStatement is guarded
        # by TYPE_CHECKING, so referencing it here without this import would
        # raise NameError — swallowed by the except below, silently forcing
        # every virtual dataset to be materialized.
        from superset.sql.parse import (  # pylint: disable=import-outside-toplevel
            SQLStatement,
        )

        statement = SQLStatement(sql, engine=engine)
        return statement.tables
    except Exception as e:  # pylint: disable=broad-except
        logger.warning("Could not parse SQL to extract tables: %s", e)
        return set()
def is_virtual_dataset(dataset: SqlaTable) -> bool:
    """Return True when *dataset* is virtual (defined by a SQL query) rather
    than physical (backed directly by a database table)."""
    return True if dataset.sql else False
def can_preserve_virtual_dataset(
    dataset: SqlaTable,
    physical_tables: set[str],
    engine: str = "base",
) -> bool:
    """Decide whether a virtual dataset's SQL can be kept intact on export.

    The SQL is preserved only when every table it references is a physical
    table included in this export (and therefore available after import as a
    Parquet-backed table); otherwise the dataset must be materialized.

    Args:
        dataset: The virtual dataset to check
        physical_tables: Set of physical table names being exported
        engine: The database engine/dialect for SQL parsing

    Returns:
        True if the virtual dataset can be preserved with its SQL intact
    """
    sql = dataset.sql
    if not sql:
        # Physical dataset: there is no SQL to preserve.
        return False

    referenced = get_referenced_tables(sql, engine)
    if not referenced:
        # Unparseable SQL or no tables found — materializing is the safe path.
        logger.info(
            "Could not determine dependencies for %s, will materialize",
            dataset.table_name,
        )
        return False

    # Compare by bare table name; schemas are normalized away on export.
    external = [t.table for t in referenced if t.table not in physical_tables]
    if external:
        logger.info(
            "Virtual dataset %s references external table %s, will materialize",
            dataset.table_name,
            external[0],
        )
        return False

    logger.info(
        "Virtual dataset %s can be preserved (references: %s)",
        dataset.table_name,
        ", ".join(t.table for t in referenced),
    )
    return True
def export_dataset_yaml(
    dataset: SqlaTable,
    data_file: str | None = None,
    preserve_virtual: bool = False,
) -> dict[str, Any]:
    """Serialize a dataset into the example-bundle YAML structure.

    Args:
        dataset: The dataset to export
        data_file: Optional explicit parquet filename (for physical datasets)
        preserve_virtual: If True and dataset is virtual, keep the SQL query
            instead of converting to physical with data_file
    """
    # Virtual datasets keep their SQL only when the caller says every
    # dependency is also part of this export.
    keep_sql = bool(preserve_virtual and dataset.sql)

    metric_configs = [
        {
            "metric_name": metric.metric_name,
            "verbose_name": metric.verbose_name,
            "metric_type": metric.metric_type,
            "expression": metric.expression,
            "description": metric.description,
            "d3format": metric.d3format,
            "currency": metric.currency,
            "extra": metric.extra,
            "warning_text": metric.warning_text,
        }
        for metric in dataset.metrics
    ]
    column_configs = [
        {
            "column_name": column.column_name,
            "verbose_name": column.verbose_name,
            "is_dttm": column.is_dttm,
            "is_active": column.is_active,
            "type": column.type,
            "advanced_data_type": column.advanced_data_type,
            "groupby": column.groupby,
            "filterable": column.filterable,
            "expression": column.expression,
            "description": column.description,
            "python_date_format": column.python_date_format,
            "extra": column.extra,
        }
        for column in dataset.columns
    ]

    return {
        "table_name": dataset.table_name,
        # Preserved virtual datasets query other tables - no data file.
        "data_file": None if keep_sql else data_file,
        "main_dttm_col": dataset.main_dttm_col,
        "description": dataset.description,
        "default_endpoint": dataset.default_endpoint,
        "offset": dataset.offset,
        "cache_timeout": dataset.cache_timeout,
        "catalog": dataset.catalog,
        "schema": None,  # Don't export - use target database's default schema
        # SQL is kept only for preserved virtual datasets; physical data
        # lives in the parquet file instead.
        "sql": dataset.sql if keep_sql else None,
        # Source engine lets the importer transpile the SQL dialect.
        "source_db_engine": (
            dataset.database.db_engine_spec.engine if keep_sql else None
        ),
        "params": None,  # Don't export - contains stale import metadata
        "template_params": dataset.template_params,
        "filter_select_enabled": dataset.filter_select_enabled,
        "fetch_values_predicate": dataset.fetch_values_predicate,
        "extra": dataset.extra,
        "normalize_columns": dataset.normalize_columns,
        "always_filter_main_dttm": dataset.always_filter_main_dttm,
        "folders": None,
        "uuid": str(dataset.uuid),
        "metrics": metric_configs,
        "columns": column_configs,
        "version": "1.0.0",
        "database_uuid": EXAMPLES_DATABASE_UUID,
    }
def export_dataset_data(
    dataset: SqlaTable,
    sample_rows: int | None = None,
) -> bytes | None:
    """Export dataset data to Parquet format. Returns bytes or None on failure.

    Args:
        dataset: The dataset whose rows should be exported. For virtual
            datasets its own SQL is executed; for physical datasets a plain
            SELECT over its non-computed columns is built.
        sample_rows: Optional cap; when set and the result is larger, only
            the first ``sample_rows`` rows are exported.

    Returns:
        The Parquet file content as bytes, or None when the dataset has no
        database, no exportable columns, or the query/serialization fails.
    """
    import pandas as pd  # pylint: disable=import-outside-toplevel
    from superset import db  # pylint: disable=import-outside-toplevel
    # Ensure dataset is attached to session and relationships are loaded
    if dataset not in db.session:
        dataset = db.session.merge(dataset)
    # Force load the database and columns relationships by accessing them
    _ = dataset.database
    _ = dataset.columns
    if not dataset.database:
        logger.warning("Dataset %s has no database", dataset.table_name)
        return None
    try:
        logger.info("Exporting data for %s to Parquet...", dataset.table_name)
        # Check if this is a virtual dataset (SQL-based)
        if dataset.sql:
            sql = dataset.sql
        else:
            # For physical tables, build SELECT query from columns.
            # Columns with an expression are computed, not stored, so they
            # are skipped here.
            columns = [col.column_name for col in dataset.columns if not col.expression]
            if not columns:
                logger.warning("No columns to export for %s", dataset.table_name)
                return None
            # Build simple SELECT query (quote identifiers to handle spaces/keywords)
            # NOTE(review): double-quote quoting assumes an ANSI-compliant
            # dialect — confirm against the supported example databases.
            column_list = ", ".join(f'"{c}"' for c in columns)
            quoted_table = f'"{dataset.table_name}"'
            if dataset.schema:
                table_ref = f'"{dataset.schema}".{quoted_table}'
            else:
                table_ref = quoted_table
            sql = f"SELECT {column_list} FROM {table_ref}"  # noqa: S608
        with dataset.database.get_sqla_engine() as engine:
            df = pd.read_sql(sql, engine)
        if sample_rows and len(df) > sample_rows:
            df = df.head(sample_rows)
            logger.info("Sampled to %d rows", sample_rows)
        # Write to bytes buffer
        buf = BytesIO()
        df.to_parquet(buf, index=False)
        buf.seek(0)
        logger.info("Exported %d rows for %s", len(df), dataset.table_name)
        return buf.getvalue()
    except Exception as e:
        # Best-effort export: log with traceback and let the caller skip
        # the data file rather than aborting the whole bundle.
        logger.exception("Could not export data for %s: %s", dataset.table_name, e)
        return None
def export_chart(chart: Slice, dataset_uuid: str) -> dict[str, Any]:
    """Serialize a chart into the example-bundle YAML structure.

    The chart is tied to its dataset via *dataset_uuid*; ``query_context``
    is deliberately dropped because it embeds instance-local IDs.
    """
    chart_params = getattr(chart, "params_dict", {})
    config: dict[str, Any] = {
        "slice_name": chart.slice_name,
        "description": chart.description,
        "certified_by": chart.certified_by,
        "certification_details": chart.certification_details,
        "viz_type": chart.viz_type,
        "params": chart_params,
        "query_context": None,  # Don't include - contains stale IDs
        "cache_timeout": chart.cache_timeout,
        "uuid": str(chart.uuid),
        "version": "1.0.0",
        "dataset_uuid": dataset_uuid,
    }
    return config
def remap_native_filters(
    filters: list[dict[str, Any]],
    chart_id_to_uuid: dict[int, str],
    dataset_id_to_uuid: dict[int, str],
) -> list[dict[str, Any]]:
    """Rewrite native-filter configuration to reference UUIDs, not numeric IDs.

    ``chartsInScope`` entries are mapped through *chart_id_to_uuid* (unmapped
    IDs pass through unchanged) and each target's ``datasetId`` is removed
    and replaced by ``datasetUuid`` when a mapping exists. Input dicts are
    not mutated.
    """
    result: list[dict[str, Any]] = []
    for filter_config in filters:
        updated = dict(filter_config)
        if "chartsInScope" in updated:
            updated["chartsInScope"] = [
                chart_id_to_uuid.get(chart_id, chart_id)
                for chart_id in updated["chartsInScope"]
            ]
        if "targets" in updated:
            remapped_targets = []
            for original_target in updated["targets"]:
                target = dict(original_target)
                if "datasetId" in target:
                    # datasetId is always stripped; the UUID is added only
                    # when we know the mapping.
                    ds_id = target.pop("datasetId")
                    if ds_id in dataset_id_to_uuid:
                        target["datasetUuid"] = dataset_id_to_uuid[ds_id]
                remapped_targets.append(target)
            updated["targets"] = remapped_targets
        result.append(updated)
    return result
def remap_chart_configuration(
    chart_config: dict[str, Any],
    chart_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Re-key cross-filter chart configuration by chart UUID instead of ID.

    Entries whose numeric ID has no UUID mapping are dropped; nested
    ``crossFilters.chartsInScope`` IDs are mapped where possible (unmapped
    IDs pass through). Input dicts are not mutated.
    """
    result: dict[str, Any] = {}
    for raw_id, config in chart_config.items():
        numeric_id = int(raw_id)
        chart_uuid = chart_id_to_uuid.get(numeric_id)
        if chart_uuid is None:
            # Chart not part of this export - drop its configuration.
            continue
        entry = dict(config)
        entry["id"] = chart_uuid
        if "chartsInScope" in entry.get("crossFilters", {}):
            cross = dict(entry["crossFilters"])
            cross["chartsInScope"] = [
                chart_id_to_uuid.get(cid, cid) for cid in cross["chartsInScope"]
            ]
            entry["crossFilters"] = cross
        result[chart_uuid] = entry
    return result
def remap_global_chart_configuration(
    global_config: dict[str, Any],
    chart_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Return a copy of the global chart configuration with any
    ``chartsInScope`` IDs swapped for UUIDs (unmapped IDs pass through)."""
    updated = dict(global_config)
    if "chartsInScope" in updated:
        updated["chartsInScope"] = [
            chart_id_to_uuid.get(cid, cid) for cid in updated["chartsInScope"]
        ]
    return updated
def export_dashboard_yaml(
    dashboard: Dashboard,
    chart_id_to_uuid: dict[int, str],
    dataset_id_to_uuid: dict[int, str],
) -> dict[str, Any]:
    """Serialize a dashboard into the example-bundle YAML structure.

    Numeric chart and dataset IDs embedded in the position layout, native
    filters, and cross-filter configuration are remapped to UUIDs so the
    exported bundle is portable across instances.
    """
    from superset.utils import (
        json as superset_json,  # pylint: disable=import-outside-toplevel
    )

    # Annotate layout entries with chart UUIDs. Copies are shallow, so the
    # nested "meta" dict is updated in place (matching existing behavior).
    remapped_position: dict[str, Any] = {}
    for key, node in (dashboard.position or {}).items():
        if not isinstance(node, dict):
            remapped_position[key] = node
            continue
        node_copy = node.copy()
        meta = node_copy.get("meta", {})
        if "meta" in node_copy and "chartId" in meta:
            chart_id = meta["chartId"]
            if chart_id in chart_id_to_uuid:
                node_copy["meta"]["uuid"] = chart_id_to_uuid[chart_id]
        remapped_position[key] = node_copy

    # Parse json_metadata; fall back to empty on any parse failure.
    raw_metadata: dict[str, Any] = {}
    if dashboard.json_metadata:
        try:
            raw_metadata = superset_json.loads(dashboard.json_metadata)
        except Exception:  # pylint: disable=broad-except
            logger.debug("Could not parse json_metadata")

    metadata: dict[str, Any] = {
        "timed_refresh_immune_slices": raw_metadata.get(
            "timed_refresh_immune_slices", []
        ),
        "expanded_slices": raw_metadata.get("expanded_slices", {}),
        "refresh_frequency": raw_metadata.get("refresh_frequency", 0),
        "default_filters": raw_metadata.get("default_filters", "{}"),
        "color_scheme": raw_metadata.get("color_scheme", ""),
        "label_colors": raw_metadata.get("label_colors", {}),
        "native_filter_configuration": remap_native_filters(
            raw_metadata.get("native_filter_configuration", []),
            chart_id_to_uuid,
            dataset_id_to_uuid,
        ),
        "shared_label_colors": raw_metadata.get("shared_label_colors", []),
        "map_label_colors": raw_metadata.get("map_label_colors", {}),
        "color_scheme_domain": raw_metadata.get("color_scheme_domain", []),
        "cross_filters_enabled": raw_metadata.get("cross_filters_enabled", False),
        "chart_configuration": remap_chart_configuration(
            raw_metadata.get("chart_configuration", {}), chart_id_to_uuid
        ),
        "global_chart_configuration": remap_global_chart_configuration(
            raw_metadata.get("global_chart_configuration", {}), chart_id_to_uuid
        ),
    }

    return {
        "dashboard_title": dashboard.dashboard_title,
        "description": dashboard.description,
        "css": dashboard.css,
        "slug": dashboard.slug,
        "certified_by": dashboard.certified_by,
        "certification_details": dashboard.certification_details,
        "published": dashboard.published,
        "uuid": str(dashboard.uuid),
        "position": remapped_position,
        "metadata": metadata,
        "version": "1.0.0",
    }
def _make_yaml_generator(config: dict[str, Any]) -> Callable[[], bytes]:
    """Build a zero-argument callable producing the ASF-licensed YAML bytes
    for *config* (rendered eagerly, encoded per call)."""
    rendered = YAML_LICENSE_HEADER + yaml.safe_dump(
        config, default_flow_style=False, allow_unicode=True
    )

    def generate() -> bytes:
        return rendered.encode("utf-8")

    return generate
def _make_bytes_generator(data: bytes) -> Callable[[], bytes]:
"""Create a generator function for raw bytes content."""
return lambda: data
class ExportExampleCommand(BaseCommand):
    """Export dashboard as an example bundle with Parquet data and YAML configs.

    Output structure for single dataset:
        data.parquet    - Raw data
        dataset.yaml    - Dataset metadata
        dashboard.yaml  - Dashboard definition
        charts/*.yaml   - Chart definitions

    Output structure for multiple datasets:
        data/*.parquet  - Raw data files
        datasets/*.yaml - Dataset metadata files
        dashboard.yaml  - Dashboard definition
        charts/*.yaml   - Chart definitions
    """

    def __init__(
        self,
        dashboard_id: int,
        export_data: bool = True,
        sample_rows: int | None = None,
    ):
        """
        Args:
            dashboard_id: ID of the dashboard to export
            export_data: When False, only YAML metadata is emitted (no Parquet)
            sample_rows: Optional cap on exported rows per dataset
        """
        self._dashboard_id = dashboard_id
        self._export_data = export_data
        self._sample_rows = sample_rows
        self._dashboard: Dashboard | None = None

    def validate(self) -> None:
        """Load the dashboard; raise DashboardNotFoundError if it is missing."""
        self._dashboard = DashboardDAO.find_by_id(self._dashboard_id)
        if not self._dashboard:
            raise DashboardNotFoundError()

    def run(self) -> Iterator[tuple[str, Callable[[], bytes]]]:  # noqa: C901
        """Yield (filename, content_generator) tuples for ZIP packaging.

        Content generators return bytes (either YAML encoded or raw Parquet).
        """
        self.validate()
        assert self._dashboard is not None

        # Collect all charts and their datasets
        charts = self._dashboard.slices
        datasets: dict[int, SqlaTable] = {}
        chart_id_to_uuid: dict[int, str] = {}
        chart_to_dataset_uuid: dict[int, str] = {}
        for chart in charts:
            chart_id_to_uuid[chart.id] = str(chart.uuid)
            if chart.datasource:
                datasets[chart.datasource.id] = chart.datasource
                chart_to_dataset_uuid[chart.id] = str(chart.datasource.uuid)

        # Build dataset ID to UUID mapping
        dataset_id_to_uuid: dict[int, str] = {
            ds_id: str(ds.uuid) for ds_id, ds in datasets.items()
        }
        logger.info("Found %d charts and %d datasets", len(charts), len(datasets))

        # Classify datasets: physical vs virtual.
        # Physical datasets need Parquet export; virtual datasets with all
        # dependencies in the export can preserve their SQL.
        physical_datasets: dict[int, SqlaTable] = {}
        virtual_datasets: dict[int, SqlaTable] = {}
        for ds_id, dataset in datasets.items():
            if is_virtual_dataset(dataset):
                virtual_datasets[ds_id] = dataset
            else:
                physical_datasets[ds_id] = dataset

        # Get the set of physical table names for dependency checking
        physical_table_names = {ds.table_name for ds in physical_datasets.values()}

        # Determine which virtual datasets can be preserved vs materialized.
        preserved_virtual: dict[int, SqlaTable] = {}
        materialized_virtual: dict[int, SqlaTable] = {}

        # Get database engine for SQL parsing (use first dataset's database)
        db_engine = "base"
        if datasets:
            first_dataset = next(iter(datasets.values()))
            if first_dataset.database:
                db_engine = first_dataset.database.backend or "base"

        for ds_id, dataset in virtual_datasets.items():
            if can_preserve_virtual_dataset(dataset, physical_table_names, db_engine):
                preserved_virtual[ds_id] = dataset
            else:
                materialized_virtual[ds_id] = dataset

        # Log classification summary
        logger.info(
            "Dataset classification: %d physical, %d virtual preserved, "
            "%d virtual materialized",
            len(physical_datasets),
            len(preserved_virtual),
            len(materialized_virtual),
        )

        # Datasets that need Parquet export = physical + materialized virtual
        datasets_needing_data = {**physical_datasets, **materialized_virtual}

        # Build unique filenames for datasets (handle table_name collisions)
        dataset_filenames: dict[int, str] = {}
        seen_table_names: dict[str, int] = {}  # table_name -> first dataset_id
        for ds_id, dataset in datasets.items():
            table_name = dataset.table_name
            if table_name in seen_table_names:
                # Collision! Use UUID suffix for uniqueness
                uuid_suffix = str(dataset.uuid)[:8]
                filename = f"{table_name}-{uuid_suffix}"
                logger.info(
                    "Table name collision for '%s', using '%s'", table_name, filename
                )
            else:
                filename = table_name
                seen_table_names[table_name] = ds_id
            dataset_filenames[ds_id] = filename

        # Export datasets
        multi_dataset = len(datasets) > 1
        if multi_dataset:
            # Multiple datasets: use datasets/ and data/ folders, named by the
            # collision-free filename computed above.
            for ds_id, dataset in datasets.items():
                filename = dataset_filenames[ds_id]
                needs_data = ds_id in datasets_needing_data
                is_preserved = ds_id in preserved_virtual
                data_file = f"{filename}.parquet" if needs_data else None
                # Export YAML
                dataset_config = export_dataset_yaml(
                    dataset,
                    data_file=data_file,
                    preserve_virtual=is_preserved,
                )
                yield (
                    f"datasets/{filename}.yaml",
                    _make_yaml_generator(dataset_config),
                )
                # Export data only for datasets that need it
                if self._export_data and needs_data:
                    data = export_dataset_data(dataset, self._sample_rows)
                    if data:
                        yield (
                            f"data/{data_file}",
                            _make_bytes_generator(data),
                        )
        elif len(datasets) == 1:
            # Single dataset: use dataset.yaml and data.parquet at root
            ds_id = next(iter(datasets.keys()))
            dataset = datasets[ds_id]
            needs_data = ds_id in datasets_needing_data
            is_preserved = ds_id in preserved_virtual
            data_file = "data.parquet" if needs_data else None
            dataset_config = export_dataset_yaml(
                dataset,
                data_file=data_file,
                preserve_virtual=is_preserved,
            )
            yield ("dataset.yaml", _make_yaml_generator(dataset_config))
            if self._export_data and needs_data:
                data = export_dataset_data(dataset, self._sample_rows)
                if data:
                    yield ("data.parquet", _make_bytes_generator(data))

        # Export charts, one YAML per chart named after the sanitized title
        for chart in charts:
            dataset_uuid = chart_to_dataset_uuid.get(chart.id, "")
            chart_config = export_chart(chart, dataset_uuid)
            filename = sanitize_filename(chart.slice_name) + ".yaml"
            yield (f"charts/{filename}", _make_yaml_generator(chart_config))

        # Export dashboard
        dashboard_config = export_dashboard_yaml(
            self._dashboard, chart_id_to_uuid, dataset_id_to_uuid
        )
        yield ("dashboard.yaml", _make_yaml_generator(dashboard_config))