feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)

Co-authored-by: Claude <noreply@anthropic.com>
2026-04-20 00:24:38 +00:00 · 2026-01-21 12:42:15 -08:00
parent ec36791551
commit dee063a4c5
271 changed files with 23340 additions and 12971 deletions
--- a/superset/cli/examples.py
+++ b/superset/cli/examples.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 import logging
+from typing import Any, Callable

 import click
 from flask.cli import with_appcontext
@@ -25,6 +26,44 @@ from superset.utils.decorators import transaction
 logger = logging.getLogger(__name__)


+def _should_skip_loader(
+    loader_name: str, load_big_data: bool, only_metadata: bool
+) -> bool:
+    """Check if a loader should be skipped."""
+    # Skip special loaders that aren't datasets
+    if loader_name in ["load_css_templates", "load_examples_from_configs"]:
+        return True
+
+    # Skip big data if not requested or when only metadata is requested
+    if loader_name == "load_big_data" and (not load_big_data or only_metadata):
+        return True
+
+    return False
+
+
+def _load_dataset(
+    loader: Callable[..., Any], loader_name: str, only_metadata: bool, force: bool
+) -> None:
+    """Load a single dataset with error handling."""
+    import inspect
+
+    dataset_name = loader_name[5:].replace("_", " ").title()
+    logger.info("Loading [%s]", dataset_name)
+
+    # Call loader with appropriate parameters
+    sig = inspect.signature(loader)
+    params = {}
+    if "only_metadata" in sig.parameters:
+        params["only_metadata"] = only_metadata
+    if "force" in sig.parameters:
+        params["force"] = force
+
+    try:
+        loader(**params)
+    except Exception as e:
+        logger.warning("Failed to load %s: %s", dataset_name, e)
+
+
 def load_examples_run(
    load_test_data: bool = False,
    load_big_data: bool = False,
@@ -40,54 +79,21 @@ def load_examples_run(
    # pylint: disable=import-outside-toplevel
    import superset.examples.data_loading as examples

+    # Always load CSS templates
    examples.load_css_templates()

-    if load_test_data:
-        logger.info("Loading energy related dataset")
-        examples.load_energy(only_metadata, force)
+    # Auto-discover and load all datasets
+    for loader_name in dir(examples):
+        if not loader_name.startswith("load_"):
+            continue

-    logger.info("Loading [World Bank's Health Nutrition and Population Stats]")
-    examples.load_world_bank_health_n_pop(only_metadata, force)
+        if _should_skip_loader(loader_name, load_big_data, only_metadata):
+            continue

-    logger.info("Loading [Birth names]")
-    examples.load_birth_names(only_metadata, force)
+        loader = getattr(examples, loader_name)
+        _load_dataset(loader, loader_name, only_metadata, force)

-    logger.info("Loading [International Sales]")
-    examples.load_international_sales(only_metadata, force)
-
-    if load_test_data:
-        logger.info("Loading [Tabbed dashboard]")
-        examples.load_tabbed_dashboard(only_metadata)
-
-        logger.info("Loading [Supported Charts Dashboard]")
-        examples.load_supported_charts_dashboard()
-    else:
-        logger.info("Loading [Random long/lat data]")
-        examples.load_long_lat_data(only_metadata, force)
-
-        logger.info("Loading [Country Map data]")
-        examples.load_country_map_data(only_metadata, force)
-
-        logger.info("Loading [San Francisco population polygons]")
-        examples.load_sf_population_polygons(only_metadata, force)
-
-        logger.info("Loading [Flights data]")
-        examples.load_flights(only_metadata, force)
-
-        logger.info("Loading [BART lines]")
-        examples.load_bart_lines(only_metadata, force)
-
-        logger.info("Loading [Misc Charts] dashboard")
-        examples.load_misc_dashboard()
-
-        logger.info("Loading DECK.gl demo")
-        examples.load_deck_dash()
-
-    if load_big_data:
-        logger.info("Loading big synthetic data for tests")
-        examples.load_big_data()
-
-    # load examples that are stored as YAML config files
+    # Load examples that are stored as YAML config files
    examples.load_examples_from_configs(force, load_test_data)


--- a/superset/cli/export_example.py
+++ b/superset/cli/export_example.py
@@ -0,0 +1,234 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""CLI command to export a dashboard as an example.
+
+This creates an example-ready folder structure that can be committed
+to superset/examples/ and loaded via the example loading system.
+
+Usage:
+    superset export-example --dashboard-id 123 --name my_example
+    superset export-example --dashboard-slug my-dashboard --name my_example
+"""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from typing import Optional
+
+import click
+from flask.cli import with_appcontext
+
+logger = logging.getLogger(__name__)
+
+APACHE_LICENSE_HEADER = """# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+
+
+def write_file_with_header(path: Path, content: bytes) -> None:
+    """Write file, adding Apache license header for YAML files."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    if path.suffix == ".yaml":
+        # Add license header to YAML files
+        with open(path, "wb") as f:
+            f.write(APACHE_LICENSE_HEADER.encode("utf-8"))
+            f.write(content)
+    else:
+        # Binary files (like Parquet) written as-is
+        with open(path, "wb") as f:
+            f.write(content)
+
+    logger.info("Wrote %s", path)
+
+
+@click.command()
+@with_appcontext
+@click.option("--dashboard-id", "-d", type=int, help="Dashboard ID to export")
+@click.option("--dashboard-slug", "-s", type=str, help="Dashboard slug to export")
+@click.option("--name", "-n", required=True, help="Name for the example folder")
+@click.option(
+    "--output-dir",
+    "-o",
+    default="superset/examples",
+    help="Output directory (default: superset/examples)",
+)
+@click.option(
+    "--export-data/--no-export-data",
+    default=True,
+    help="Export data to Parquet (default: True)",
+)
+@click.option(
+    "--sample-rows", type=int, default=None, help="Limit data export to this many rows"
+)
+@click.option("--force", "-f", is_flag=True, help="Overwrite existing example folder")
+def export_example(  # noqa: C901
+    dashboard_id: Optional[int],
+    dashboard_slug: Optional[str],
+    name: str,
+    output_dir: str,
+    export_data: bool,
+    sample_rows: Optional[int],
+    force: bool,
+) -> None:
+    """Export a dashboard as an example.
+
+    Creates a folder structure in superset/examples/ that can be loaded
+    by the example loading system:
+
+    \b
+    Single dataset:
+        <name>/
+        ├── data.parquet      # Raw data
+        ├── dataset.yaml      # Dataset metadata
+        ├── dashboard.yaml    # Dashboard definition
+        └── charts/
+            └── *.yaml        # Chart definitions
+
+    \b
+    Multiple datasets:
+        <name>/
+        ├── data/
+        │   ├── table1.parquet
+        │   └── table2.parquet
+        ├── datasets/
+        │   ├── table1.yaml
+        │   └── table2.yaml
+        ├── dashboard.yaml
+        └── charts/
+            └── *.yaml
+
+    Examples:
+
+    \b
+        # Export by dashboard ID
+        superset export-example -d 1 -n my_example
+
+    \b
+        # Export by slug, limit data to 1000 rows
+        superset export-example -s my-dashboard -n my_example --sample-rows 1000
+
+    \b
+        # Export metadata only (no data)
+        superset export-example -d 1 -n my_example --no-export-data
+    """
+    # Import at runtime to avoid app initialization issues during CLI loading
+    # pylint: disable=import-outside-toplevel
+    from flask import g
+
+    from superset import db, security_manager
+    from superset.commands.dashboard.exceptions import DashboardNotFoundError
+    from superset.commands.dashboard.export_example import ExportExampleCommand
+    from superset.models.dashboard import Dashboard
+    from superset.utils import json as superset_json
+
+    g.user = security_manager.find_user(username="admin")
+
+    # Find the dashboard
+    if dashboard_id:
+        dashboard = db.session.query(Dashboard).get(dashboard_id)
+    elif dashboard_slug:
+        dashboard = db.session.query(Dashboard).filter_by(slug=dashboard_slug).first()
+    else:
+        raise click.UsageError("Must specify --dashboard-id or --dashboard-slug")
+
+    if not dashboard:
+        raise click.ClickException(
+            f"Dashboard not found: {dashboard_id or dashboard_slug}"
+        )
+
+    logger.info("Exporting dashboard: %s", dashboard.dashboard_title)
+
+    # Create output directory
+    example_dir = Path(output_dir) / name
+    if example_dir.exists() and not force:
+        raise click.ClickException(
+            f"Directory already exists: {example_dir}. Use --force to overwrite."
+        )
+
+    example_dir.mkdir(parents=True, exist_ok=True)
+
+    # Run the export command
+    command = ExportExampleCommand(
+        dashboard_id=dashboard.id,
+        export_data=export_data,
+        sample_rows=sample_rows,
+    )
+
+    try:
+        file_count = {"charts": 0, "datasets": 0, "data": 0}
+
+        for filename, content_fn in command.run():
+            file_path = example_dir / filename
+            content = content_fn()
+            write_file_with_header(file_path, content)
+
+            # Track file counts for summary
+            if filename.startswith("charts/"):
+                file_count["charts"] += 1
+            elif filename.startswith("datasets/") or filename == "dataset.yaml":
+                file_count["datasets"] += 1
+            elif filename.startswith("data/") or filename == "data.parquet":
+                file_count["data"] += 1
+
+    except DashboardNotFoundError as err:
+        raise click.ClickException(
+            f"Dashboard not found: {dashboard_id or dashboard_slug}"
+        ) from err
+
+    # Summary
+    click.echo(f"\n✅ Exported to: {example_dir}")
+    click.echo("   - dashboard.yaml")
+
+    if file_count["datasets"] > 1:
+        click.echo(f"   - datasets/ ({file_count['datasets']} datasets)")
+        if export_data and file_count["data"]:
+            click.echo(f"   - data/ ({file_count['data']} parquet files)")
+    else:
+        click.echo("   - dataset.yaml")
+        if export_data and file_count["data"]:
+            click.echo("   - data.parquet")
+
+    click.echo(f"   - charts/ ({file_count['charts']} charts)")
+
+    # Native filters summary
+    if dashboard.json_metadata:
+        try:
+            meta = superset_json.loads(dashboard.json_metadata)
+            filters = meta.get("native_filter_configuration", [])
+            if filters:
+                click.echo(f"   - {len(filters)} native filters exported")
+        except Exception:
+            logger.debug("Could not parse json_metadata for filter count")
+
+    click.echo("\nTo load this example, ensure the folder is in superset/examples/")
+    click.echo("and it will be picked up by load_examples_from_configs().")
--- a/superset/cli/test_loaders.py
+++ b/superset/cli/test_loaders.py
@@ -0,0 +1,90 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Test data loaders for stress testing and development.
+
+This module contains specialized data loaders that generate synthetic data
+for testing Superset's capabilities with edge cases:
+- Wide tables (many columns)
+- Many tables (testing catalog performance)
+- Long table names (UI edge cases)
+
+These loaders are invoked via CLI flags and are not part of the standard
+example datasets.
+"""
+
+import logging
+import random
+import string
+
+import sqlalchemy.sql.sqltypes
+
+from superset.utils.mock_data import add_data, ColumnInfo
+
+logger = logging.getLogger(__name__)
+
+COLUMN_TYPES = [
+    sqlalchemy.sql.sqltypes.INTEGER(),
+    sqlalchemy.sql.sqltypes.VARCHAR(length=255),
+    sqlalchemy.sql.sqltypes.TEXT(),
+    sqlalchemy.sql.sqltypes.BOOLEAN(),
+    sqlalchemy.sql.sqltypes.FLOAT(),
+    sqlalchemy.sql.sqltypes.DATE(),
+    sqlalchemy.sql.sqltypes.TIME(),
+    sqlalchemy.sql.sqltypes.TIMESTAMP(),
+]
+
+
+def load_big_data() -> None:
+    logger.debug("Creating table `wide_table` with 100 columns")
+    columns: list[ColumnInfo] = []
+    for i in range(100):
+        column: ColumnInfo = {
+            "name": f"col{i}",
+            "type": COLUMN_TYPES[i % len(COLUMN_TYPES)],
+            "nullable": False,
+            "default": None,
+            "autoincrement": "auto",
+            "primary_key": 1 if i == 0 else 0,
+        }
+        columns.append(column)
+    add_data(columns=columns, num_rows=1000, table_name="wide_table")
+
+    logger.debug("Creating 1000 small tables")
+    columns = [
+        {
+            "name": "id",
+            "type": sqlalchemy.sql.sqltypes.INTEGER(),
+            "nullable": False,
+            "default": None,
+            "autoincrement": "auto",
+            "primary_key": 1,
+        },
+        {
+            "name": "value",
+            "type": sqlalchemy.sql.sqltypes.VARCHAR(length=255),
+            "nullable": False,
+            "default": None,
+            "autoincrement": "auto",
+            "primary_key": 0,
+        },
+    ]
+    for i in range(1000):
+        add_data(columns=columns, num_rows=10, table_name=f"small_table_{i}")
+
+    logger.debug("Creating table with long name")
+    name = "".join(random.choices(string.ascii_letters + string.digits, k=60))  # noqa: S311
+    add_data(columns=columns, num_rows=10, table_name=name)