mirror of
https://github.com/apache/superset.git
synced 2026-04-23 18:14:56 +00:00
feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -14,44 +14,173 @@
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from .bart_lines import load_bart_lines
|
||||
from .big_data import load_big_data
|
||||
from .birth_names import load_birth_names
|
||||
from .country_map import load_country_map_data
|
||||
from .css_templates import load_css_templates
|
||||
from .deck import load_deck_dash
|
||||
from .energy import load_energy
|
||||
from .flights import load_flights
|
||||
from .international_sales import load_international_sales
|
||||
from .long_lat import load_long_lat_data
|
||||
from .misc_dashboard import load_misc_dashboard
|
||||
from .multiformat_time_series import load_multiformat_time_series
|
||||
from .paris import load_paris_iris_geojson
|
||||
from .random_time_series import load_random_time_series_data
|
||||
from .sf_population_polygons import load_sf_population_polygons
|
||||
from .supported_charts_dashboard import load_supported_charts_dashboard
|
||||
from .tabbed_dashboard import load_tabbed_dashboard
|
||||
from .utils import load_examples_from_configs
|
||||
from .world_bank import load_world_bank_health_n_pop
|
||||
"""Auto-discover and load example datasets from Parquet files."""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# Import loaders that have custom logic (dashboards, CSS, etc.)
|
||||
from superset.cli.test_loaders import load_big_data
|
||||
|
||||
from .css_templates import load_css_templates
|
||||
|
||||
# Import generic loader for Parquet datasets
|
||||
from .generic_loader import create_generic_loader
|
||||
from .utils import load_examples_from_configs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
    """Read table_name, schema, and data_file from dataset.yaml if it exists.

    :param example_dir: directory of a single example dataset
    :returns: mapping with keys ``table_name``, ``schema`` and ``data_file``;
        each value is ``None`` when not configured
    """
    result: Dict[str, Optional[str]] = {
        "table_name": None,
        "schema": None,
        "data_file": None,
    }
    dataset_yaml = example_dir / "dataset.yaml"
    if dataset_yaml.exists():
        try:
            with open(dataset_yaml) as f:
                # safe_load returns None for an empty document; fall back to {}
                # so the .get() calls below don't raise AttributeError
                config = yaml.safe_load(f) or {}
            result["table_name"] = config.get("table_name")
            result["data_file"] = config.get("data_file")
            schema = config.get("schema")
            # Treat SQLite's 'main' schema as null (use target database default)
            result["schema"] = None if schema == "main" else schema
        except (OSError, yaml.YAMLError):
            # Best-effort: fall back to defaults derived from the folder name
            logger.debug("Could not read dataset.yaml from %s", example_dir)
    return result
|
||||
|
||||
|
||||
def get_examples_directory() -> Path:
    """Return the bundled examples folder as a :class:`Path`."""
    # Imported lazily to avoid a circular import at module load time.
    from .helpers import get_examples_folder

    examples_folder = get_examples_folder()
    return Path(examples_folder)
|
||||
|
||||
|
||||
def _get_multi_dataset_config(
|
||||
example_dir: Path, dataset_name: str, data_file: Path
|
||||
) -> Dict[str, Any]:
|
||||
"""Read config for a multi-dataset example from datasets/{name}.yaml."""
|
||||
datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
|
||||
result: Dict[str, Any] = {
|
||||
"table_name": dataset_name,
|
||||
"schema": None,
|
||||
"data_file": data_file,
|
||||
}
|
||||
|
||||
if not datasets_yaml.exists():
|
||||
return result
|
||||
|
||||
try:
|
||||
with open(datasets_yaml) as f:
|
||||
yaml_config = yaml.safe_load(f)
|
||||
result["table_name"] = yaml_config.get("table_name") or dataset_name
|
||||
raw_schema = yaml_config.get("schema")
|
||||
result["schema"] = None if raw_schema == "main" else raw_schema
|
||||
|
||||
# Use explicit data_file from YAML if specified
|
||||
explicit_data_file = yaml_config.get("data_file")
|
||||
if explicit_data_file:
|
||||
candidate = example_dir / "data" / explicit_data_file
|
||||
if candidate.exists():
|
||||
result["data_file"] = candidate
|
||||
else:
|
||||
logger.warning(
|
||||
"data_file '%s' specified in YAML does not exist",
|
||||
explicit_data_file,
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("Could not read datasets yaml from %s", datasets_yaml)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def discover_datasets() -> Dict[str, Callable[..., None]]:
    """Auto-discover all example datasets and create loaders for them.

    Examples are organized as:
        superset/examples/{example_name}/data.parquet         # Single dataset
        superset/examples/{example_name}/data/{name}.parquet  # Multiple datasets

    Table names and data file references are read from dataset.yaml/datasets/*.yaml
    if present, otherwise derived from the folder/file name.
    """
    discovered: Dict[str, Callable[..., None]] = {}
    root = get_examples_directory()

    if not root.exists():
        return discovered

    # Pass 1: simple examples with a single data.parquet at the folder root.
    for parquet_path in sorted(root.glob("*/data.parquet")):
        folder = parquet_path.parent
        name = folder.name
        if name.startswith("_"):
            continue

        cfg = get_dataset_config_from_yaml(folder)
        chosen_file = parquet_path
        override = cfg.get("data_file")
        if override:
            candidate = folder / override
            if candidate.exists():
                chosen_file = candidate
            else:
                # Fall back to the discovered data.parquet
                logger.warning("data_file '%s' does not exist", override)

        discovered[f"load_{name}"] = create_generic_loader(
            name,
            table_name=cfg["table_name"] or name,
            schema=cfg["schema"],
            data_file=chosen_file,
        )

    # Pass 2: complex examples with several parquet files under data/.
    for parquet_path in sorted(root.glob("*/data/*.parquet")):
        name = parquet_path.stem
        folder = parquet_path.parent.parent
        if folder.name.startswith("_"):
            continue

        cfg = _get_multi_dataset_config(folder, name, parquet_path)
        key = f"load_{name}"
        # First discovery wins; never clobber a loader created in pass 1.
        if key not in discovered:
            discovered[key] = create_generic_loader(
                name,
                table_name=cfg["table_name"],
                schema=cfg["schema"],
                data_file=cfg["data_file"],
            )

    return discovered
|
||||
|
||||
|
||||
# Auto-discover and create all dataset loaders
|
||||
# Auto-discover and create all dataset loaders
try:
    _auto_loaders = discover_datasets()
except RuntimeError:
    # Outside Flask app context (e.g., tests, tooling)
    _auto_loaders = {}

# Add auto-discovered loaders to module namespace
globals().update(_auto_loaders)

# Build __all__ list dynamically. dict.fromkeys dedupes while preserving
# order, in case an auto-discovered loader shares a name with one of the
# statically listed loaders below.
__all__ = list(
    dict.fromkeys(
        [
            "load_bart_lines",
            # Custom loaders (always included)
            "load_big_data",
            "load_birth_names",
            "load_country_map_data",
            "load_css_templates",
            "load_international_sales",
            "load_deck_dash",
            "load_energy",
            "load_flights",
            "load_long_lat_data",
            "load_misc_dashboard",
            "load_multiformat_time_series",
            "load_paris_iris_geojson",
            "load_random_time_series_data",
            "load_sf_population_polygons",
            "load_supported_charts_dashboard",
            "load_tabbed_dashboard",
            "load_examples_from_configs",
            "load_world_bank_health_n_pop",
            # Auto-discovered loaders
            *sorted(_auto_loaders.keys()),
        ]
    )
)
|
||||
|
||||
Reference in New Issue
Block a user