# superset2/superset/examples/data_loading.py

#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing,
#  software distributed under the License is distributed on an
#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#  KIND, either express or implied.  See the License for the
#  specific language governing permissions and limitations
#  under the License.
"""Auto-discover and load example datasets from Parquet files."""

import logging
from pathlib import Path
from typing import Any, Callable, Dict, Optional

import yaml

# Import loaders that have custom logic (dashboards, CSS, etc.)
from superset.cli.test_loaders import load_big_data

from .css_templates import load_css_templates

# Import generic loader for Parquet datasets
from .generic_loader import create_generic_loader
from .utils import load_examples_from_configs

logger = logging.getLogger(__name__)


def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
    """Read table_name, schema, data_file, and uuid from dataset.yaml if it exists.

    Args:
        example_dir: Directory of a single example; ``dataset.yaml`` is looked
            up directly inside it.

    Returns:
        A dict with the keys ``table_name``, ``schema``, ``data_file`` and
        ``uuid``. Every value is ``None`` when the file is missing, cannot be
        read or parsed, or does not hold a YAML mapping. A ``schema`` value of
        ``"main"`` is reported as ``None``.
    """
    result: Dict[str, Optional[str]] = {
        "table_name": None,
        "schema": None,
        "data_file": None,
        "uuid": None,
    }
    dataset_yaml = example_dir / "dataset.yaml"
    if not dataset_yaml.exists():
        return result

    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(dataset_yaml, encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except Exception:
        # Best-effort: an unreadable or malformed file must not abort
        # discovery, but keep the traceback available at debug level.
        logger.debug(
            "Could not read dataset.yaml from %s", example_dir, exc_info=True
        )
        return result

    # An empty file parses to None and a top-level list/scalar has no keys to
    # read; treat both the same as "no configuration".
    if not isinstance(config, dict):
        logger.debug("Could not read dataset.yaml from %s", example_dir)
        return result

    result["table_name"] = config.get("table_name")
    result["data_file"] = config.get("data_file")
    result["uuid"] = config.get("uuid")
    schema = config.get("schema")
    # Treat SQLite's 'main' schema as null (use target database default)
    result["schema"] = None if schema == "main" else schema
    return result


def get_examples_directory() -> Path:
    """Return the examples folder reported by ``get_examples_folder`` as a ``Path``."""
    # The import is deliberately function-scoped (hence the PLC0415
    # suppression), so the helper module is only loaded when this is called.
    from .helpers import get_examples_folder as _locate_examples  # noqa: PLC0415

    folder = _locate_examples()
    return Path(folder)


def _get_multi_dataset_config(
    example_dir: Path, dataset_name: str, data_file: Path
) -> Dict[str, Any]:
    """Read config for a multi-dataset example from datasets/{name}.yaml.

    Args:
        example_dir: Root directory of the example.
        dataset_name: Dataset name; selects ``datasets/{dataset_name}.yaml``
            and is the fallback table name.
        data_file: Discovered data file, used unless the YAML names an
            existing replacement under ``example_dir / "data"``.

    Returns:
        A dict with the keys ``table_name``, ``schema``, ``data_file`` and
        ``uuid``. When the YAML file is missing, unreadable, or not a mapping,
        the defaults are returned: ``dataset_name``, ``None``, ``data_file``
        and ``None`` respectively. A ``schema`` value of ``"main"`` is
        reported as ``None``.
    """
    datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
    result: Dict[str, Any] = {
        "table_name": dataset_name,
        "schema": None,
        "data_file": data_file,
        # Always present so the result has the same keys whether or not a
        # YAML file was found (mirrors get_dataset_config_from_yaml).
        "uuid": None,
    }

    if not datasets_yaml.exists():
        return result

    try:
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default locale encoding.
        with open(datasets_yaml, encoding="utf-8") as f:
            yaml_config = yaml.safe_load(f)
    except Exception:
        # Best-effort: a broken file must not abort discovery, but keep the
        # traceback available at debug level.
        logger.debug(
            "Could not read datasets yaml from %s", datasets_yaml, exc_info=True
        )
        return result

    # An empty file parses to None and a top-level list/scalar has no keys.
    if not isinstance(yaml_config, dict):
        logger.debug("Could not read datasets yaml from %s", datasets_yaml)
        return result

    result["table_name"] = yaml_config.get("table_name") or dataset_name
    result["uuid"] = yaml_config.get("uuid")
    raw_schema = yaml_config.get("schema")
    # Treat SQLite's 'main' schema as null (use target database default)
    result["schema"] = None if raw_schema == "main" else raw_schema

    # Use explicit data_file from YAML if specified
    explicit_data_file = yaml_config.get("data_file")
    if not explicit_data_file:
        return result

    # Only a string can be joined onto a path; any other YAML value is
    # ignored and the discovered file is kept.
    if not isinstance(explicit_data_file, str):
        logger.debug("Could not read datasets yaml from %s", datasets_yaml)
        return result

    candidate = example_dir / "data" / explicit_data_file
    if candidate.exists():
        result["data_file"] = candidate
    else:
        logger.warning(
            "data_file '%s' specified in YAML does not exist",
            explicit_data_file,
        )

    return result


def discover_datasets() -> Dict[str, Callable[..., None]]:
    """Scan the examples directory and build one loader per Parquet dataset.

    Two layouts are recognised:
        superset/examples/{example_name}/data.parquet           # Single dataset
        superset/examples/{example_name}/data/{name}.parquet    # Multiple datasets

    Table name, schema, uuid and an optional replacement data file come from
    dataset.yaml (single) or datasets/{name}.yaml (multiple) when present;
    otherwise the folder or file name is used. Example folders whose name
    starts with "_" are skipped.

    Returns:
        Mapping of ``load_{dataset_name}`` to the loader produced by
        ``create_generic_loader``. Empty when the examples directory does not
        exist. When a multi-dataset file would produce a name that is already
        registered, the existing loader is kept.
    """
    discovered: Dict[str, Callable[..., None]] = {}
    root = get_examples_directory()

    if not root.exists():
        return discovered

    # Pass 1: simple examples with a single data.parquet in the example folder.
    for parquet_path in sorted(root.glob("*/data.parquet")):
        folder = parquet_path.parent
        name = folder.name

        if name.startswith("_"):
            continue

        settings = get_dataset_config_from_yaml(folder)
        table = settings["table_name"] or name

        # Start from the discovered file; switch to the YAML-specified file
        # only when it actually exists.
        source_file = parquet_path
        override = settings.get("data_file")
        if override:
            candidate = folder / override
            if candidate.exists():
                source_file = candidate
            else:
                logger.warning("data_file '%s' does not exist", override)

        discovered[f"load_{name}"] = create_generic_loader(
            name,
            table_name=table,
            schema=settings["schema"],
            data_file=source_file,
            uuid=settings.get("uuid"),
        )

    # Pass 2: complex examples with several parquet files in a data/ folder.
    for parquet_path in sorted(root.glob("*/data/*.parquet")):
        folder = parquet_path.parent.parent
        name = parquet_path.stem

        if folder.name.startswith("_"):
            continue

        settings = _get_multi_dataset_config(folder, name, parquet_path)
        key = f"load_{name}"
        if key in discovered:
            # A loader with this name is already registered; keep it.
            continue

        discovered[key] = create_generic_loader(
            name,
            table_name=settings["table_name"],
            schema=settings["schema"],
            data_file=settings["data_file"],
            uuid=settings.get("uuid"),
        )

    return discovered


# Auto-discover and create all dataset loaders. This runs once, when the
# module is imported.
try:
    _auto_loaders = discover_datasets()
except RuntimeError:
    # Outside Flask app context (e.g., tests, tooling): fall back to an empty
    # mapping so that importing this module still succeeds.
    _auto_loaders = {}

# Add auto-discovered loaders to module namespace, so every ``load_<name>``
# callable is reachable as an attribute of this module.
globals().update(_auto_loaders)

# Build __all__ list dynamically: the fixed custom loaders first, followed by
# the names of the auto-discovered loaders in sorted order.
__all__ = [
    # Custom loaders (always included)
    "load_big_data",
    "load_css_templates",
    "load_examples_from_configs",
    # Auto-discovered loaders
    *sorted(_auto_loaders.keys()),
]