Files
superset2/superset/examples/data_loading.py
2026-02-02 09:24:16 -08:00

192 lines
6.7 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Auto-discover and load example datasets from Parquet files."""
import logging
from pathlib import Path
from typing import Any, Callable, Dict, Optional
import yaml
# Import loaders that have custom logic (dashboards, CSS, etc.)
from superset.cli.test_loaders import load_big_data
from .css_templates import load_css_templates
# Import generic loader for Parquet datasets
from .generic_loader import create_generic_loader
from .utils import load_examples_from_configs
logger = logging.getLogger(__name__)
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
"""Read table_name, schema, and data_file from dataset.yaml if it exists."""
result: Dict[str, Optional[str]] = {
"table_name": None,
"schema": None,
"data_file": None,
"uuid": None,
}
dataset_yaml = example_dir / "dataset.yaml"
if dataset_yaml.exists():
try:
with open(dataset_yaml) as f:
config = yaml.safe_load(f)
result["table_name"] = config.get("table_name")
result["data_file"] = config.get("data_file")
result["uuid"] = config.get("uuid")
schema = config.get("schema")
# Treat SQLite's 'main' schema as null (use target database default)
result["schema"] = None if schema == "main" else schema
except Exception:
logger.debug("Could not read dataset.yaml from %s", example_dir)
return result
def get_examples_directory() -> Path:
"""Get the path to the examples directory."""
from .helpers import get_examples_folder
return Path(get_examples_folder())
def _get_multi_dataset_config(
example_dir: Path, dataset_name: str, data_file: Path
) -> Dict[str, Any]:
"""Read config for a multi-dataset example from datasets/{name}.yaml."""
datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
result: Dict[str, Any] = {
"table_name": dataset_name,
"schema": None,
"data_file": data_file,
}
if not datasets_yaml.exists():
return result
try:
with open(datasets_yaml) as f:
yaml_config = yaml.safe_load(f)
result["table_name"] = yaml_config.get("table_name") or dataset_name
result["uuid"] = yaml_config.get("uuid")
raw_schema = yaml_config.get("schema")
result["schema"] = None if raw_schema == "main" else raw_schema
# Use explicit data_file from YAML if specified
explicit_data_file = yaml_config.get("data_file")
if explicit_data_file:
candidate = example_dir / "data" / explicit_data_file
if candidate.exists():
result["data_file"] = candidate
else:
logger.warning(
"data_file '%s' specified in YAML does not exist",
explicit_data_file,
)
except Exception:
logger.debug("Could not read datasets yaml from %s", datasets_yaml)
return result
def discover_datasets() -> Dict[str, Callable[..., None]]:
"""Auto-discover all example datasets and create loaders for them.
Examples are organized as:
superset/examples/{example_name}/data.parquet # Single dataset
superset/examples/{example_name}/data/{name}.parquet # Multiple datasets
Table names and data file references are read from dataset.yaml/datasets/*.yaml
if present, otherwise derived from the folder/file name.
"""
loaders: Dict[str, Callable[..., None]] = {}
examples_dir = get_examples_directory()
if not examples_dir.exists():
return loaders
# Discover single data.parquet files (simple examples)
for data_file in sorted(examples_dir.glob("*/data.parquet")):
example_dir = data_file.parent
dataset_name = example_dir.name
if dataset_name.startswith("_"):
continue
config = get_dataset_config_from_yaml(example_dir)
table_name = config["table_name"] or dataset_name
explicit_data_file = config.get("data_file")
if explicit_data_file:
resolved_file = example_dir / explicit_data_file
else:
resolved_file = data_file
if explicit_data_file and not resolved_file.exists():
logger.warning("data_file '%s' does not exist", explicit_data_file)
resolved_file = data_file
loader_name = f"load_{dataset_name}"
loaders[loader_name] = create_generic_loader(
dataset_name,
table_name=table_name,
schema=config["schema"],
data_file=resolved_file,
uuid=config.get("uuid"),
)
# Discover multiple parquet files in data/ folders (complex examples)
for data_file in sorted(examples_dir.glob("*/data/*.parquet")):
dataset_name = data_file.stem
example_dir = data_file.parent.parent
if example_dir.name.startswith("_"):
continue
config = _get_multi_dataset_config(example_dir, dataset_name, data_file)
loader_name = f"load_{dataset_name}"
if loader_name not in loaders:
loaders[loader_name] = create_generic_loader(
dataset_name,
table_name=config["table_name"],
schema=config["schema"],
data_file=config["data_file"],
uuid=config.get("uuid"),
)
return loaders
# Auto-discover and create all dataset loaders
try:
_auto_loaders = discover_datasets()
except RuntimeError:
# Outside Flask app context (e.g., tests, tooling)
_auto_loaders = {}
# Add auto-discovered loaders to module namespace
globals().update(_auto_loaders)
# Build __all__ list dynamically
__all__ = [
# Custom loaders (always included)
"load_big_data",
"load_css_templates",
"load_examples_from_configs",
# Auto-discovered loaders
*sorted(_auto_loaders.keys()),
]