mirror of
https://github.com/apache/superset.git
synced 2026-04-07 18:35:15 +00:00
192 lines
6.7 KiB
Python
192 lines
6.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
"""Auto-discover and load example datasets from Parquet files."""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, Optional
|
|
|
|
import yaml
|
|
|
|
# Import loaders that have custom logic (dashboards, CSS, etc.)
|
|
from superset.cli.test_loaders import load_big_data
|
|
|
|
from .css_templates import load_css_templates
|
|
|
|
# Import generic loader for Parquet datasets
|
|
from .generic_loader import create_generic_loader
|
|
from .utils import load_examples_from_configs
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
|
|
"""Read table_name, schema, and data_file from dataset.yaml if it exists."""
|
|
result: Dict[str, Optional[str]] = {
|
|
"table_name": None,
|
|
"schema": None,
|
|
"data_file": None,
|
|
"uuid": None,
|
|
}
|
|
dataset_yaml = example_dir / "dataset.yaml"
|
|
if dataset_yaml.exists():
|
|
try:
|
|
with open(dataset_yaml) as f:
|
|
config = yaml.safe_load(f)
|
|
result["table_name"] = config.get("table_name")
|
|
result["data_file"] = config.get("data_file")
|
|
result["uuid"] = config.get("uuid")
|
|
schema = config.get("schema")
|
|
# Treat SQLite's 'main' schema as null (use target database default)
|
|
result["schema"] = None if schema == "main" else schema
|
|
except Exception:
|
|
logger.debug("Could not read dataset.yaml from %s", example_dir)
|
|
return result
|
|
|
|
|
|
def get_examples_directory() -> Path:
|
|
"""Get the path to the examples directory."""
|
|
from .helpers import get_examples_folder
|
|
|
|
return Path(get_examples_folder())
|
|
|
|
|
|
def _get_multi_dataset_config(
|
|
example_dir: Path, dataset_name: str, data_file: Path
|
|
) -> Dict[str, Any]:
|
|
"""Read config for a multi-dataset example from datasets/{name}.yaml."""
|
|
datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
|
|
result: Dict[str, Any] = {
|
|
"table_name": dataset_name,
|
|
"schema": None,
|
|
"data_file": data_file,
|
|
}
|
|
|
|
if not datasets_yaml.exists():
|
|
return result
|
|
|
|
try:
|
|
with open(datasets_yaml) as f:
|
|
yaml_config = yaml.safe_load(f)
|
|
result["table_name"] = yaml_config.get("table_name") or dataset_name
|
|
result["uuid"] = yaml_config.get("uuid")
|
|
raw_schema = yaml_config.get("schema")
|
|
result["schema"] = None if raw_schema == "main" else raw_schema
|
|
|
|
# Use explicit data_file from YAML if specified
|
|
explicit_data_file = yaml_config.get("data_file")
|
|
if explicit_data_file:
|
|
candidate = example_dir / "data" / explicit_data_file
|
|
if candidate.exists():
|
|
result["data_file"] = candidate
|
|
else:
|
|
logger.warning(
|
|
"data_file '%s' specified in YAML does not exist",
|
|
explicit_data_file,
|
|
)
|
|
except Exception:
|
|
logger.debug("Could not read datasets yaml from %s", datasets_yaml)
|
|
|
|
return result
|
|
|
|
|
|
def discover_datasets() -> Dict[str, Callable[..., None]]:
|
|
"""Auto-discover all example datasets and create loaders for them.
|
|
|
|
Examples are organized as:
|
|
superset/examples/{example_name}/data.parquet # Single dataset
|
|
superset/examples/{example_name}/data/{name}.parquet # Multiple datasets
|
|
|
|
Table names and data file references are read from dataset.yaml/datasets/*.yaml
|
|
if present, otherwise derived from the folder/file name.
|
|
"""
|
|
loaders: Dict[str, Callable[..., None]] = {}
|
|
examples_dir = get_examples_directory()
|
|
|
|
if not examples_dir.exists():
|
|
return loaders
|
|
|
|
# Discover single data.parquet files (simple examples)
|
|
for data_file in sorted(examples_dir.glob("*/data.parquet")):
|
|
example_dir = data_file.parent
|
|
dataset_name = example_dir.name
|
|
|
|
if dataset_name.startswith("_"):
|
|
continue
|
|
|
|
config = get_dataset_config_from_yaml(example_dir)
|
|
table_name = config["table_name"] or dataset_name
|
|
explicit_data_file = config.get("data_file")
|
|
if explicit_data_file:
|
|
resolved_file = example_dir / explicit_data_file
|
|
else:
|
|
resolved_file = data_file
|
|
if explicit_data_file and not resolved_file.exists():
|
|
logger.warning("data_file '%s' does not exist", explicit_data_file)
|
|
resolved_file = data_file
|
|
|
|
loader_name = f"load_{dataset_name}"
|
|
loaders[loader_name] = create_generic_loader(
|
|
dataset_name,
|
|
table_name=table_name,
|
|
schema=config["schema"],
|
|
data_file=resolved_file,
|
|
uuid=config.get("uuid"),
|
|
)
|
|
|
|
# Discover multiple parquet files in data/ folders (complex examples)
|
|
for data_file in sorted(examples_dir.glob("*/data/*.parquet")):
|
|
dataset_name = data_file.stem
|
|
example_dir = data_file.parent.parent
|
|
|
|
if example_dir.name.startswith("_"):
|
|
continue
|
|
|
|
config = _get_multi_dataset_config(example_dir, dataset_name, data_file)
|
|
loader_name = f"load_{dataset_name}"
|
|
if loader_name not in loaders:
|
|
loaders[loader_name] = create_generic_loader(
|
|
dataset_name,
|
|
table_name=config["table_name"],
|
|
schema=config["schema"],
|
|
data_file=config["data_file"],
|
|
uuid=config.get("uuid"),
|
|
)
|
|
|
|
return loaders
|
|
|
|
|
|
# Auto-discover and create all dataset loaders
|
|
try:
|
|
_auto_loaders = discover_datasets()
|
|
except RuntimeError:
|
|
# Outside Flask app context (e.g., tests, tooling)
|
|
_auto_loaders = {}
|
|
|
|
# Add auto-discovered loaders to module namespace
|
|
globals().update(_auto_loaders)
|
|
|
|
# Build __all__ list dynamically
|
|
__all__ = [
|
|
# Custom loaders (always included)
|
|
"load_big_data",
|
|
"load_css_templates",
|
|
"load_examples_from_configs",
|
|
# Auto-discovered loaders
|
|
*sorted(_auto_loaders.keys()),
|
|
]
|