mirror of
https://github.com/apache/superset.git
synced 2026-04-23 18:14:56 +00:00
feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -14,44 +14,173 @@
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from .bart_lines import load_bart_lines
|
||||
from .big_data import load_big_data
|
||||
from .birth_names import load_birth_names
|
||||
from .country_map import load_country_map_data
|
||||
from .css_templates import load_css_templates
|
||||
from .deck import load_deck_dash
|
||||
from .energy import load_energy
|
||||
from .flights import load_flights
|
||||
from .international_sales import load_international_sales
|
||||
from .long_lat import load_long_lat_data
|
||||
from .misc_dashboard import load_misc_dashboard
|
||||
from .multiformat_time_series import load_multiformat_time_series
|
||||
from .paris import load_paris_iris_geojson
|
||||
from .random_time_series import load_random_time_series_data
|
||||
from .sf_population_polygons import load_sf_population_polygons
|
||||
from .supported_charts_dashboard import load_supported_charts_dashboard
|
||||
from .tabbed_dashboard import load_tabbed_dashboard
|
||||
from .utils import load_examples_from_configs
|
||||
from .world_bank import load_world_bank_health_n_pop
|
||||
"""Auto-discover and load example datasets from Parquet files."""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
|
||||
import yaml
|
||||
|
||||
# Import loaders that have custom logic (dashboards, CSS, etc.)
|
||||
from superset.cli.test_loaders import load_big_data
|
||||
|
||||
from .css_templates import load_css_templates
|
||||
|
||||
# Import generic loader for Parquet datasets
|
||||
from .generic_loader import create_generic_loader
|
||||
from .utils import load_examples_from_configs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
    """Read table_name, schema, and data_file from dataset.yaml if it exists.

    :param example_dir: directory of a single example dataset
    :returns: mapping with keys ``table_name``, ``schema`` and ``data_file``;
        each value is ``None`` when not configured
    """
    result: Dict[str, Optional[str]] = {
        "table_name": None,
        "schema": None,
        "data_file": None,
    }
    dataset_yaml = example_dir / "dataset.yaml"
    if dataset_yaml.exists():
        try:
            with open(dataset_yaml) as f:
                # safe_load returns None for an empty document; fall back to {}
                # so the .get() calls below don't raise AttributeError
                config = yaml.safe_load(f) or {}
            result["table_name"] = config.get("table_name")
            result["data_file"] = config.get("data_file")
            schema = config.get("schema")
            # Treat SQLite's 'main' schema as null (use target database default)
            result["schema"] = None if schema == "main" else schema
        except (OSError, yaml.YAMLError):
            # Best-effort: fall back to defaults derived from the folder name
            logger.debug("Could not read dataset.yaml from %s", example_dir)
    return result
|
||||
|
||||
|
||||
def get_examples_directory() -> Path:
    """Return the bundled examples folder as a :class:`Path`."""
    # Imported lazily to avoid a circular import at module load time.
    from .helpers import get_examples_folder

    examples_folder = get_examples_folder()
    return Path(examples_folder)
|
||||
|
||||
|
||||
def _get_multi_dataset_config(
|
||||
example_dir: Path, dataset_name: str, data_file: Path
|
||||
) -> Dict[str, Any]:
|
||||
"""Read config for a multi-dataset example from datasets/{name}.yaml."""
|
||||
datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
|
||||
result: Dict[str, Any] = {
|
||||
"table_name": dataset_name,
|
||||
"schema": None,
|
||||
"data_file": data_file,
|
||||
}
|
||||
|
||||
if not datasets_yaml.exists():
|
||||
return result
|
||||
|
||||
try:
|
||||
with open(datasets_yaml) as f:
|
||||
yaml_config = yaml.safe_load(f)
|
||||
result["table_name"] = yaml_config.get("table_name") or dataset_name
|
||||
raw_schema = yaml_config.get("schema")
|
||||
result["schema"] = None if raw_schema == "main" else raw_schema
|
||||
|
||||
# Use explicit data_file from YAML if specified
|
||||
explicit_data_file = yaml_config.get("data_file")
|
||||
if explicit_data_file:
|
||||
candidate = example_dir / "data" / explicit_data_file
|
||||
if candidate.exists():
|
||||
result["data_file"] = candidate
|
||||
else:
|
||||
logger.warning(
|
||||
"data_file '%s' specified in YAML does not exist",
|
||||
explicit_data_file,
|
||||
)
|
||||
except Exception:
|
||||
logger.debug("Could not read datasets yaml from %s", datasets_yaml)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def discover_datasets() -> Dict[str, Callable[..., None]]:
    """Auto-discover all example datasets and create loaders for them.

    Examples are organized as:
        superset/examples/{example_name}/data.parquet         # Single dataset
        superset/examples/{example_name}/data/{name}.parquet  # Multiple datasets

    Table names and data file references are read from dataset.yaml/datasets/*.yaml
    if present, otherwise derived from the folder/file name.
    """
    discovered: Dict[str, Callable[..., None]] = {}
    root = get_examples_directory()

    if not root.exists():
        return discovered

    # Pass 1: simple examples with a single data.parquet at the folder root.
    for parquet_path in sorted(root.glob("*/data.parquet")):
        folder = parquet_path.parent
        name = folder.name
        if name.startswith("_"):
            continue

        cfg = get_dataset_config_from_yaml(folder)
        chosen_file = parquet_path
        override = cfg.get("data_file")
        if override:
            candidate = folder / override
            if candidate.exists():
                chosen_file = candidate
            else:
                # Fall back to the discovered data.parquet
                logger.warning("data_file '%s' does not exist", override)

        discovered[f"load_{name}"] = create_generic_loader(
            name,
            table_name=cfg["table_name"] or name,
            schema=cfg["schema"],
            data_file=chosen_file,
        )

    # Pass 2: complex examples with several parquet files under data/.
    for parquet_path in sorted(root.glob("*/data/*.parquet")):
        name = parquet_path.stem
        folder = parquet_path.parent.parent
        if folder.name.startswith("_"):
            continue

        cfg = _get_multi_dataset_config(folder, name, parquet_path)
        key = f"load_{name}"
        # First discovery wins; never clobber a loader created in pass 1.
        if key not in discovered:
            discovered[key] = create_generic_loader(
                name,
                table_name=cfg["table_name"],
                schema=cfg["schema"],
                data_file=cfg["data_file"],
            )

    return discovered
|
||||
|
||||
|
||||
# Auto-discover and create all dataset loaders
|
||||
# Auto-discover and create all dataset loaders
try:
    _auto_loaders = discover_datasets()
except RuntimeError:
    # Outside Flask app context (e.g., tests, tooling)
    _auto_loaders = {}

# Add auto-discovered loaders to module namespace
globals().update(_auto_loaders)

# Build __all__ list dynamically. dict.fromkeys dedupes while preserving
# order, in case an auto-discovered loader shares a name with one of the
# statically listed loaders below.
__all__ = list(
    dict.fromkeys(
        [
            "load_bart_lines",
            # Custom loaders (always included)
            "load_big_data",
            "load_birth_names",
            "load_country_map_data",
            "load_css_templates",
            "load_international_sales",
            "load_deck_dash",
            "load_energy",
            "load_flights",
            "load_long_lat_data",
            "load_misc_dashboard",
            "load_multiformat_time_series",
            "load_paris_iris_geojson",
            "load_random_time_series_data",
            "load_sf_population_polygons",
            "load_supported_charts_dashboard",
            "load_tabbed_dashboard",
            "load_examples_from_configs",
            "load_world_bank_health_n_pop",
            # Auto-discovered loaders
            *sorted(_auto_loaders.keys()),
        ]
    )
)
|
||||
|
||||
Reference in New Issue
Block a user