feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Evan Rusackas
2026-01-21 12:42:15 -08:00
committed by GitHub
parent ec36791551
commit dee063a4c5
271 changed files with 23340 additions and 12971 deletions

View File

@@ -31,11 +31,167 @@ _logger = logging.getLogger(__name__)
YAML_EXTENSIONS = {".yaml", ".yml"}
def _normalize_dataset_schema(content: str) -> str:
"""Normalize schema in dataset YAML content.
Converts SQLite's 'main' schema to null for portability across databases.
"""
# Replace 'schema: main' with 'schema: null' to use target database default
return content.replace("schema: main", "schema: null")
def _read_file_if_exists(base: Any, path: Any) -> str | None:
"""Read file content if it exists, return None otherwise."""
file_path = base / str(path)
if file_path.is_file():
return file_path.read_text("utf-8")
return None
def _load_shared_configs(examples_root: Any) -> dict[str, str]:
    """Load shared database and metadata configs from the _shared directory.

    Returns a mapping of import paths (as expected by ImportExamplesCommand)
    to YAML content; empty when the _shared directory does not exist.
    """
    from flask import current_app

    base = files("superset")
    shared_dir = examples_root / "_shared"
    configs: dict[str, str] = {}

    if not (base / str(shared_dir)).is_dir():
        return configs

    # Database connection config -> databases/examples.yaml
    database_yaml = _read_file_if_exists(base, shared_dir / "database.yaml")
    if database_yaml:
        # Substitute the placeholder with the configured examples DB URI.
        examples_uri = current_app.config.get("SQLALCHEMY_EXAMPLES_URI", "")
        configs["databases/examples.yaml"] = database_yaml.replace(
            "__SQLALCHEMY_EXAMPLES_URI__", examples_uri
        )

    # Import metadata -> metadata.yaml
    metadata_yaml = _read_file_if_exists(base, shared_dir / "metadata.yaml")
    if metadata_yaml:
        configs["metadata.yaml"] = metadata_yaml

    return configs
def _should_skip_directory(item: Any) -> bool:
"""Check if directory should be skipped during traversal."""
name = str(item)
if name.startswith("_") or name.startswith("."):
return True
return name in ("configs", "data", "__pycache__")
def _load_datasets_from_folder(
    base: Any,
    datasets_dir: Any,
    test_re: re.Pattern[str],
    load_test_data: bool,
) -> dict[str, str]:
    """Load dataset configs from a datasets/ folder.

    Non-YAML files are ignored, and filenames matching ``test_re`` are
    excluded unless ``load_test_data`` is set. Each dataset's schema is
    normalized for cross-database portability.
    """
    results: dict[str, str] = {}
    folder = base / str(datasets_dir)
    if not folder.is_dir():
        return results

    for entry in folder.iterdir():
        filename = entry.name  # plain filename, not the full resource path
        if Path(filename).suffix.lower() not in YAML_EXTENSIONS:
            continue
        if not load_test_data and test_re.search(filename):
            continue
        raw = _read_file_if_exists(base, datasets_dir / filename)
        if raw:
            stem = Path(filename).stem
            results[f"datasets/examples/{stem}.yaml"] = _normalize_dataset_schema(raw)

    return results
def _load_charts_from_folder(
    base: Any,
    charts_dir: Any,
    example_name: str,
    test_re: re.Pattern[str],
    load_test_data: bool,
) -> dict[str, str]:
    """Load chart configs from a charts/ folder.

    Chart YAML files keep their original filenames, placed under
    ``charts/{example_name}/``. Filenames matching ``test_re`` are excluded
    unless ``load_test_data`` is set.
    """
    results: dict[str, str] = {}
    folder = base / str(charts_dir)
    if not folder.is_dir():
        return results

    for entry in folder.iterdir():
        filename = entry.name  # plain filename, not the full resource path
        if Path(filename).suffix.lower() not in YAML_EXTENSIONS:
            continue
        if not load_test_data and test_re.search(filename):
            continue
        raw = _read_file_if_exists(base, charts_dir / filename)
        if raw:
            results[f"charts/{example_name}/{filename}"] = raw

    return results
def _load_example_contents(
    example_dir: Any, example_name: str, test_re: re.Pattern[str], load_test_data: bool
) -> dict[str, str]:
    """Load all configs (dataset, dashboard, charts) from one example directory.

    Supports both layouts: a single root-level dataset.yaml for simple
    examples, and a datasets/ folder holding multiple files for complex ones.
    """
    base = files("superset")
    collected: dict[str, str] = {}

    # Single dataset.yaml at the example root (backward-compatible layout).
    # NOTE(review): test_re is matched against the literal name
    # "dataset.yaml" here, so with the in-file pattern (r"\.test\.") this
    # check never excludes anything — kept for callers with other patterns.
    root_dataset = _read_file_if_exists(base, example_dir / "dataset.yaml")
    if root_dataset and (load_test_data or not test_re.search("dataset.yaml")):
        collected[f"datasets/examples/{example_name}.yaml"] = (
            _normalize_dataset_schema(root_dataset)
        )

    # Additional datasets under datasets/.
    collected.update(
        _load_datasets_from_folder(
            base, example_dir / "datasets", test_re, load_test_data
        )
    )

    # Dashboard config, if present (same literal-name caveat as above).
    dashboard_yaml = _read_file_if_exists(base, example_dir / "dashboard.yaml")
    if dashboard_yaml and (load_test_data or not test_re.search("dashboard.yaml")):
        collected[f"dashboards/{example_name}.yaml"] = dashboard_yaml

    # Chart configs under charts/.
    collected.update(
        _load_charts_from_folder(
            base, example_dir / "charts", example_name, test_re, load_test_data
        )
    )

    return collected
def load_examples_from_configs(
    force_data: bool = False, load_test_data: bool = False
) -> None:
    """
    Load all the examples from the new directory structure.

    Examples are organized as:

        superset/examples/{example_name}/
            data.parquet    # Raw data (optional)
            dataset.yaml    # Single dataset metadata (simple examples)
            datasets/       # Multiple datasets (complex examples)
                dataset1.yaml
                dataset2.yaml
            dashboard.yaml  # Dashboard config (optional)
            charts/         # Chart configs (optional)
                chart1.yaml
                chart2.yaml

        superset/examples/_shared/
            database.yaml   # Database connection
            metadata.yaml   # Import metadata

    For simple examples with one dataset, use dataset.yaml at root.
    For complex examples with multiple datasets, use datasets/ folder.

    Args:
        force_data: Passed through to ImportExamplesCommand; presumably
            forces re-loading of raw data — TODO confirm against the command.
        load_test_data: If True, test fixtures (*.test.yaml) are included.
    """
    contents = load_contents(load_test_data)
    # overwrite=True: examples are always re-imported over existing assets.
    command = ImportExamplesCommand(contents, overwrite=True, force_data=force_data)
@@ -43,31 +199,45 @@ def load_examples_from_configs(
def load_contents(load_test_data: bool = False) -> dict[str, Any]:
    """Traverse example directories and load YAML configs.

    Builds the import structure expected by ImportExamplesCommand:

        databases/examples.yaml
        datasets/examples/{name}.yaml
        charts/{dashboard}/{chart}.yaml
        dashboards/{name}.yaml
        metadata.yaml

    Args:
        load_test_data: If True, includes test data files (*.test.yaml).
            If False, excludes test data files.

    Returns:
        Mapping of import path to YAML content.
    """
    # NOTE(review): this hunk arrived with the removed and added diff lines
    # interleaved; this is the reconstructed new implementation.
    examples_root = files("superset") / "examples"
    test_re = re.compile(r"\.test\.")
    base = files("superset")

    # Shared configs (_shared directory): database connection and metadata.
    contents: dict[str, str] = _load_shared_configs(examples_root)

    # Traverse example directories, one per example.
    for item in (base / str(examples_root)).iterdir():
        item_name = item.name  # just the directory name, not the full path
        example_dir = examples_root / item_name

        # Skip non-directories and special dirs (_shared, data, caches, ...).
        if not (base / str(example_dir)).is_dir():
            continue
        if _should_skip_directory(item_name):
            continue

        contents.update(
            _load_example_contents(example_dir, item_name, test_re, load_test_data)
        )

    return contents
def load_configs_from_directory(