mirror of
https://github.com/apache/superset.git
synced 2026-04-19 08:04:53 +00:00
feat(examples): Modernize example data loading with Parquet and YAML configs (#36538)
Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -31,11 +31,167 @@ _logger = logging.getLogger(__name__)
|
||||
# File suffixes treated as YAML config files when traversing example directories.
YAML_EXTENSIONS = {".yaml", ".yml"}
|
||||
|
||||
|
||||
def _normalize_dataset_schema(content: str) -> str:
    """Rewrite SQLite's 'main' schema to null in dataset YAML content.

    The exported examples reference SQLite's default 'main' schema; mapping it
    to null lets the target database fall back to its own default schema.
    """
    normalized = content.replace("schema: main", "schema: null")
    return normalized
|
||||
|
||||
|
||||
def _read_file_if_exists(base: Any, path: Any) -> str | None:
|
||||
"""Read file content if it exists, return None otherwise."""
|
||||
file_path = base / str(path)
|
||||
if file_path.is_file():
|
||||
return file_path.read_text("utf-8")
|
||||
return None
|
||||
|
||||
|
||||
def _load_shared_configs(examples_root: Any) -> dict[str, str]:
    """Load shared database and metadata configs from the _shared directory.

    Returns a mapping keyed by the import-bundle path
    (``databases/examples.yaml`` / ``metadata.yaml``); empty when the
    _shared directory does not exist.
    """
    from flask import current_app

    configs: dict[str, str] = {}
    package_root = files("superset")
    shared_dir = examples_root / "_shared"

    if not (package_root / str(shared_dir)).is_dir():
        return configs

    # Database config -> databases/examples.yaml
    database_yaml = _read_file_if_exists(package_root, shared_dir / "database.yaml")
    if database_yaml:
        # Swap the placeholder for the configured examples database URI
        examples_uri = current_app.config.get("SQLALCHEMY_EXAMPLES_URI", "")
        configs["databases/examples.yaml"] = database_yaml.replace(
            "__SQLALCHEMY_EXAMPLES_URI__", examples_uri
        )

    # Metadata -> metadata.yaml
    metadata_yaml = _read_file_if_exists(package_root, shared_dir / "metadata.yaml")
    if metadata_yaml:
        configs["metadata.yaml"] = metadata_yaml

    return configs
|
||||
|
||||
|
||||
def _should_skip_directory(item: Any) -> bool:
    """Return True for directories the example traversal must ignore."""
    dir_name = str(item)
    # Private/hidden entries plus the legacy/support folders are not examples.
    if dir_name.startswith(("_", ".")):
        return True
    return dir_name in ("configs", "data", "__pycache__")
|
||||
|
||||
|
||||
def _load_datasets_from_folder(
    base: Any,
    datasets_dir: Any,
    test_re: re.Pattern[str],
    load_test_data: bool,
) -> dict[str, str]:
    """Collect dataset YAML configs from a datasets/ folder.

    Keys follow the import-bundle layout ``datasets/examples/{name}.yaml``;
    test datasets (matched by *test_re*) are excluded unless *load_test_data*.
    """
    collected: dict[str, str] = {}
    folder = base / str(datasets_dir)
    if not folder.is_dir():
        return collected

    for entry in folder.iterdir():
        # Use only the filename, not the full resource path.
        filename = entry.name
        if Path(filename).suffix.lower() not in YAML_EXTENSIONS:
            continue
        if test_re.search(filename) and not load_test_data:
            continue
        raw = _read_file_if_exists(base, datasets_dir / filename)
        if raw:
            stem = Path(filename).stem
            collected[f"datasets/examples/{stem}.yaml"] = _normalize_dataset_schema(raw)
    return collected
|
||||
|
||||
|
||||
def _load_charts_from_folder(
    base: Any,
    charts_dir: Any,
    example_name: str,
    test_re: re.Pattern[str],
    load_test_data: bool,
) -> dict[str, str]:
    """Collect chart YAML configs from a charts/ folder.

    Keys follow the import-bundle layout ``charts/{example_name}/{chart}.yaml``;
    test charts (matched by *test_re*) are excluded unless *load_test_data*.
    """
    collected: dict[str, str] = {}
    folder = base / str(charts_dir)
    if not folder.is_dir():
        return collected

    for entry in folder.iterdir():
        # Use only the filename, not the full resource path.
        filename = entry.name
        if Path(filename).suffix.lower() not in YAML_EXTENSIONS:
            continue
        if test_re.search(filename) and not load_test_data:
            continue
        raw = _read_file_if_exists(base, charts_dir / filename)
        if raw:
            collected[f"charts/{example_name}/{filename}"] = raw
    return collected
|
||||
|
||||
|
||||
def _load_example_contents(
    example_dir: Any, example_name: str, test_re: re.Pattern[str], load_test_data: bool
) -> dict[str, str]:
    """Gather every config (datasets, dashboard, charts) for one example dir."""
    package_root = files("superset")
    collected: dict[str, str] = {}

    # Single dataset.yaml at the example root (backward compatible)
    root_dataset = _read_file_if_exists(package_root, example_dir / "dataset.yaml")
    if root_dataset and (load_test_data or not test_re.search("dataset.yaml")):
        collected[f"datasets/examples/{example_name}.yaml"] = (
            _normalize_dataset_schema(root_dataset)
        )

    # Multiple datasets in a datasets/ folder
    collected.update(
        _load_datasets_from_folder(
            package_root, example_dir / "datasets", test_re, load_test_data
        )
    )

    # Dashboard config
    dashboard = _read_file_if_exists(package_root, example_dir / "dashboard.yaml")
    if dashboard and (load_test_data or not test_re.search("dashboard.yaml")):
        collected[f"dashboards/{example_name}.yaml"] = dashboard

    # Chart configs
    collected.update(
        _load_charts_from_folder(
            package_root, example_dir / "charts", example_name, test_re, load_test_data
        )
    )

    return collected
|
||||
|
||||
|
||||
def load_examples_from_configs(
|
||||
force_data: bool = False, load_test_data: bool = False
|
||||
) -> None:
|
||||
"""
|
||||
Load all the examples inside superset/examples/configs/.
|
||||
Load all the examples from the new directory structure.
|
||||
|
||||
Examples are organized as:
|
||||
superset/examples/{example_name}/
|
||||
data.parquet # Raw data (optional)
|
||||
dataset.yaml # Single dataset metadata (simple examples)
|
||||
datasets/ # Multiple datasets (complex examples)
|
||||
dataset1.yaml
|
||||
dataset2.yaml
|
||||
dashboard.yaml # Dashboard config (optional)
|
||||
charts/ # Chart configs (optional)
|
||||
chart1.yaml
|
||||
chart2.yaml
|
||||
superset/examples/_shared/
|
||||
database.yaml # Database connection
|
||||
metadata.yaml # Import metadata
|
||||
|
||||
For simple examples with one dataset, use dataset.yaml at root.
|
||||
For complex examples with multiple datasets, use datasets/ folder.
|
||||
"""
|
||||
contents = load_contents(load_test_data)
|
||||
command = ImportExamplesCommand(contents, overwrite=True, force_data=force_data)
|
||||
@@ -43,31 +199,45 @@ def load_examples_from_configs(
|
||||
|
||||
|
||||
def load_contents(load_test_data: bool = False) -> dict[str, Any]:
    """Traverse example directories and load YAML configs.

    Builds import structure expected by ImportExamplesCommand:
        databases/examples.yaml
        datasets/examples/{name}.yaml
        charts/{dashboard}/{chart}.yaml
        dashboards/{name}.yaml
        metadata.yaml

    Args:
        load_test_data: If True, includes test data files (*.test.yaml).
            If False, excludes test data files.
    """
    examples_root = files("superset") / "examples"
    test_re = re.compile(r"\.test\.")
    base = files("superset")

    # Load shared configs (_shared directory)
    contents: dict[str, str] = _load_shared_configs(examples_root)

    # Traverse example directories
    for item in (base / str(examples_root)).iterdir():
        item_name = item.name  # Get just the directory name, not full path
        example_dir = examples_root / item_name

        # Skip non-directories and special dirs
        if not (base / str(example_dir)).is_dir():
            continue
        if _should_skip_directory(item_name):
            continue

        example_name = item_name

        example_contents = _load_example_contents(
            example_dir, example_name, test_re, load_test_data
        )
        contents.update(example_contents)

    return contents
|
||||
|
||||
|
||||
def load_configs_from_directory(
|
||||
|
||||
Reference in New Issue
Block a user