Files
superset2/superset/examples/data_loading.py
Claude Code dfd3f7b316 ci(lint): enforce no function-body imports (PLC0415) with targeted ignores
Follow-up to #40231 (merged), where a reviewer flagged a function-body
`from datetime import datetime, timedelta` instead of a top-of-file
import. Adds a `ruff-import-placement` pre-commit hook running
`ruff check --select PLC0415 --preview --no-fix`.

Per @rusackas's pushback on the first cut of this PR — which spammed
2,657 `# noqa: PLC0415` annotations across ~410 files without fixing
anything — this revision has a much smaller surface area:

1. **Per-file-ignores** for whole directories where function-body
   imports are a deliberate pattern, not an oversight:
   - `superset/cli/**` and `scripts/**`: subcommand-deferred imports
     keep heavy modules out of the CLI startup path.
   - `superset/tasks/**`: Celery task bodies defer imports of the
     modules they orchestrate.
   - `superset/migrations/versions/**`: Alembic migrations interact
     with model state at runtime, not at module load.
   - `superset/mcp_service/**`: MCP tools lazy-load resources on
     invocation so the server can register many tools without paying
     their import cost at startup.
   - `superset/db_engine_specs/**`: engine specs defer driver imports
     so optional DB drivers don't have to be installed.
   - `superset/initialization/__init__.py`, `superset/extensions/__init__.py`,
     `superset/app.py`: the app-factory and extension wiring are
     intentionally full of circular-import workarounds.
   - `tests/**`: test files routinely defer imports for fixture
     isolation; the rule still applies to production code.

2. **Per-line `# noqa: PLC0415`** on the 259 remaining genuine
   circular-import sites (security/manager.py, sql/execution/executor.py,
   semantic_layers/labels.py, tags/core.py, core_api_injection.py, etc.).
   These are foundational modules where moving the imports up would
   actually break things.

Net result: ~410 files / 2,657 grandfathered → ~73 files / 259 actual
noqa annotations. The rule still catches every new function-body
import outside the explicitly-allowed directories.

Also: silences a pre-existing C901 on `mcp_service/sql_lab/tool/execute_sql.py`
that fires under newer local ruff but not under CI's pinned ruff 0.9.7;
without this, the local pre-commit run is blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 13:55:14 -07:00

192 lines
6.7 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Auto-discover and load example datasets from Parquet files."""
import logging
from pathlib import Path
from typing import Any, Callable, Dict, Optional
import yaml
# Import loaders that have custom logic (dashboards, CSS, etc.)
from superset.cli.test_loaders import load_big_data
from .css_templates import load_css_templates
# Import generic loader for Parquet datasets
from .generic_loader import create_generic_loader
from .utils import load_examples_from_configs
logger = logging.getLogger(__name__)
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
    """Read table_name, schema, data_file and uuid from dataset.yaml if it exists.

    Args:
        example_dir: Directory of a single example; ``dataset.yaml`` is looked
            up directly inside it.

    Returns:
        A dict with the keys ``table_name``, ``schema``, ``data_file`` and
        ``uuid``. Every value is ``None`` when the file is missing, cannot be
        read or parsed, or does not hold a YAML mapping. A schema of
        ``"main"`` is reported as ``None``.
    """
    result: Dict[str, Optional[str]] = {
        "table_name": None,
        "schema": None,
        "data_file": None,
        "uuid": None,
    }
    dataset_yaml = example_dir / "dataset.yaml"
    if not dataset_yaml.exists():
        return result

    try:
        # Explicit encoding: do not let the platform locale decide how the
        # YAML file is decoded.
        with open(dataset_yaml, encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except Exception:
        # Deliberately best-effort: a broken dataset.yaml must not abort
        # discovery. Keep the cause in the log so it can be diagnosed.
        logger.debug(
            "Could not read dataset.yaml from %s", example_dir, exc_info=True
        )
        return result

    if not isinstance(config, dict):
        # An empty file parses to None and a top-level list has no .get();
        # neither carries usable settings.
        logger.debug("Could not read dataset.yaml from %s", example_dir)
        return result

    result["table_name"] = config.get("table_name")
    result["data_file"] = config.get("data_file")
    result["uuid"] = config.get("uuid")
    schema = config.get("schema")
    # Treat SQLite's 'main' schema as null (use target database default)
    result["schema"] = None if schema == "main" else schema
    return result
def get_examples_directory() -> Path:
    """Return the examples folder reported by ``helpers`` as a ``Path``."""
    # Imported at call time rather than at module load; the lint exemption is
    # intentional (presumably a circular-import workaround -- confirm before
    # hoisting this to the top of the file).
    from .helpers import get_examples_folder  # noqa: PLC0415

    examples_folder = get_examples_folder()
    return Path(examples_folder)
def _get_multi_dataset_config(
    example_dir: Path, dataset_name: str, data_file: Path
) -> Dict[str, Any]:
    """Read config for a multi-dataset example from datasets/{name}.yaml.

    Args:
        example_dir: Root directory of the example.
        dataset_name: Name of the dataset; selects
            ``datasets/{dataset_name}.yaml`` and is the fallback table name.
        data_file: Data file to use when the YAML does not name an existing
            replacement.

    Returns:
        A dict with ``table_name``, ``schema`` and ``data_file``. A ``uuid``
        key is added only when the YAML file was read successfully. A schema
        of ``"main"`` is reported as ``None``. If the YAML is missing,
        unreadable, or not a mapping, the defaults derived from the arguments
        are returned.
    """
    datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
    result: Dict[str, Any] = {
        "table_name": dataset_name,
        "schema": None,
        "data_file": data_file,
    }
    if not datasets_yaml.exists():
        return result

    try:
        # Explicit encoding: do not let the platform locale decide how the
        # YAML file is decoded.
        with open(datasets_yaml, encoding="utf-8") as f:
            yaml_config = yaml.safe_load(f)
    except Exception:
        # Deliberately best-effort: a broken YAML file must not abort
        # discovery. Keep the cause in the log so it can be diagnosed.
        logger.debug(
            "Could not read datasets yaml from %s", datasets_yaml, exc_info=True
        )
        return result

    if not isinstance(yaml_config, dict):
        # An empty file parses to None and a top-level list has no .get();
        # neither carries usable settings.
        logger.debug("Could not read datasets yaml from %s", datasets_yaml)
        return result

    result["table_name"] = yaml_config.get("table_name") or dataset_name
    result["uuid"] = yaml_config.get("uuid")
    raw_schema = yaml_config.get("schema")
    result["schema"] = None if raw_schema == "main" else raw_schema

    # Use explicit data_file from YAML if specified
    explicit_data_file = yaml_config.get("data_file")
    if explicit_data_file:
        # str() guards against a non-string YAML scalar (e.g. a bare number),
        # which cannot be joined onto a Path directly.
        candidate = example_dir / "data" / str(explicit_data_file)
        if candidate.exists():
            result["data_file"] = candidate
        else:
            logger.warning(
                "data_file '%s' specified in YAML does not exist",
                explicit_data_file,
            )
    return result
def discover_datasets() -> Dict[str, Callable[..., None]]:
    """Build a generic loader for every example dataset found on disk.

    Two layouts are scanned below the examples directory:

        {example_name}/data.parquet          one dataset per example
        {example_name}/data/{name}.parquet   several datasets per example

    Table name, schema, uuid and an optional alternative data file come from
    ``dataset.yaml`` / ``datasets/{name}.yaml`` when present; otherwise the
    folder or file name is used. Examples whose folder name starts with ``_``
    are skipped.

    Returns:
        Mapping of ``load_{dataset_name}`` to the loader created for that
        dataset; empty when the examples directory does not exist.
    """
    loaders: Dict[str, Callable[..., None]] = {}
    examples_dir = get_examples_directory()
    if not examples_dir.exists():
        return loaders

    # Pass 1: simple examples with a single data.parquet in the example folder.
    for parquet_path in sorted(examples_dir.glob("*/data.parquet")):
        example_dir = parquet_path.parent
        dataset_name = example_dir.name
        if dataset_name.startswith("_"):
            continue

        config = get_dataset_config_from_yaml(example_dir)
        table_name = config["table_name"] or dataset_name

        # Default to the discovered file; switch to the YAML-specified file
        # only if it is actually there.
        resolved_file = parquet_path
        explicit_data_file = config.get("data_file")
        if explicit_data_file:
            override = example_dir / explicit_data_file
            if override.exists():
                resolved_file = override
            else:
                logger.warning("data_file '%s' does not exist", explicit_data_file)

        loaders[f"load_{dataset_name}"] = create_generic_loader(
            dataset_name,
            table_name=table_name,
            schema=config["schema"],
            data_file=resolved_file,
            uuid=config.get("uuid"),
        )

    # Pass 2: complex examples with several parquet files under data/.
    for parquet_path in sorted(examples_dir.glob("*/data/*.parquet")):
        example_dir = parquet_path.parent.parent
        if example_dir.name.startswith("_"):
            continue

        dataset_name = parquet_path.stem
        config = _get_multi_dataset_config(example_dir, dataset_name, parquet_path)

        loader_name = f"load_{dataset_name}"
        if loader_name in loaders:
            # A loader registered earlier under the same name takes precedence.
            continue
        loaders[loader_name] = create_generic_loader(
            dataset_name,
            table_name=config["table_name"],
            schema=config["schema"],
            data_file=config["data_file"],
            uuid=config.get("uuid"),
        )

    return loaders
# Run discovery once, at import time, to build the dataset loaders.
try:
    _auto_loaders = discover_datasets()
except RuntimeError:
    # Raised when there is no Flask app context (e.g., tests, tooling);
    # fall back to exposing no auto-discovered loaders.
    _auto_loaders = {}

# Publish each discovered loader as a module-level attribute under its
# ``load_*`` name.
globals().update(_auto_loaders)

# Public API: the fixed custom loaders first, then whatever was discovered,
# in alphabetical order.
__all__ = [
    "load_big_data",
    "load_css_templates",
    "load_examples_from_configs",
    *sorted(_auto_loaders),
]