mirror of
https://github.com/apache/superset.git
synced 2026-05-28 11:15:24 +00:00
Follow-up to #40231 (merged), where a reviewer flagged a function-body `from datetime import datetime, timedelta` instead of a top-of-file import. Adds a `ruff-import-placement` pre-commit hook running `ruff check --select PLC0415 --preview --no-fix`. Per @rusackas's pushback on the first cut of this PR — which spammed 2,657 `# noqa: PLC0415` annotations across ~410 files without fixing anything — this revision is a much smaller surface area: 1. **Per-file-ignores** for whole directories where function-body imports are a deliberate pattern, not an oversight: - `superset/cli/**` and `scripts/**`: subcommand-deferred imports keep heavy modules out of the CLI startup path. - `superset/tasks/**`: Celery task bodies defer imports of the modules they orchestrate. - `superset/migrations/versions/**`: Alembic migrations interact with model state at runtime, not at module load. - `superset/mcp_service/**`: MCP tools lazy-load resources on invocation so the server can register many tools without paying their import cost at startup. - `superset/db_engine_specs/**`: engine specs defer driver imports so optional DB drivers don't have to be installed. - `superset/initialization/__init__.py`, `superset/extensions/__init__.py`, `superset/app.py`: the app-factory and extension wiring are intentionally full of circular-import workarounds. - `tests/**`: test files routinely defer imports for fixture isolation; the rule still applies to production code. 2. **Per-line `# noqa: PLC0415`** on the 259 remaining genuine circular-import sites (security/manager.py, sql/execution/executor.py, semantic_layers/labels.py, tags/core.py, core_api_injection.py, etc.). These are foundational modules where moving the imports up would actually break things. Net result: ~410 files / 2,657 grandfathered → ~73 files / 259 actual noqa annotations. The rule still catches every new function-body import outside the explicitly-allowed directories. 
Also: silences a pre-existing C901 on `mcp_service/sql_lab/tool/execute_sql.py` that fires under newer local ruff but not CI's pinned ruff 0.9.7 — blocks the local pre-commit run otherwise. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
192 lines
6.7 KiB
Python
192 lines
6.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
"""Auto-discover and load example datasets from Parquet files."""
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, Optional
|
|
|
|
import yaml
|
|
|
|
# Import loaders that have custom logic (dashboards, CSS, etc.)
|
|
from superset.cli.test_loaders import load_big_data
|
|
|
|
from .css_templates import load_css_templates
|
|
|
|
# Import generic loader for Parquet datasets
|
|
from .generic_loader import create_generic_loader
|
|
from .utils import load_examples_from_configs
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_dataset_config_from_yaml(example_dir: Path) -> Dict[str, Optional[str]]:
    """Read table_name, schema, data_file and uuid from dataset.yaml if it exists.

    Args:
        example_dir: Directory that may contain a ``dataset.yaml`` file.

    Returns:
        A dict with the keys ``table_name``, ``schema``, ``data_file`` and
        ``uuid``. Every value is ``None`` when the file is missing, cannot be
        read or parsed, or does not hold a YAML mapping. A ``schema`` value of
        ``"main"`` is reported as ``None``.
    """
    result: Dict[str, Optional[str]] = {
        "table_name": None,
        "schema": None,
        "data_file": None,
        "uuid": None,
    }
    dataset_yaml = example_dir / "dataset.yaml"
    if not dataset_yaml.exists():
        return result

    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        with open(dataset_yaml, encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # An empty document parses to None and a top-level list/scalar has no
        # .get(); treat both as "no usable config" rather than as an error.
        if not isinstance(config, dict):
            logger.debug(
                "dataset.yaml in %s is empty or not a mapping", example_dir
            )
            return result

        result["table_name"] = config.get("table_name")
        result["data_file"] = config.get("data_file")
        result["uuid"] = config.get("uuid")
        schema = config.get("schema")
        # Treat SQLite's 'main' schema as null (use target database default)
        result["schema"] = None if schema == "main" else schema
    except Exception:
        # Deliberately broad: reading the config is best-effort and must not
        # raise; the traceback is kept at debug level for diagnosis.
        logger.debug(
            "Could not read dataset.yaml from %s", example_dir, exc_info=True
        )
    return result
|
|
|
|
|
|
def get_examples_directory() -> Path:
    """Return the examples folder reported by ``helpers`` as a ``Path``.

    The helper is imported inside the function body rather than at module
    level; the ``noqa`` marks that placement as intentional.
    """
    from .helpers import get_examples_folder  # noqa: PLC0415

    examples_folder = get_examples_folder()
    return Path(examples_folder)
|
|
|
|
|
|
def _get_multi_dataset_config(
|
|
example_dir: Path, dataset_name: str, data_file: Path
|
|
) -> Dict[str, Any]:
|
|
"""Read config for a multi-dataset example from datasets/{name}.yaml."""
|
|
datasets_yaml = example_dir / "datasets" / f"{dataset_name}.yaml"
|
|
result: Dict[str, Any] = {
|
|
"table_name": dataset_name,
|
|
"schema": None,
|
|
"data_file": data_file,
|
|
}
|
|
|
|
if not datasets_yaml.exists():
|
|
return result
|
|
|
|
try:
|
|
with open(datasets_yaml) as f:
|
|
yaml_config = yaml.safe_load(f)
|
|
result["table_name"] = yaml_config.get("table_name") or dataset_name
|
|
result["uuid"] = yaml_config.get("uuid")
|
|
raw_schema = yaml_config.get("schema")
|
|
result["schema"] = None if raw_schema == "main" else raw_schema
|
|
|
|
# Use explicit data_file from YAML if specified
|
|
explicit_data_file = yaml_config.get("data_file")
|
|
if explicit_data_file:
|
|
candidate = example_dir / "data" / explicit_data_file
|
|
if candidate.exists():
|
|
result["data_file"] = candidate
|
|
else:
|
|
logger.warning(
|
|
"data_file '%s' specified in YAML does not exist",
|
|
explicit_data_file,
|
|
)
|
|
except Exception:
|
|
logger.debug("Could not read datasets yaml from %s", datasets_yaml)
|
|
|
|
return result
|
|
|
|
|
|
def discover_datasets() -> Dict[str, Callable[..., None]]:
    """Build a generic loader for every example dataset found on disk.

    Two layouts are scanned beneath the examples directory:
        superset/examples/{example_name}/data.parquet         # Single dataset
        superset/examples/{example_name}/data/{name}.parquet  # Multiple datasets

    Table names and data file references come from dataset.yaml or
    datasets/*.yaml when present; otherwise they are derived from the
    folder/file name. Example folders whose name starts with ``_`` are
    skipped.

    Returns:
        A mapping of ``load_{dataset_name}`` to the loader produced by
        ``create_generic_loader``. It is empty when the examples directory
        does not exist. If both layouts yield the same loader name, the
        single-dataset loader is kept.
    """
    discovered: Dict[str, Callable[..., None]] = {}
    root = get_examples_directory()

    if not root.exists():
        return discovered

    # Pass 1: one data.parquet directly inside each example folder.
    for parquet_path in sorted(root.glob("*/data.parquet")):
        folder = parquet_path.parent
        name = folder.name
        if name.startswith("_"):
            continue

        yaml_cfg = get_dataset_config_from_yaml(folder)
        override = yaml_cfg.get("data_file")
        source_file = folder / override if override else parquet_path
        if override and not source_file.exists():
            # The YAML points at a missing file; use the discovered one.
            logger.warning("data_file '%s' does not exist", override)
            source_file = parquet_path

        discovered[f"load_{name}"] = create_generic_loader(
            name,
            table_name=yaml_cfg["table_name"] or name,
            schema=yaml_cfg["schema"],
            data_file=source_file,
            uuid=yaml_cfg.get("uuid"),
        )

    # Pass 2: several parquet files inside an example's data/ folder.
    for parquet_path in sorted(root.glob("*/data/*.parquet")):
        folder = parquet_path.parent.parent
        if folder.name.startswith("_"):
            continue

        name = parquet_path.stem
        multi_cfg = _get_multi_dataset_config(folder, name, parquet_path)
        key = f"load_{name}"
        if key in discovered:
            # A loader registered earlier under this name is kept.
            continue

        discovered[key] = create_generic_loader(
            name,
            table_name=multi_cfg["table_name"],
            schema=multi_cfg["schema"],
            data_file=multi_cfg["data_file"],
            uuid=multi_cfg.get("uuid"),
        )

    return discovered
|
|
|
|
|
|
# Run discovery once at import time to build every auto-discovered loader.
try:
    _auto_loaders = discover_datasets()
except RuntimeError:
    # Outside Flask app context (e.g., tests, tooling): expose no
    # auto-discovered loaders instead of failing the import.
    _auto_loaders = {}

# Publish each discovered loader as a module-level attribute.
globals().update(_auto_loaders)

# Public API: the fixed custom loaders followed by the discovered loader
# names in sorted order.
__all__ = [
    # Custom loaders (always included)
    "load_big_data",
    "load_css_templates",
    "load_examples_from_configs",
    # Auto-discovered loaders
    *sorted(_auto_loaders),
]
|