# superset/db_engine_specs/lint_metadata.py
# (export header: 2026-01-21 10:54:01 -08:00 · 706 lines · 25 KiB · Python)
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Metadata completeness linter for DB engine specs.
This script validates that all DB engine specs have complete metadata
as defined by the DBEngineSpecMetadata TypedDict in base.py.
Usage:
python superset/db_engine_specs/lint_metadata.py [--json] [--strict]
Options:
--json Output results as JSON
--strict Return non-zero exit code if any required fields are missing
--help Show this help message
The script categorizes metadata fields into:
- REQUIRED: Must be present for proper documentation
- RECOMMENDED: Should be present for good documentation
- OPTIONAL: Nice to have but not critical
Example output:
=== Metadata Completeness Report ===
PostgreSQL (postgres.py)
✓ description, categories, pypi_packages, connection_string
⚠ Missing recommended: logo, homepage_url
MySQL (mysql.py)
✓ All required and recommended fields present
"""
from __future__ import annotations
import argparse
import json # noqa: TID251 - standalone script, don't depend on superset.utils
import sys
from dataclasses import dataclass
from typing import Any
# Schema definition - fields grouped by importance.
# Keys are expected to mirror the DBEngineSpecMetadata TypedDict in base.py
# (presumably -- TODO confirm against base.py, which is outside this file).

# REQUIRED: every spec should define these; a spec that has metadata but is
# missing any of them fails --strict mode (see main()).  Weighted 60% of the
# completeness score in analyze_spec().
REQUIRED_FIELDS = {
    "description": "Brief description of the database",
    "categories": "List of DatabaseCategory constants for grouping",
    "pypi_packages": "Python packages needed for connection",
    "connection_string": "SQLAlchemy URI template",
}

# RECOMMENDED: missing ones put a spec in the "needs work" bucket but never
# fail strict mode.  Weighted 30% of the completeness score.
RECOMMENDED_FIELDS = {
    "logo": "Logo filename (in docs/static/img/databases/)",
    "homepage_url": "Official database homepage",
    "default_port": "Default port number",
}

# OPTIONAL: nice to have; only shown in verbose field-coverage output.
# Weighted 10% of the completeness score.
OPTIONAL_FIELDS = {
    "docs_url": "Documentation URL",
    "sqlalchemy_docs_url": "SQLAlchemy dialect documentation",
    "notes": "Additional configuration notes",
    "warnings": "Important warnings for users",
    "limitations": "Known limitations (no JOINs, row limits, etc.)",
    "install_instructions": "How to install the driver",
    "version_requirements": "Version compatibility info",
    "connection_examples": "Example connection strings",
    "authentication_methods": "Supported auth methods",
    "drivers": "Available driver options",
    "compatible_databases": "Related/compatible databases",
    "engine_parameters": "Advanced JSON config options",
    "host_examples": "Platform-specific host examples",
    "ssl_configuration": "SSL setup documentation",
    "advanced_features": "Advanced feature documentation",
    "parameters": "Connection parameter descriptions",
    "tutorials": "Tutorial links",
}

# Cache for PyPI package validation: base package name -> exists on PyPI.
# Shared by check_pypi_package() so each package is queried at most once.
_pypi_cache: dict[str, bool] = {}
def check_pypi_package(package_name: str, timeout: float = 5.0) -> bool:
    """Check if a package exists on PyPI.

    Results are memoized in the module-level ``_pypi_cache`` keyed by the
    bare distribution name, so each package is queried at most once.

    :param package_name: requirement string, possibly with extras and/or a
        version specifier (e.g. ``"pkg[extra]>=1.0"``)
    :param timeout: network timeout in seconds for the PyPI request
    :returns: True if the package's JSON endpoint responds with HTTP 200
    """
    import re
    import urllib.error
    import urllib.request

    # Strip extras ("[...]") and any PEP 508 version specifier.  The old
    # split chain ("["/">"/"<"/"=") missed "~=" and "!=", leaving a trailing
    # "~" or "!" in the name and producing a bogus PyPI URL (false negative).
    match = re.match(r"\s*([A-Za-z0-9._-]+)", package_name)
    base_name = match.group(1) if match else package_name.strip()
    if base_name in _pypi_cache:
        return _pypi_cache[base_name]
    url = f"https://pypi.org/pypi/{base_name}/json"
    try:
        req = urllib.request.Request(  # noqa: S310
            url, headers={"User-Agent": "superset-lint/1.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as response:  # noqa: S310
            exists = response.status == 200
    except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError):
        # Treat any network failure as "does not exist" rather than crashing.
        exists = False
    _pypi_cache[base_name] = exists
    return exists
def validate_pypi_packages(
    packages: list[str], timeout: float = 5.0
) -> tuple[list[str], list[str]]:
    """Validate a list of PyPI packages. Returns (valid, invalid) lists."""
    # Check each package exactly once, then partition by the result while
    # preserving the original order (duplicates included).
    checked = [(pkg, check_pypi_package(pkg, timeout)) for pkg in packages]
    valid = [pkg for pkg, exists in checked if exists]
    invalid = [pkg for pkg, exists in checked if not exists]
    return valid, invalid
@dataclass
class MetadataReport:
    """Report for a single engine spec's metadata."""

    engine_name: str
    module: str
    has_metadata: bool
    present_fields: set[str]
    missing_required: set[str]
    missing_recommended: set[str]
    missing_optional: set[str]
    completeness_score: float  # 0-100
    invalid_packages: list[str] | None = None  # PyPI packages that don't exist
    limitations: list[str] | None = None  # Known limitations

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly dict; sets become sorted lists and
        the optional fields are included only when populated."""
        payload: dict[str, Any] = {
            "engine_name": self.engine_name,
            "module": self.module,
            "has_metadata": self.has_metadata,
            "present_fields": sorted(self.present_fields),
            "missing_required": sorted(self.missing_required),
            "missing_recommended": sorted(self.missing_recommended),
            "missing_optional": sorted(self.missing_optional),
            "completeness_score": self.completeness_score,
        }
        for optional_key in ("invalid_packages", "limitations"):
            value = getattr(self, optional_key)
            if value is not None:
                payload[optional_key] = value
        return payload
def analyze_spec(spec_data: dict[str, Any], check_pypi: bool = False) -> MetadataReport:
    """Analyze a single engine spec for metadata completeness."""
    metadata = spec_data.get("metadata", {})
    engine_name = spec_data.get("engine_name", spec_data.get("class_name", "Unknown"))
    module = spec_data.get("module", "unknown")

    # A metadata dict flagged by the AST extractor as unparseable is treated
    # as if the spec had no metadata at all.
    if metadata.get("_unparseable"):
        metadata = {}

    present = set(metadata) if metadata else set()
    missing_required = REQUIRED_FIELDS.keys() - present
    missing_recommended = RECOMMENDED_FIELDS.keys() - present
    missing_optional = OPTIONAL_FIELDS.keys() - present

    # Weighted completeness score: required 60%, recommended 30%, optional 10%.
    tiers = (
        (REQUIRED_FIELDS, missing_required, 60),
        (RECOMMENDED_FIELDS, missing_recommended, 30),
        (OPTIONAL_FIELDS, missing_optional, 10),
    )
    score = sum(
        (len(schema) - len(missing)) / len(schema) * weight
        for schema, missing, weight in tiers
    )

    # Optionally validate declared PyPI packages (network access).
    invalid_packages = None
    if check_pypi:
        packages = metadata.get("pypi_packages")
        if packages:
            _, invalid_packages = validate_pypi_packages(packages)

    limitations = metadata.get("limitations") if metadata else None

    return MetadataReport(
        engine_name=engine_name,
        module=module,
        has_metadata=bool(metadata),
        present_fields=present,
        missing_required=missing_required,
        missing_recommended=missing_recommended,
        missing_optional=missing_optional,
        completeness_score=round(score, 1),
        invalid_packages=invalid_packages,
        limitations=limitations,
    )
def get_all_engine_specs_ast() -> list[dict[str, Any]]:
    """
    Discover all DB engine specs using AST parsing.

    This avoids needing to initialize the Flask app.
    Returns a list of dicts with engine_name, module, and metadata,
    sorted by engine_name.
    """
    import ast
    import os

    specs = []
    db_engine_specs_dir = os.path.dirname(__file__)
    skip_files = ("__init__.py", "base.py", "lint_metadata.py", "lib.py")
    for filename in os.listdir(db_engine_specs_dir):
        if not filename.endswith(".py") or filename in skip_files:
            continue
        filepath = os.path.join(db_engine_specs_dir, filename)
        try:
            with open(filepath) as f:
                tree = ast.parse(f.read())
            for node in ast.walk(tree):
                if not isinstance(node, ast.ClassDef):
                    continue
                # Only classes that inherit from some *EngineSpec base.
                # ast.unparse exists on all supported Pythons (3.9+; this
                # module already requires 3.10 via zip(strict=...)), so the
                # old hasattr(ast, "unparse") fallback was dropped.
                if not any("EngineSpec" in ast.unparse(base) for base in node.bases):
                    continue
                # Skip mixins.
                if "Mixin" in node.name:
                    continue
                # A non-empty `engine` attribute distinguishes product classes
                # like OceanBaseEngineSpec from true base classes.
                if node.name.endswith("BaseEngineSpec") and not _has_non_empty_engine(
                    node
                ):
                    continue
                engine_name, metadata = _extract_spec_attrs(node)
                specs.append(
                    {
                        "class_name": node.name,
                        "engine_name": engine_name,
                        "module": filename[:-3],  # Remove .py
                        "metadata": metadata,
                    }
                )
        except Exception as e:
            # Fixed: this warning previously printed the literal "(unknown)"
            # instead of the file that failed to parse.
            print(f"Warning: Could not parse {filename}: {e}", file=sys.stderr)
    return sorted(specs, key=lambda s: s["engine_name"])


def _has_non_empty_engine(node: Any) -> bool:
    """Return True if the ClassDef body assigns `engine` a non-empty constant."""
    import ast

    for item in node.body:
        if isinstance(item, ast.Assign):
            for target in item.targets:
                if isinstance(target, ast.Name) and target.id == "engine":
                    if isinstance(item.value, ast.Constant):
                        return bool(item.value.value)
    return False


def _extract_spec_attrs(node: Any) -> tuple[str, dict[str, Any]]:
    """Extract (engine_name, metadata) class attributes from a ClassDef.

    Falls back to the class name for engine_name; metadata that cannot be
    evaluated as a literal dict is marked with {"_unparseable": True}.
    """
    import ast

    engine_name = node.name
    metadata: dict[str, Any] = {}
    for item in node.body:
        if not isinstance(item, ast.Assign):
            continue
        for target in item.targets:
            if not isinstance(target, ast.Name):
                continue
            if target.id == "engine_name":
                if isinstance(item.value, ast.Constant):
                    engine_name = item.value.value
            elif target.id == "metadata":
                try:
                    metadata = _eval_ast_dict(item.value)
                except Exception:
                    metadata = {"_unparseable": True}
    return engine_name, metadata
def _eval_ast_dict(node: Any) -> dict[str, Any]:
"""Safely evaluate an AST node as a dict literal."""
import ast
if isinstance(node, ast.Dict):
result = {}
for k, v in zip(node.keys, node.values, strict=False):
if k is None:
continue
key = _eval_ast_value(k)
value = _eval_ast_value(v)
if key is not None:
result[key] = value
return result
return {}
def _eval_ast_value(node: Any) -> Any: # noqa: C901
"""Safely evaluate an AST node as a value."""
import ast
if isinstance(node, ast.Constant):
return node.value
elif isinstance(node, ast.Str): # Python 3.7 compat
return node.s
elif isinstance(node, ast.Num): # Python 3.7 compat
return node.n
elif isinstance(node, ast.List):
return [_eval_ast_value(e) for e in node.elts]
elif isinstance(node, ast.Dict):
return _eval_ast_dict(node)
elif isinstance(node, ast.Name):
# Handle DatabaseCategory.* constants
return node.id
elif isinstance(node, ast.Attribute):
# Handle DatabaseCategory.TRADITIONAL_RDBMS etc
if hasattr(ast, "unparse"):
return ast.unparse(node)
return f"{_eval_ast_value(node.value)}.{node.attr}"
elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
# String concatenation
left = _eval_ast_value(node.left)
right = _eval_ast_value(node.right)
if isinstance(left, str) and isinstance(right, str):
return left + right
return None
elif isinstance(node, ast.JoinedStr):
# f-strings - just return placeholder
return "<f-string>"
elif isinstance(node, ast.Tuple):
return tuple(_eval_ast_value(e) for e in node.elts)
return None
def get_all_engine_specs() -> list[type]:
    """Discover all DB engine specs using Flask app (if available)."""
    # Import here to avoid issues when running standalone
    from superset.db_engine_specs import load_engine_specs

    excluded = {"BaseEngineSpec", "BasicParametersMixin"}
    # Skip base classes and internal specs
    specs = [spec for spec in load_engine_specs() if spec.__name__ not in excluded]
    return sorted(specs, key=lambda s: getattr(s, "engine_name", s.__name__))
def print_report(reports: list[MetadataReport], verbose: bool = False) -> None:
    """Print a human-readable completeness report to stdout.

    Sections: summary statistics, specs grouped by completeness, invalid
    PyPI packages (when populated), and per-field coverage bars.

    :param reports: analyzed specs, one entry per engine spec (non-empty)
    :param verbose: also show coverage for OPTIONAL_FIELDS
    """
    print("\n" + "=" * 60)
    print("METADATA COMPLETENESS REPORT")
    print("=" * 60 + "\n")

    # Summary statistics
    total = len(reports)
    with_metadata = sum(1 for r in reports if r.has_metadata)
    fully_complete = sum(1 for r in reports if not r.missing_required)
    avg_score = sum(r.completeness_score for r in reports) / total if total else 0
    print(f"Total engine specs: {total}")
    print(f"With metadata: {with_metadata} ({with_metadata * 100 // total}%)")
    print(
        f"All required fields: {fully_complete} ({fully_complete * 100 // total}%)"
    )
    print(f"Average completeness: {avg_score:.1f}%")
    print()

    # Group by completeness
    complete = [
        r for r in reports if not r.missing_required and not r.missing_recommended
    ]
    needs_work = [r for r in reports if r.missing_required or r.missing_recommended]
    no_metadata = [r for r in reports if not r.has_metadata]
    if complete:
        print(f"\n✅ COMPLETE ({len(complete)} specs - all required & recommended):")
        print("-" * 50)
        for r in complete:
            print(f" {r.engine_name:30} {r.completeness_score:5.1f}%")
    if needs_work:
        print(f"\n⚠️ NEEDS WORK ({len(needs_work)} specs):")
        print("-" * 50)
        # Highest score first: these are the cheapest to finish.
        for r in sorted(needs_work, key=lambda x: -x.completeness_score):
            status = []
            if r.missing_required:
                status.append(
                    f"missing required: {', '.join(sorted(r.missing_required))}"
                )
            if r.missing_recommended:
                status.append(
                    f"missing recommended: {', '.join(sorted(r.missing_recommended))}"
                )
            print(f" {r.engine_name:30} {r.completeness_score:5.1f}%")
            for s in status:
                print(f" └─ {s}")
    if no_metadata:
        print(f"\n❌ NO METADATA ({len(no_metadata)} specs):")
        print("-" * 50)
        for r in no_metadata:
            print(f" {r.engine_name} ({r.module}.py)")

    # Show invalid PyPI packages (only populated when --check-pypi was used)
    invalid_pypi = [r for r in reports if r.invalid_packages]
    if invalid_pypi:
        print(f"\n📦 INVALID PyPI PACKAGES ({len(invalid_pypi)} specs):")
        print("-" * 50)
        for r in invalid_pypi:
            packages = r.invalid_packages or []
            print(f" {r.engine_name}: {', '.join(packages)}")

    # Field coverage summary
    print("\n" + "=" * 60)
    print("FIELD COVERAGE SUMMARY")
    print("=" * 60)
    all_fields = {**REQUIRED_FIELDS, **RECOMMENDED_FIELDS, **OPTIONAL_FIELDS}
    field_counts: dict[str, int] = {f: 0 for f in all_fields}
    for r in reports:
        for field in r.present_fields:
            if field in field_counts:
                field_counts[field] += 1
    print("\nRequired fields:")
    _print_coverage_bars(REQUIRED_FIELDS, field_counts, total)
    print("\nRecommended fields:")
    _print_coverage_bars(RECOMMENDED_FIELDS, field_counts, total)
    if verbose:
        print("\nOptional fields:")
        _print_coverage_bars(OPTIONAL_FIELDS, field_counts, total)


def _print_coverage_bars(
    fields: dict[str, str], field_counts: dict[str, int], total: int
) -> None:
    """Print a 20-character coverage bar per field.

    Fixed: the bars were built from empty strings ('' * n), so they always
    rendered as nothing; restored block/shade glyphs.  Also de-duplicates
    the three identical bar-drawing loops from print_report().
    """
    for field in fields:
        count = field_counts[field]
        pct = count * 100 // total
        filled = pct // 5  # 20 slots, 5% per slot
        bar = "█" * filled + "░" * (20 - filled)
        print(f" {field:25} {bar} {count:3}/{total} ({pct}%)")
def generate_markdown_report(reports: list[MetadataReport]) -> str:
    """Generate a markdown report suitable for checking into the repo.

    :param reports: analyzed specs to summarize
    :returns: the complete markdown document as a single string
    """
    lines = [
        "<!--",
        "Licensed to the Apache Software Foundation (ASF) under one",
        "or more contributor license agreements. See the NOTICE file",
        "distributed with this work for additional information",
        "regarding copyright ownership. The ASF licenses this file",
        "to you under the Apache License, Version 2.0 (the",
        '"License"); you may not use this file except in compliance',
        "with the License. You may obtain a copy of the License at",
        "",
        " http://www.apache.org/licenses/LICENSE-2.0",
        "",
        "Unless required by applicable law or agreed to in writing,",
        "software distributed under the License is distributed on an",
        '"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY',
        "KIND, either express or implied. See the License for the",
        "specific language governing permissions and limitations",
        "under the License.",
        "-->",
        "",
        "# Database Metadata Completeness Report",
        "",
        "This report is auto-generated by "
        "`python superset/db_engine_specs/lint_metadata.py --markdown`.",
        "It tracks which database engine specs have complete metadata.",
        "",
        "## Summary",
        "",
    ]
    total = len(reports)
    with_metadata = sum(1 for r in reports if r.has_metadata)
    all_required = sum(1 for r in reports if not r.missing_required)
    avg_score = sum(r.completeness_score for r in reports) / total if total else 0
    # Fixed: the integer percentages divided by total unguarded while
    # avg_score was guarded -- an empty report list raised ZeroDivisionError.
    pct_meta = with_metadata * 100 // total if total else 0
    pct_req = all_required * 100 // total if total else 0
    lines.extend(
        [
            f"- **Total engine specs:** {total}",
            f"- **With metadata:** {with_metadata} ({pct_meta}%)",
            f"- **All required fields:** {all_required} ({pct_req}%)",
            f"- **Average completeness:** {avg_score:.1f}%",
            "",
            "## Required Fields",
            "",
            "These fields should be in every engine spec's `metadata` attribute:",
            "",
        ]
    )
    for field, desc in REQUIRED_FIELDS.items():
        lines.append(f"- `{field}` - {desc}")
    lines.extend(
        [
            "",
            "## Specs Needing Work",
            "",
            "| Engine | Module | Score | Missing Required | Missing Recommended |",
            "|--------|--------|-------|------------------|---------------------|",
        ]
    )
    # Sort by score ascending (worst first)
    needs_work = [r for r in reports if r.missing_required or r.missing_recommended]
    for r in sorted(needs_work, key=lambda x: x.completeness_score):
        # str.join already yields "" for empty sets; the old `or ""` was a no-op.
        missing_req = ", ".join(sorted(r.missing_required))
        missing_rec = ", ".join(sorted(r.missing_recommended))
        score = f"{r.completeness_score:.0f}%"
        row = f"| {r.engine_name} | {r.module}.py | {score} | {missing_req} | {missing_rec} |"  # noqa: E501
        lines.append(row)
    lines.extend(
        [
            "",
            "## Complete Specs",
            "",
            "These specs have all required and recommended fields:",
            "",
        ]
    )
    complete = [
        r for r in reports if not r.missing_required and not r.missing_recommended
    ]
    for r in sorted(complete, key=lambda x: x.engine_name):
        lines.append(f"- {r.engine_name} ({r.completeness_score:.0f}%)")
    lines.extend(
        [
            "",
            "## How to Fix",
            "",
            "Add a `metadata` attribute to your engine spec class:",
            "",
            "```python",
            "from superset.db_engine_specs.base import (",  # noqa: E501
            " BaseEngineSpec, DatabaseCategory",
            ")",
            "",
            "class MyEngineSpec(BaseEngineSpec):",
            ' engine_name = "My Database"',
            "",
            " metadata = {",
            ' "description": "Brief description of the database.",',
            ' "categories": [DatabaseCategory.TRADITIONAL_RDBMS],',
            ' "pypi_packages": ["my-driver"],',
            ' "connection_string": "mydb://{username}:{password}@{host}:{port}/{database}",',
            ' "logo": "mydb.svg",',
            ' "homepage_url": "https://mydb.example.com/",',
            ' "default_port": 5432,',
            " }",
            "```",
            "",
            "See `superset/db_engine_specs/README.md` for full documentation.",
            "",
        ]
    )
    return "\n".join(lines)
def main() -> int:
    """Entry point: parse CLI flags, lint all specs, emit the chosen report.

    :returns: process exit code (0 = success, 1 = error or strict failure)
    """
    parser = argparse.ArgumentParser(
        description="Lint DB engine spec metadata for completeness"
    )
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument(
        "--markdown", action="store_true", help="Output as Markdown report"
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Exit with error if required fields missing",
    )
    parser.add_argument(
        "--check-pypi",
        action="store_true",
        help="Validate that pypi_packages exist on PyPI (slower)",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show optional field coverage"
    )
    parser.add_argument("--output", "-o", type=str, help="Write output to file")
    args = parser.parse_args()

    # AST parsing keeps this runnable without a Flask app.
    specs = get_all_engine_specs_ast()
    if not specs:
        print("Error: No engine specs found.", file=sys.stderr)
        return 1
    if args.check_pypi:
        print("Validating PyPI packages (this may take a moment)...", file=sys.stderr)
    reports = [analyze_spec(spec, check_pypi=args.check_pypi) for spec in specs]

    # Render the requested output format.
    output_text = ""
    if args.json:
        payload = {
            "summary": {
                "total": len(reports),
                "with_metadata": sum(1 for r in reports if r.has_metadata),
                "all_required": sum(1 for r in reports if not r.missing_required),
                "average_score": round(
                    sum(r.completeness_score for r in reports) / len(reports), 1
                ),
            },
            "schema": {
                "required": REQUIRED_FIELDS,
                "recommended": RECOMMENDED_FIELDS,
                "optional": OPTIONAL_FIELDS,
            },
            "reports": [r.to_dict() for r in reports],
        }
        output_text = json.dumps(payload, indent=2)
    elif args.markdown:
        output_text = generate_markdown_report(reports)
    else:
        print_report(reports, verbose=args.verbose)

    # Deliver to the requested destination (file or stdout).
    if output_text:
        if args.output:
            with open(args.output, "w") as f:
                f.write(output_text)
            print(f"Report written to {args.output}", file=sys.stderr)
        else:
            print(output_text)

    # Strict mode fails only for specs that HAVE metadata yet miss required
    # fields; specs without metadata are intentionally internal/legacy.
    if args.strict:
        incomplete = sum(1 for r in reports if r.has_metadata and r.missing_required)
        bad_pypi = sum(1 for r in reports if r.invalid_packages)
        if incomplete > 0:
            print(
                f"\n❌ STRICT MODE: {incomplete} specs missing required fields",
                file=sys.stderr,
            )
            return 1
        if args.check_pypi and bad_pypi > 0:
            msg = f"\n❌ STRICT MODE: {bad_pypi} specs have invalid packages"
            print(msg, file=sys.stderr)
            return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())