# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. """ Metadata completeness linter for DB engine specs. This script validates that all DB engine specs have complete metadata as defined by the DBEngineSpecMetadata TypedDict in base.py. Usage: python superset/db_engine_specs/lint_metadata.py [--json] [--strict] Options: --json Output results as JSON --strict Return non-zero exit code if any required fields are missing --help Show this help message The script categorizes metadata fields into: - REQUIRED: Must be present for proper documentation - RECOMMENDED: Should be present for good documentation - OPTIONAL: Nice to have but not critical Example output: === Metadata Completeness Report === PostgreSQL (postgres.py) ✓ description, category, pypi_packages, connection_string ⚠ Missing recommended: logo, homepage_url MySQL (mysql.py) ✓ All required and recommended fields present """ from __future__ import annotations import argparse import json # noqa: TID251 - standalone script, don't depend on superset.utils import sys from dataclasses import dataclass from typing import Any # Schema definition - fields grouped by importance REQUIRED_FIELDS = { "description": "Brief description of the database", "categories": "List of DatabaseCategory constants for grouping", "pypi_packages": "Python packages needed for connection", "connection_string": "SQLAlchemy URI template", } RECOMMENDED_FIELDS = { "logo": "Logo filename (in docs/static/img/databases/)", "homepage_url": "Official database homepage", "default_port": "Default port number", } OPTIONAL_FIELDS = { "docs_url": "Documentation URL", "sqlalchemy_docs_url": "SQLAlchemy dialect documentation", "notes": "Additional configuration notes", "warnings": "Important warnings for users", "limitations": "Known limitations (no JOINs, row limits, etc.)", "install_instructions": "How to install the driver", "version_requirements": "Version compatibility info", "connection_examples": "Example connection strings", "authentication_methods": "Supported auth methods", "drivers": "Available driver options", "compatible_databases": "Related/compatible databases", "engine_parameters": "Advanced JSON config options", "host_examples": "Platform-specific host examples", "ssl_configuration": "SSL setup documentation", "advanced_features": "Advanced feature documentation", "parameters": "Connection parameter descriptions", "tutorials": "Tutorial links", } # Cache for PyPI package validation _pypi_cache: dict[str, bool] = {} def check_pypi_package(package_name: str, timeout: float = 5.0) -> bool: """Check if a package exists on PyPI.""" import urllib.error import urllib.request # Strip version specifiers and extras base_name = package_name.split("[")[0].split(">")[0].split("<")[0].split("=")[0] base_name = base_name.strip() if base_name in _pypi_cache: return _pypi_cache[base_name] url = f"https://pypi.org/pypi/{base_name}/json" try: req = urllib.request.Request( # noqa: S310 url, headers={"User-Agent": "superset-lint/1.0"} ) with urllib.request.urlopen(req, timeout=timeout) as response: # noqa: S310 exists = response.status == 200 except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError): exists = False _pypi_cache[base_name] = exists return exists def validate_pypi_packages( packages: list[str], timeout: float = 5.0 ) -> tuple[list[str], list[str]]: """Validate a list of PyPI packages. Returns (valid, invalid) lists.""" valid = [] invalid = [] for pkg in packages: if check_pypi_package(pkg, timeout): valid.append(pkg) else: invalid.append(pkg) return valid, invalid @dataclass class MetadataReport: """Report for a single engine spec's metadata.""" engine_name: str module: str has_metadata: bool present_fields: set[str] missing_required: set[str] missing_recommended: set[str] missing_optional: set[str] completeness_score: float # 0-100 invalid_packages: list[str] | None = None # PyPI packages that don't exist limitations: list[str] | None = None # Known limitations def to_dict(self) -> dict[str, Any]: result = { "engine_name": self.engine_name, "module": self.module, "has_metadata": self.has_metadata, "present_fields": sorted(self.present_fields), "missing_required": sorted(self.missing_required), "missing_recommended": sorted(self.missing_recommended), "missing_optional": sorted(self.missing_optional), "completeness_score": self.completeness_score, } if self.invalid_packages is not None: result["invalid_packages"] = self.invalid_packages if self.limitations is not None: result["limitations"] = self.limitations return result def analyze_spec(spec_data: dict[str, Any], check_pypi: bool = False) -> MetadataReport: """Analyze a single engine spec for metadata completeness.""" metadata = spec_data.get("metadata", {}) engine_name = spec_data.get("engine_name", spec_data.get("class_name", "Unknown")) module = spec_data.get("module", "unknown") # Handle unparseable metadata if metadata.get("_unparseable"): metadata = {} present = set(metadata.keys()) if metadata else set() missing_required = set(REQUIRED_FIELDS.keys()) - present missing_recommended = set(RECOMMENDED_FIELDS.keys()) - present missing_optional = set(OPTIONAL_FIELDS.keys()) - present # Calculate completeness score # Required fields: 60%, Recommended: 30%, Optional: 10% total_required = len(REQUIRED_FIELDS) total_recommended = len(RECOMMENDED_FIELDS) total_optional = len(OPTIONAL_FIELDS) required_present = total_required - len(missing_required) recommended_present = total_recommended - len(missing_recommended) optional_present = total_optional - len(missing_optional) score = ( (required_present / total_required) * 60 + (recommended_present / total_recommended) * 30 + (optional_present / total_optional) * 10 ) # Validate PyPI packages if requested invalid_packages = None if check_pypi and metadata.get("pypi_packages"): packages = metadata.get("pypi_packages", []) if packages: _, invalid_packages = validate_pypi_packages(packages) # Extract limitations limitations = metadata.get("limitations") if metadata else None return MetadataReport( engine_name=engine_name, module=module, has_metadata=bool(metadata), present_fields=present, missing_required=missing_required, missing_recommended=missing_recommended, missing_optional=missing_optional, completeness_score=round(score, 1), invalid_packages=invalid_packages, limitations=limitations, ) def get_all_engine_specs_ast() -> list[dict[str, Any]]: # noqa: C901 """ Discover all DB engine specs using AST parsing. This avoids needing to initialize the Flask app. Returns a list of dicts with engine_name, module, and metadata. """ import ast import os specs = [] db_engine_specs_dir = os.path.dirname(__file__) for filename in os.listdir(db_engine_specs_dir): if not filename.endswith(".py"): continue if filename in ("__init__.py", "base.py", "lint_metadata.py", "lib.py"): continue filepath = os.path.join(db_engine_specs_dir, filename) try: with open(filepath) as f: content = f.read() tree = ast.parse(content) # Find all class definitions that inherit from *EngineSpec for node in ast.walk(tree): if not isinstance(node, ast.ClassDef): continue # Check if it looks like an engine spec class is_engine_spec = any( "EngineSpec" in ast.unparse(base) if hasattr(ast, "unparse") else True for base in node.bases ) if not is_engine_spec: continue # Skip mixins if "Mixin" in node.name: continue # Check for engine attribute with non-empty value to distinguish # true base classes from product classes like OceanBaseEngineSpec has_non_empty_engine = False for item in node.body: if isinstance(item, ast.Assign): for target in item.targets: if isinstance(target, ast.Name) and target.id == "engine": # Check if engine value is non-empty string if isinstance(item.value, ast.Constant): has_non_empty_engine = bool(item.value.value) break # Skip true base classes (no engine or empty engine attribute) if node.name.endswith("BaseEngineSpec") and not has_non_empty_engine: continue # Extract engine_name and metadata engine_name = node.name metadata = {} for item in node.body: if isinstance(item, ast.Assign): for target in item.targets: if isinstance(target, ast.Name): if target.id == "engine_name": if isinstance(item.value, ast.Constant): engine_name = item.value.value elif target.id == "metadata": try: metadata = _eval_ast_dict(item.value) except Exception: # Mark as unparseable metadata = {"_unparseable": True} specs.append( { "class_name": node.name, "engine_name": engine_name, "module": filename[:-3], # Remove .py "metadata": metadata, } ) except Exception as e: print(f"Warning: Could not parse {filename}: {e}", file=sys.stderr) return sorted(specs, key=lambda s: s["engine_name"]) def _eval_ast_dict(node: Any) -> dict[str, Any]: """Safely evaluate an AST node as a dict literal.""" import ast if isinstance(node, ast.Dict): result = {} for k, v in zip(node.keys, node.values, strict=False): if k is None: continue key = _eval_ast_value(k) value = _eval_ast_value(v) if key is not None: result[key] = value return result return {} def _eval_ast_value(node: Any) -> Any: # noqa: C901 """Safely evaluate an AST node as a value.""" import ast if isinstance(node, ast.Constant): return node.value elif isinstance(node, ast.Str): # Python 3.7 compat return node.s elif isinstance(node, ast.Num): # Python 3.7 compat return node.n elif isinstance(node, ast.List): return [_eval_ast_value(e) for e in node.elts] elif isinstance(node, ast.Dict): return _eval_ast_dict(node) elif isinstance(node, ast.Name): # Handle DatabaseCategory.* constants return node.id elif isinstance(node, ast.Attribute): # Handle DatabaseCategory.TRADITIONAL_RDBMS etc if hasattr(ast, "unparse"): return ast.unparse(node) return f"{_eval_ast_value(node.value)}.{node.attr}" elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add): # String concatenation left = _eval_ast_value(node.left) right = _eval_ast_value(node.right) if isinstance(left, str) and isinstance(right, str): return left + right return None elif isinstance(node, ast.JoinedStr): # f-strings - just return placeholder return "" elif isinstance(node, ast.Tuple): return tuple(_eval_ast_value(e) for e in node.elts) return None def get_all_engine_specs() -> list[type]: """Discover all DB engine specs using Flask app (if available).""" # Import here to avoid issues when running standalone from superset.db_engine_specs import load_engine_specs specs = [] for spec in load_engine_specs(): # Skip base classes and internal specs if spec.__name__ in ("BaseEngineSpec", "BasicParametersMixin"): continue specs.append(spec) return sorted(specs, key=lambda s: getattr(s, "engine_name", s.__name__)) def print_report(reports: list[MetadataReport], verbose: bool = False) -> None: # noqa: C901 """Print a human-readable report.""" print("\n" + "=" * 60) print("METADATA COMPLETENESS REPORT") print("=" * 60 + "\n") # Summary statistics total = len(reports) with_metadata = sum(1 for r in reports if r.has_metadata) fully_complete = sum(1 for r in reports if not r.missing_required) avg_score = sum(r.completeness_score for r in reports) / total if total else 0 print(f"Total engine specs: {total}") print(f"With metadata: {with_metadata} ({with_metadata * 100 // total}%)") print( f"All required fields: {fully_complete} ({fully_complete * 100 // total}%)" ) print(f"Average completeness: {avg_score:.1f}%") print() # Group by completeness complete = [ r for r in reports if not r.missing_required and not r.missing_recommended ] needs_work = [r for r in reports if r.missing_required or r.missing_recommended] no_metadata = [r for r in reports if not r.has_metadata] if complete: print(f"\n✅ COMPLETE ({len(complete)} specs - all required & recommended):") print("-" * 50) for r in complete: print(f" {r.engine_name:30} {r.completeness_score:5.1f}%") if needs_work: print(f"\n⚠️ NEEDS WORK ({len(needs_work)} specs):") print("-" * 50) for r in sorted(needs_work, key=lambda x: -x.completeness_score): status = [] if r.missing_required: status.append( f"missing required: {', '.join(sorted(r.missing_required))}" ) if r.missing_recommended: status.append( f"missing recommended: {', '.join(sorted(r.missing_recommended))}" ) print(f" {r.engine_name:30} {r.completeness_score:5.1f}%") for s in status: print(f" └─ {s}") if no_metadata: print(f"\n❌ NO METADATA ({len(no_metadata)} specs):") print("-" * 50) for r in no_metadata: print(f" {r.engine_name} ({r.module}.py)") # Show invalid PyPI packages invalid_pypi = [r for r in reports if r.invalid_packages] if invalid_pypi: print(f"\n📦 INVALID PyPI PACKAGES ({len(invalid_pypi)} specs):") print("-" * 50) for r in invalid_pypi: packages = r.invalid_packages or [] print(f" {r.engine_name}: {', '.join(packages)}") # Field coverage summary print("\n" + "=" * 60) print("FIELD COVERAGE SUMMARY") print("=" * 60) all_fields = {**REQUIRED_FIELDS, **RECOMMENDED_FIELDS, **OPTIONAL_FIELDS} field_counts: dict[str, int] = {f: 0 for f in all_fields} for r in reports: for field in r.present_fields: if field in field_counts: field_counts[field] += 1 print("\nRequired fields:") for field, _desc in REQUIRED_FIELDS.items(): count = field_counts[field] pct = count * 100 // total bar = "█" * (pct // 5) + "░" * (20 - pct // 5) print(f" {field:25} {bar} {count:3}/{total} ({pct}%)") print("\nRecommended fields:") for field, _desc in RECOMMENDED_FIELDS.items(): count = field_counts[field] pct = count * 100 // total bar = "█" * (pct // 5) + "░" * (20 - pct // 5) print(f" {field:25} {bar} {count:3}/{total} ({pct}%)") if verbose: print("\nOptional fields:") for field, _desc in OPTIONAL_FIELDS.items(): count = field_counts[field] pct = count * 100 // total bar = "█" * (pct // 5) + "░" * (20 - pct // 5) print(f" {field:25} {bar} {count:3}/{total} ({pct}%)") def generate_markdown_report(reports: list[MetadataReport]) -> str: """Generate a markdown report suitable for checking into the repo.""" lines = [ "", "", "# Database Metadata Completeness Report", "", "This report is auto-generated by " "`python superset/db_engine_specs/lint_metadata.py --markdown`.", "It tracks which database engine specs have complete metadata.", "", "## Summary", "", ] total = len(reports) with_metadata = sum(1 for r in reports if r.has_metadata) all_required = sum(1 for r in reports if not r.missing_required) avg_score = sum(r.completeness_score for r in reports) / total if total else 0 pct_meta = with_metadata * 100 // total pct_req = all_required * 100 // total lines.extend( [ f"- **Total engine specs:** {total}", f"- **With metadata:** {with_metadata} ({pct_meta}%)", f"- **All required fields:** {all_required} ({pct_req}%)", f"- **Average completeness:** {avg_score:.1f}%", "", "## Required Fields", "", "These fields should be in every engine spec's `metadata` attribute:", "", ] ) for field, desc in REQUIRED_FIELDS.items(): lines.append(f"- `{field}` - {desc}") lines.extend( [ "", "## Specs Needing Work", "", "| Engine | Module | Score | Missing Required | Missing Recommended |", "|--------|--------|-------|------------------|---------------------|", ] ) # Sort by score ascending (worst first) needs_work = [r for r in reports if r.missing_required or r.missing_recommended] for r in sorted(needs_work, key=lambda x: x.completeness_score): missing_req = ", ".join(sorted(r.missing_required)) or "✓" missing_rec = ", ".join(sorted(r.missing_recommended)) or "✓" score = f"{r.completeness_score:.0f}%" row = f"| {r.engine_name} | {r.module}.py | {score} | {missing_req} | {missing_rec} |" # noqa: E501 lines.append(row) lines.extend( [ "", "## Complete Specs", "", "These specs have all required and recommended fields:", "", ] ) complete = [ r for r in reports if not r.missing_required and not r.missing_recommended ] for r in sorted(complete, key=lambda x: x.engine_name): lines.append(f"- {r.engine_name} ({r.completeness_score:.0f}%)") lines.extend( [ "", "## How to Fix", "", "Add a `metadata` attribute to your engine spec class:", "", "```python", "from superset.db_engine_specs.base import (", # noqa: E501 " BaseEngineSpec, DatabaseCategory", ")", "", "class MyEngineSpec(BaseEngineSpec):", ' engine_name = "My Database"', "", " metadata = {", ' "description": "Brief description of the database.",', ' "categories": [DatabaseCategory.TRADITIONAL_RDBMS],', ' "pypi_packages": ["my-driver"],', ' "connection_string": "mydb://{username}:{password}@{host}:{port}/{database}",', ' "logo": "mydb.svg",', ' "homepage_url": "https://mydb.example.com/",', ' "default_port": 5432,', " }", "```", "", "See `superset/db_engine_specs/README.md` for full documentation.", "", ] ) return "\n".join(lines) def main() -> int: parser = argparse.ArgumentParser( description="Lint DB engine spec metadata for completeness" ) parser.add_argument("--json", action="store_true", help="Output as JSON") parser.add_argument( "--markdown", action="store_true", help="Output as Markdown report" ) parser.add_argument( "--strict", action="store_true", help="Exit with error if required fields missing", ) parser.add_argument( "--check-pypi", action="store_true", help="Validate that pypi_packages exist on PyPI (slower)", ) parser.add_argument( "--verbose", "-v", action="store_true", help="Show optional field coverage" ) parser.add_argument("--output", "-o", type=str, help="Write output to file") args = parser.parse_args() # Use AST parsing to avoid Flask app dependency specs = get_all_engine_specs_ast() if not specs: print("Error: No engine specs found.", file=sys.stderr) return 1 if args.check_pypi: print("Validating PyPI packages (this may take a moment)...", file=sys.stderr) reports = [analyze_spec(spec, check_pypi=args.check_pypi) for spec in specs] # Generate output output_text = "" if args.json: output_data = { "summary": { "total": len(reports), "with_metadata": sum(1 for r in reports if r.has_metadata), "all_required": sum(1 for r in reports if not r.missing_required), "average_score": round( sum(r.completeness_score for r in reports) / len(reports), 1 ), }, "schema": { "required": REQUIRED_FIELDS, "recommended": RECOMMENDED_FIELDS, "optional": OPTIONAL_FIELDS, }, "reports": [r.to_dict() for r in reports], } output_text = json.dumps(output_data, indent=2) elif args.markdown: output_text = generate_markdown_report(reports) else: print_report(reports, verbose=args.verbose) # Write to file or stdout if output_text: if args.output: with open(args.output, "w") as f: f.write(output_text) print(f"Report written to {args.output}", file=sys.stderr) else: print(output_text) # In strict mode, fail if specs WITH metadata are missing required fields. # Specs without metadata are intentionally internal/legacy and are allowed. if args.strict: # Only count specs that HAVE metadata but are incomplete missing_count = sum(1 for r in reports if r.has_metadata and r.missing_required) invalid_pypi_count = sum(1 for r in reports if r.invalid_packages) if missing_count > 0: print( f"\n❌ STRICT MODE: {missing_count} specs missing required fields", file=sys.stderr, ) return 1 if args.check_pypi and invalid_pypi_count > 0: msg = f"\n❌ STRICT MODE: {invalid_pypi_count} specs have invalid packages" print(msg, file=sys.stderr) return 1 return 0 if __name__ == "__main__": sys.exit(main())