# superset2/scripts/extract_config_schema.py

#!/usr/bin/env python3
"""
Extract configuration schema from config_defaults.py.

This script parses the existing config_defaults.py file and extracts:
- All configuration keys and their default values
- Comments above each key as descriptions
- Types inferred from the default values

The output is a comprehensive JSON schema that can be used for:
- Documentation generation
- Configuration validation
- IDE autocomplete
"""

import ast
import json
import sys
from pathlib import Path
from typing import Any, Dict, List


def infer_type(value: Any) -> str:
    """Map a default value to the type name stored in the schema.

    Returns ``"null"`` for ``None``, otherwise one of ``"boolean"``,
    ``"integer"``, ``"number"``, ``"string"``, ``"array"`` (list or tuple)
    or ``"object"`` (dict). Any other value yields ``"unknown"``.
    """
    if value is None:
        return "null"

    # Order matters: bool is a subclass of int, so it has to be tested first.
    type_names = (
        (bool, "boolean"),
        (int, "integer"),
        (float, "number"),
        (str, "string"),
        ((list, tuple), "array"),
        (dict, "object"),
    )
    for python_type, schema_name in type_names:
        if isinstance(value, python_type):
            return schema_name
    return "unknown"


def extract_comments_before_line(lines: List[str], line_num: int) -> List[str]:
    """Collect the ``#`` comment lines that sit just above a given line.

    Args:
        lines: The file content split into lines.
        line_num: 1-based number of the line whose leading comments are wanted.

    Returns:
        The comment texts (without the leading ``#``, stripped) in file order.
        At most the five lines directly above ``line_num`` are inspected;
        blank lines and empty comments are skipped, and the scan stops at the
        first line that is neither blank nor a comment.
    """
    previous_index = line_num - 2  # 0-based index of the line above line_num
    window = min(5, previous_index + 1)  # never look past the start of the file

    found: List[str] = []
    # Walk upwards; the range is empty when there is nothing above line_num.
    for index in range(previous_index, previous_index - window, -1):
        text = lines[index].strip()
        if text == "":
            continue
        if not text.startswith("#"):
            break
        body = text[1:].strip()
        if body:
            found.append(body)

    # Collected bottom-up, so flip back into file order.
    found.reverse()
    return found


def safe_eval(node: ast.AST) -> Any:
    """Safely evaluate an AST node to get its value."""
    try:
        # Handle basic constant values
        if isinstance(node, ast.Constant):
            return node.value
        elif isinstance(node, ast.Num):  # Python < 3.8
            return node.n
        elif isinstance(node, ast.Str):  # Python < 3.8
            return node.s
        elif isinstance(node, ast.List):
            return [safe_eval(item) for item in node.elts]
        elif isinstance(node, ast.Dict):
            return {
                safe_eval(k): safe_eval(v)
                for k, v in zip(node.keys, node.values, strict=False)
                if k is not None
            }
        elif isinstance(node, ast.Name):
            # Handle common constants
            if node.id in ("True", "False", "None"):
                return {"True": True, "False": False, "None": None}[node.id]
            else:
                return f"<{node.id}>"  # Placeholder for variables
        else:
            # For everything else, just return a descriptive placeholder
            return f"<{type(node).__name__}>"
    except Exception:
        return "<unknown>"


def extract_config_schema(config_file: Path) -> Dict[str, Any]:
    """Build a schema entry for every upper-case assignment in a config module.

    Both plain assignments (``NAME = value``) and annotated assignments with
    a value (``NAME: int = value``) are picked up, wherever they appear in the
    file. If a name is assigned more than once, the entry visited last wins.

    Args:
        config_file: Path of the Python configuration file to parse.

    Returns:
        A mapping ``name -> {"type", "default", "description", "category"}``:
        ``default`` comes from ``safe_eval``, ``type`` from ``infer_type``,
        ``description`` is the joined comment lines found above the
        assignment, and ``category`` comes from ``categorize_config``.

    Raises:
        OSError: If the file cannot be read.
        SyntaxError: If the file is not valid Python.
    """
    # Python source is UTF-8 by default; do not depend on the locale encoding.
    with open(config_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Text mode already normalised line endings to "\n". Splitting on "\n"
    # only keeps indices aligned with ast line numbers; str.splitlines() would
    # also split on form feeds and similar characters the parser ignores.
    lines = content.split("\n")

    tree = ast.parse(content)

    schema: Dict[str, Any] = {}

    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            # Only simple "NAME = value"; skip chained targets like a = b = 1.
            if len(node.targets) != 1:
                continue
            target = node.targets[0]
        elif isinstance(node, ast.AnnAssign) and node.value is not None:
            # "NAME: type = value"; a bare annotation has no default to record.
            target = node.target
        else:
            continue

        # Only upper-case plain names follow the configuration convention.
        if not isinstance(target, ast.Name) or not target.id.isupper():
            continue

        var_name = target.id
        default_value = safe_eval(node.value)
        comments = extract_comments_before_line(lines, node.lineno)

        schema[var_name] = {
            "type": infer_type(default_value),
            "default": default_value,
            "description": " ".join(comments),
            "category": categorize_config(var_name),
        }

    return schema


def categorize_config(var_name: str) -> str:
    """Pick a category for a configuration key from keywords in its name.

    The name is lower-cased and checked against each category's keywords by
    plain substring match. Categories are tried in the order listed below and
    the first one with a matching keyword wins; ``"general"`` is the fallback.
    """
    lowered = var_name.lower()

    # (category, keywords) pairs, in priority order.
    rules = (
        ("performance", ("limit", "timeout", "cache", "pool")),
        ("features", ("feature", "flag", "enable", "disable")),
        ("ui", ("theme", "color", "style", "ui")),
        ("database", ("db", "database", "sql", "query")),
        ("security", ("auth", "security", "login", "oauth")),
        ("logging", ("log", "debug", "stats")),
        ("email", ("mail", "smtp", "email")),
        ("async", ("celery", "async", "worker")),
    )

    for category, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return category
    return "general"


def main() -> None:
    """Generate ``config_schema.json`` next to ``config_defaults.py``.

    Both files live in the ``superset`` directory located one level above this
    script's directory. Exits with status 1 if ``config_defaults.py`` is
    missing; otherwise writes the schema (flat and grouped by category) as
    JSON and prints a short summary.
    """
    package_dir = Path(__file__).parent.parent / "superset"
    config_file = package_dir / "config_defaults.py"

    if not config_file.exists():
        print(f"Error: {config_file} not found")
        sys.exit(1)

    print("Extracting configuration schema...")
    schema = extract_config_schema(config_file)

    # Bucket the entries per category, keeping first-seen category order.
    by_category = {}
    for key, config in schema.items():
        by_category.setdefault(config["category"], {})[key] = config

    metadata = {
        "generated_from": str(config_file),
        "total_configs": len(schema),
        "description": (
            "Superset configuration schema extracted from config_defaults.py"
        ),
    }
    output = {
        "metadata": metadata,
        "configs": schema,
        "by_category": by_category,
    }

    # default=str stringifies any value the JSON encoder cannot handle.
    output_file = package_dir / "config_schema.json"
    with open(output_file, "w") as f:
        json.dump(output, f, indent=2, default=str)

    print("✅ Schema extracted successfully!")
    print(f"📊 Total configurations: {len(schema)}")
    print(f"📂 Categories: {list(by_category)}")
    print(f"💾 Saved to: {output_file}")

    # Per-category counts
    print("\n📈 Category breakdown:")
    for category, configs in by_category.items():
        print(f"   {category}: {len(configs)} configs")


# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()