feat(docs): auto-generate database documentation from lib.py (#36805)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-19 16:14:52 +00:00 · 2026-01-21 10:54:01 -08:00
parent 2c1a33fd32
commit b460ca94c6
133 changed files with 11531 additions and 2123 deletions
--- a/superset/db_engine_specs/lib.py
+++ b/superset/db_engine_specs/lib.py
@@ -17,8 +17,11 @@

 from __future__ import annotations

+import os
 from typing import Any

+import yaml
+
 from superset.constants import TimeGrain
 from superset.db_engine_specs import load_engine_specs
 from superset.db_engine_specs.base import BaseEngineSpec
@@ -662,11 +665,169 @@ def generate_table() -> list[list[Any]]:
    return rows


+def infer_category(name: str) -> str:
+    """
+    Infer database category from name for unmigrated specs.
+
+    This is used as a fallback when a spec doesn't have category in metadata.
+    Once all specs are migrated to use metadata.category, this can be removed.
+    """
+    from superset.db_engine_specs.base import DatabaseCategory
+
+    name_lower = name.lower()
+
+    if "aws" in name_lower or "amazon" in name_lower:
+        return DatabaseCategory.CLOUD_AWS
+    if "google" in name_lower or "bigquery" in name_lower:
+        return DatabaseCategory.CLOUD_GCP
+    if "azure" in name_lower or "microsoft" in name_lower:
+        return DatabaseCategory.CLOUD_AZURE
+    if "snowflake" in name_lower or "databricks" in name_lower:
+        return DatabaseCategory.CLOUD_DATA_WAREHOUSES
+    if (
+        "apache" in name_lower
+        or "druid" in name_lower
+        or "hive" in name_lower
+        or "spark" in name_lower
+    ):
+        return DatabaseCategory.APACHE_PROJECTS
+    if (
+        "postgres" in name_lower
+        or "mysql" in name_lower
+        or "sqlite" in name_lower
+        or "mariadb" in name_lower
+    ):
+        return DatabaseCategory.TRADITIONAL_RDBMS
+    if (
+        "clickhouse" in name_lower
+        or "vertica" in name_lower
+        or "starrocks" in name_lower
+    ):
+        return DatabaseCategory.ANALYTICAL_DATABASES
+    if "elastic" in name_lower or "solr" in name_lower or "couchbase" in name_lower:
+        return DatabaseCategory.SEARCH_NOSQL
+    if "trino" in name_lower or "presto" in name_lower:
+        return DatabaseCategory.QUERY_ENGINES
+
+    return DatabaseCategory.OTHER
+
+
+def get_documentation_metadata(spec: type[BaseEngineSpec], name: str) -> dict[str, Any]:
+    """
+    Get documentation metadata for a database engine spec.
+
+    Documentation metadata should be defined in the spec's `metadata` attribute.
+    If absent or empty, falls back to the URI placeholder and an inferred category.
+    """
+    # Check if the spec has metadata attribute with content
+    if spec_metadata := getattr(spec, "metadata", {}):
+        result = dict(spec_metadata)
+        # Ensure category is present
+        if "category" not in result:
+            result["category"] = infer_category(name)
+        return result
+
+    # Minimal fallback for specs without metadata
+    return {
+        "pypi_packages": [],
+        "connection_string": getattr(spec, "sqlalchemy_uri_placeholder", ""),
+        "category": infer_category(name),
+    }
+
+
+def generate_yaml_docs(output_dir: str | None = None) -> dict[str, dict[str, Any]]:
+    """
+    Generate YAML documentation files for all database engine specs.
+
+    Args:
+        output_dir: Directory to write YAML files. If None or empty, nothing is written.
+
+    Returns:
+        Dictionary mapping database names to their full documentation data.
+    """
+    all_docs: dict[str, dict[str, Any]] = {}
+
+    for spec in sorted(load_engine_specs(), key=get_name):
+        # Skip non-superset modules (3rd party)
+        if not spec.__module__.startswith("superset"):
+            continue
+
+        name = get_name(spec)
+        doc_data = diagnose(spec)
+
+        # Get documentation metadata (spec.metadata, or a minimal inferred fallback)
+        doc_data["documentation"] = get_documentation_metadata(spec, name)
+
+        # Add engine spec metadata
+        doc_data["engine"] = spec.engine
+        doc_data["engine_name"] = name
+        doc_data["engine_aliases"] = list(getattr(spec, "engine_aliases", set()))
+        doc_data["default_driver"] = getattr(spec, "default_driver", None)
+        doc_data["supports_file_upload"] = spec.supports_file_upload
+        doc_data["supports_dynamic_schema"] = spec.supports_dynamic_schema
+        doc_data["supports_catalog"] = spec.supports_catalog
+
+        all_docs[name] = doc_data
+
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Write individual YAML files for each database
+        for name, data in all_docs.items():
+            # Create a safe filename
+            safe_name = name.lower().replace(" ", "-").replace(".", "")
+            filepath = os.path.join(output_dir, f"{safe_name}.yaml")
+            with open(filepath, "w") as f:
+                yaml.dump(
+                    {name: data},
+                    f,
+                    default_flow_style=False,
+                    sort_keys=False,
+                    allow_unicode=True,
+                )
+
+        # Also write a combined index file
+        index_filepath = os.path.join(output_dir, "_index.yaml")
+        with open(index_filepath, "w") as f:
+            yaml.dump(
+                all_docs,
+                f,
+                default_flow_style=False,
+                sort_keys=False,
+                allow_unicode=True,
+            )
+
+        print(f"Generated {len(all_docs)} YAML files in {output_dir}")
+
+    return all_docs
+
+
 if __name__ == "__main__":
+    import argparse
+
    from superset.app import create_app

+    parser = argparse.ArgumentParser(description="Generate database documentation")
+    parser.add_argument(
+        "--format",
+        choices=["markdown", "yaml"],
+        default="markdown",
+        help="Output format (default: markdown)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Directory for YAML output files",
+    )
+    args = parser.parse_args()
+
    app = create_app()
    with app.app_context():
-        output = generate_feature_tables()
-
-    print(output)
+        if args.format == "yaml":
+            output_dir = args.output_dir or "docs/static/databases"
+            docs = generate_yaml_docs(output_dir)
+            print(f"\nGenerated documentation for {len(docs)} databases")
+        else:
+            output = generate_feature_tables()
+            print(output)