mirror of
https://github.com/apache/superset.git
synced 2026-04-07 18:35:15 +00:00
feat(docs): auto-generate database documentation from lib.py (#36805)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
867
docs/scripts/generate-database-docs.mjs
Normal file
867
docs/scripts/generate-database-docs.mjs
Normal file
@@ -0,0 +1,867 @@
|
||||
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * This script generates database documentation data from engine spec metadata.
 * It outputs a JSON file that can be imported by React components for rendering.
 *
 * Usage: node scripts/generate-database-docs.mjs
 *
 * The script can run in two modes:
 * 1. With Flask app (full diagnostics) - requires superset to be installed
 * 2. Fallback mode (documentation only) - parses engine spec `metadata` attributes via AST
 */
|
||||
|
||||
import { spawnSync } from 'child_process';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';

// Resolve this script's location (ES modules have no __filename/__dirname).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Repository root: two levels up from docs/scripts/.
const ROOT_DIR = path.resolve(__dirname, '../..');
// The docs/ site directory (Docusaurus root).
const DOCS_DIR = path.resolve(__dirname, '..');
// Generated JSON data consumed by the React database components at build time.
const DATA_OUTPUT_DIR = path.join(DOCS_DIR, 'src/data');
const DATA_OUTPUT_FILE = path.join(DATA_OUTPUT_DIR, 'databases.json');
// Generated MDX pages: index under docs/databases/, one page per database
// under docs/databases/supported/.
const MDX_OUTPUT_DIR = path.join(DOCS_DIR, 'docs/databases');
const MDX_SUPPORTED_DIR = path.join(MDX_OUTPUT_DIR, 'supported');
|
||||
|
||||
/**
 * Try to run the full lib.py script with Flask context.
 *
 * Spawns a Python subprocess that creates the Superset Flask app and calls
 * generate_yaml_docs(), which yields the richest data (scores, time grains,
 * feature diagnostics). Requires a working Superset dev environment; in CI
 * this typically fails and the caller falls back to AST extraction.
 *
 * @returns {object|null} Parsed database docs keyed by engine name, or null
 *   when the subprocess could not run or exited non-zero.
 */
function tryRunFullScript() {
  try {
    console.log('Attempting to run lib.py with Flask context...');
    const pythonCode = `
import sys
import json
sys.path.insert(0, '.')
from superset.app import create_app
from superset.db_engine_specs.lib import generate_yaml_docs
app = create_app()
with app.app_context():
    docs = generate_yaml_docs()
    print(json.dumps(docs, default=str))
`;
    const result = spawnSync('python', ['-c', pythonCode], {
      cwd: ROOT_DIR,
      encoding: 'utf-8',
      // Flask app creation can be slow; allow up to a minute.
      timeout: 60000,
      // Output can exceed Node's 1 MB default buffer.
      maxBuffer: 10 * 1024 * 1024,
      // create_app() requires a SECRET_KEY; the value is irrelevant for docs.
      env: { ...process.env, SUPERSET_SECRET_KEY: 'docs-build-key' },
    });

    if (result.error) {
      throw result.error;
    }
    if (result.status !== 0) {
      throw new Error(result.stderr || 'Python script failed');
    }
    return JSON.parse(result.stdout);
  } catch (error) {
    // Expected in environments without a Superset install; signal fallback.
    console.log('Full script execution failed, using fallback mode...');
    console.log(' Reason:', error.message?.split('\n')[0] || 'Unknown error');
    return null;
  }
}
|
||||
|
||||
/**
 * Extract metadata from individual engine spec files using AST parsing.
 * This is the preferred approach - reads directly from spec.metadata attributes
 * without importing Superset, so it works in CI without Flask.
 * Supports metadata inheritance - child classes inherit and merge with parent metadata.
 *
 * The heavy lifting is done by an embedded Python program (run via python3)
 * that parses every file in superset/db_engine_specs with the `ast` module.
 *
 * @returns {object|null} Map of display name -> database entry (documentation
 *   metadata plus zeroed diagnostic fields), or null when extraction fails.
 */
function extractEngineSpecMetadata() {
  console.log('Extracting metadata from engine spec files...');
  console.log(` ROOT_DIR: ${ROOT_DIR}`);

  try {
    // NOTE: this template literal is Python source. Python f-strings use
    // single braces, so nothing here collides with JS ${} interpolation.
    const pythonCode = `
import sys
import json
import ast
import os

def eval_node(node):
    """Safely evaluate an AST node as a Python literal."""
    if node is None:
        return None
    if isinstance(node, ast.Constant):
        return node.value
    elif isinstance(node, ast.List):
        return [eval_node(e) for e in node.elts]
    elif isinstance(node, ast.Dict):
        result = {}
        for k, v in zip(node.keys, node.values):
            if k is not None:
                key = eval_node(k)
                if key is not None:
                    result[key] = eval_node(v)
        return result
    elif isinstance(node, ast.Name):
        # Handle True, False, None constants
        if node.id == 'True':
            return True
        elif node.id == 'False':
            return False
        elif node.id == 'None':
            return None
        return node.id
    elif isinstance(node, ast.Attribute):
        # Handle DatabaseCategory.SOMETHING - return just the attribute name
        return node.attr
    elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
        left, right = eval_node(node.left), eval_node(node.right)
        if isinstance(left, str) and isinstance(right, str):
            return left + right
        return None
    elif isinstance(node, ast.Tuple):
        return tuple(eval_node(e) for e in node.elts)
    elif isinstance(node, ast.JoinedStr):
        # f-strings - just return a placeholder
        return "<f-string>"
    return None

def deep_merge(base, override):
    """Deep merge two dictionaries. Override values take precedence."""
    if base is None:
        return override
    if override is None:
        return base
    if not isinstance(base, dict) or not isinstance(override, dict):
        return override

    # Fields that should NOT be inherited from parent classes
    # - compatible_databases: Each class defines its own compatible DBs
    # - categories: Each class defines its own categories (not extended from parent)
    NON_INHERITABLE_FIELDS = {'compatible_databases', 'categories'}

    result = base.copy()
    # Remove non-inheritable fields from base (they should only come from the class that defines them)
    for field in NON_INHERITABLE_FIELDS:
        result.pop(field, None)

    for key, value in override.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = deep_merge(result[key], value)
        elif key in result and isinstance(result[key], list) and isinstance(value, list):
            # Extend lists from parent (e.g., drivers)
            result[key] = result[key] + value
        else:
            result[key] = value
    return result

databases = {}
specs_dir = 'superset/db_engine_specs'
errors = []
debug_info = {
    "cwd": os.getcwd(),
    "specs_dir_exists": os.path.isdir(specs_dir),
    "files_checked": 0,
    "classes_found": 0,
    "classes_with_metadata": 0,
    "inherited_metadata": 0,
}

if not os.path.isdir(specs_dir):
    print(json.dumps({"error": f"Directory not found: {specs_dir}", "cwd": os.getcwd()}))
    sys.exit(1)

# First pass: collect all class info (name, bases, metadata)
class_info = {}  # class_name -> {bases: [], metadata: {}, engine_name: str, filename: str}

for filename in sorted(os.listdir(specs_dir)):
    if not filename.endswith('.py') or filename in ('__init__.py', 'lib.py', 'lint_metadata.py'):
        continue

    debug_info["files_checked"] += 1
    filepath = os.path.join(specs_dir, filename)
    try:
        with open(filepath) as f:
            source = f.read()
        tree = ast.parse(source)

        for node in ast.walk(tree):
            if not isinstance(node, ast.ClassDef):
                continue

            # Get base class names
            base_names = []
            for b in node.bases:
                if isinstance(b, ast.Name):
                    base_names.append(b.id)
                elif isinstance(b, ast.Attribute):
                    base_names.append(b.attr)

            is_engine_spec = any('EngineSpec' in name or 'Mixin' in name for name in base_names)
            if not is_engine_spec:
                continue

            # Extract class attributes
            engine_name = None
            metadata = None

            for item in node.body:
                if isinstance(item, ast.Assign):
                    for target in item.targets:
                        if isinstance(target, ast.Name):
                            if target.id == 'engine_name':
                                val = eval_node(item.value)
                                if isinstance(val, str):
                                    engine_name = val
                            elif target.id == 'metadata':
                                metadata = eval_node(item.value)

            # Check for engine attribute with non-empty value to distinguish
            # true base classes from product classes like OceanBaseEngineSpec
            has_non_empty_engine = False
            for item in node.body:
                if isinstance(item, ast.Assign):
                    for target in item.targets:
                        if isinstance(target, ast.Name) and target.id == 'engine':
                            # Check if engine value is non-empty string
                            if isinstance(item.value, ast.Constant):
                                has_non_empty_engine = bool(item.value.value)
                            break

            # True base classes: end with BaseEngineSpec AND don't define engine
            # or have empty engine (like PostgresBaseEngineSpec with engine = "")
            is_true_base = (
                node.name.endswith('BaseEngineSpec') and not has_non_empty_engine
            ) or 'Mixin' in node.name

            # Store class info for inheritance resolution
            class_info[node.name] = {
                'bases': base_names,
                'metadata': metadata,
                'engine_name': engine_name,
                'filename': filename,
                'is_base_or_mixin': is_true_base,
            }
    except Exception as e:
        errors.append(f"(unknown): {str(e)}")

# Second pass: resolve inheritance and build final metadata
def get_inherited_metadata(class_name, visited=None):
    """Recursively get metadata from parent classes."""
    if visited is None:
        visited = set()
    if class_name in visited:
        return {}  # Prevent circular inheritance
    visited.add(class_name)

    info = class_info.get(class_name)
    if not info:
        return {}

    # Start with parent metadata
    inherited = {}
    for base_name in info['bases']:
        parent_metadata = get_inherited_metadata(base_name, visited.copy())
        if parent_metadata:
            inherited = deep_merge(inherited, parent_metadata)

    # Merge with own metadata (own takes precedence)
    if info['metadata']:
        inherited = deep_merge(inherited, info['metadata'])

    return inherited

for class_name, info in class_info.items():
    # Skip base classes and mixins
    if info['is_base_or_mixin']:
        continue

    debug_info["classes_found"] += 1

    # Get final metadata with inheritance
    final_metadata = get_inherited_metadata(class_name)

    # Remove compatible_databases if not defined by this class (it's not inheritable)
    own_metadata = info['metadata'] or {}
    if 'compatible_databases' not in own_metadata and 'compatible_databases' in final_metadata:
        del final_metadata['compatible_databases']

    # Track if we inherited anything
    if final_metadata and final_metadata != own_metadata:
        debug_info["inherited_metadata"] += 1

    # Use class name as fallback for engine_name
    display_name = info['engine_name'] or class_name.replace('EngineSpec', '').replace('_', ' ')

    if final_metadata and isinstance(final_metadata, dict) and display_name:
        debug_info["classes_with_metadata"] += 1
        databases[display_name] = {
            'engine': display_name.lower().replace(' ', '_'),
            'engine_name': display_name,
            'module': info['filename'][:-3],  # Remove .py extension
            'documentation': final_metadata,
            'time_grains': {},
            'score': 0,
            'max_score': 0,
            'joins': True,
            'subqueries': True,
            'supports_dynamic_schema': False,
            'supports_catalog': False,
            'supports_dynamic_catalog': False,
            'ssh_tunneling': False,
            'query_cancelation': False,
            'supports_file_upload': False,
            'user_impersonation': False,
            'query_cost_estimation': False,
            'sql_validation': False,
        }

if errors and not databases:
    print(json.dumps({"error": "Parse errors", "details": errors, "debug": debug_info}), file=sys.stderr)

# Print debug info to stderr for troubleshooting
print(json.dumps(debug_info), file=sys.stderr)

print(json.dumps(databases, default=str))
`;
    const result = spawnSync('python3', ['-c', pythonCode], {
      cwd: ROOT_DIR,
      encoding: 'utf-8',
      timeout: 30000,
      // Output can exceed Node's 1 MB default buffer.
      maxBuffer: 10 * 1024 * 1024,
    });

    if (result.error) {
      throw result.error;
    }
    // Log debug info from stderr
    if (result.stderr) {
      console.log('Python debug info:', result.stderr.trim());
    }
    if (result.status !== 0) {
      throw new Error(result.stderr || 'Python script failed');
    }
    const databases = JSON.parse(result.stdout);
    if (Object.keys(databases).length === 0) {
      throw new Error('No metadata found in engine specs');
    }

    console.log(`Extracted metadata from ${Object.keys(databases).length} engine specs`);
    return databases;
  } catch (err) {
    // Caller treats null as "no data"; main() aborts if all sources fail.
    console.log('Engine spec metadata extraction failed:', err.message);
    return null;
  }
}
|
||||
|
||||
/**
 * Build aggregate statistics from the database data.
 *
 * @param {Record<string, object>} databases - Map of display name -> database
 *   entry (each with optional `documentation` metadata plus diagnostic flags
 *   such as `joins`, `score`, `max_score`).
 * @returns {object} Counts of documentation coverage and feature support,
 *   score summary, and `byCategory`: category display name -> database names.
 */
function buildStatistics(databases) {
  // Map category constant names (as used in engine spec metadata) to display
  // names. Hoisted out of the loops below - the previous version rebuilt this
  // literal for every category of every database.
  const categoryDisplayNames = {
    'CLOUD_AWS': 'Cloud - AWS',
    'CLOUD_GCP': 'Cloud - Google',
    'CLOUD_AZURE': 'Cloud - Azure',
    'CLOUD_DATA_WAREHOUSES': 'Cloud Data Warehouses',
    'APACHE_PROJECTS': 'Apache Projects',
    'TRADITIONAL_RDBMS': 'Traditional RDBMS',
    'ANALYTICAL_DATABASES': 'Analytical Databases',
    'SEARCH_NOSQL': 'Search & NoSQL',
    'QUERY_ENGINES': 'Query Engines',
    'TIME_SERIES': 'Time Series Databases',
    'OTHER': 'Other Databases',
    'OPEN_SOURCE': 'Open Source',
    'HOSTED_OPEN_SOURCE': 'Hosted Open Source',
    'PROPRIETARY': 'Proprietary',
  };

  const stats = {
    totalDatabases: Object.keys(databases).length,
    withDocumentation: 0,
    withConnectionString: 0,
    withDrivers: 0,
    withAuthMethods: 0,
    supportsJoins: 0,
    supportsSubqueries: 0,
    supportsDynamicSchema: 0,
    supportsCatalog: 0,
    averageScore: 0,
    maxScore: 0,
    byCategory: {},
  };

  let totalScore = 0;

  for (const [name, db] of Object.entries(databases)) {
    const docs = db.documentation || {};

    if (Object.keys(docs).length > 0) stats.withDocumentation++;
    if (docs.connection_string || docs.drivers?.length > 0)
      stats.withConnectionString++;
    if (docs.drivers?.length > 0) stats.withDrivers++;
    if (docs.authentication_methods?.length > 0) stats.withAuthMethods++;
    if (db.joins) stats.supportsJoins++;
    if (db.subqueries) stats.supportsSubqueries++;
    if (db.supports_dynamic_schema) stats.supportsDynamicSchema++;
    if (db.supports_catalog) stats.supportsCatalog++;

    totalScore += db.score || 0;
    if (db.max_score > stats.maxScore) stats.maxScore = db.max_score;

    // Use categories from documentation metadata (computed by Python).
    // Each database can belong to multiple categories.
    const categories = docs.categories || ['OTHER'];
    for (const cat of categories) {
      const displayName = categoryDisplayNames[cat] || cat;
      if (!stats.byCategory[displayName]) {
        stats.byCategory[displayName] = [];
      }
      stats.byCategory[displayName].push(name);
    }
  }

  // Guard against division by zero: the previous version produced NaN for an
  // empty database map.
  stats.averageScore =
    stats.totalDatabases > 0 ? Math.round(totalScore / stats.totalDatabases) : 0;

  return stats;
}
|
||||
|
||||
/**
 * Convert a database display name into a URL-friendly slug.
 * Lowercases, collapses every run of non-alphanumeric characters into a
 * single hyphen, and trims a leading/trailing hyphen.
 *
 * @param {string} name - Database display name (e.g. "Apache Druid").
 * @returns {string} Slug suitable for a filename or URL segment.
 */
function toSlug(name) {
  const lowered = name.toLowerCase();
  const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-');
  return hyphenated.replace(/^-|-$/g, '');
}
|
||||
|
||||
/**
 * Generate MDX content for a single database page.
 *
 * The page is a thin wrapper that delegates rendering to the DatabasePage
 * React component, fed from the generated databases.json.
 *
 * @param {string} name - Database display name (also the lookup key in
 *   databaseData.databases).
 * @param {object} db - Database entry; db.documentation.description (if any)
 *   seeds the frontmatter description.
 * @returns {string} Complete MDX file content.
 */
function generateDatabaseMDX(name, db) {
  const description = db.documentation?.description || `Documentation for ${name} database connection.`;
  // Truncate to 160 chars for the meta description, escape for a double-quoted
  // YAML scalar, and collapse newlines - a raw newline inside the quoted
  // frontmatter value would previously break the YAML header.
  const shortDesc = description
    .slice(0, 160)
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    .replace(/\r?\n/g, ' ');

  return `---
title: ${name}
sidebar_label: ${name}
description: "${shortDesc}"
hide_title: true
---

{/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
*/}

import { DatabasePage } from '@site/src/components/databases';
import databaseData from '@site/src/data/databases.json';

<DatabasePage name="${name}" database={databaseData.databases["${name}"]} />
`;
}
|
||||
|
||||
/**
 * Generate the index MDX for the databases overview page.
 *
 * @param {object} statistics - Output of buildStatistics(); only
 *   statistics.totalDatabases is interpolated into the page.
 * @param {boolean} [usedFlaskContext=true] - When false, a developer note is
 *   appended explaining that diagnostics came from the AST fallback and how to
 *   rebuild with full Flask diagnostics.
 * @returns {string} Complete MDX file content for docs/databases/index.mdx.
 */
function generateIndexMDX(statistics, usedFlaskContext = true) {
  const fallbackNotice = usedFlaskContext ? '' : `
:::info Developer Note
This documentation was built without Flask context, so feature diagnostics (scores, time grain support, etc.)
may not reflect actual database capabilities. For full diagnostics, build docs locally with:

\`\`\`bash
cd docs && npm run gen-db-docs
\`\`\`

This requires a working Superset development environment.
:::

`;

  return `---
title: Connecting to Databases
sidebar_label: Overview
sidebar_position: 1
---

{/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
*/}

import { DatabaseIndex } from '@site/src/components/databases';
import databaseData from '@site/src/data/databases.json';

# Connecting to Databases

Superset does not ship bundled with connectivity to databases. The main step in connecting
Superset to a database is to **install the proper database driver(s)** in your environment.

:::note
You'll need to install the required packages for the database you want to use as your metadata database
as well as the packages needed to connect to the databases you want to access through Superset.
For information about setting up Superset's metadata database, please refer to
installation documentations ([Docker Compose](/docs/installation/docker-compose), [Kubernetes](/docs/installation/kubernetes))
:::

## Supported Databases

Superset supports **${statistics.totalDatabases} databases** with varying levels of feature support.
Click on any database name to see detailed documentation including connection strings,
authentication methods, and configuration options.

<DatabaseIndex data={databaseData} />

## Installing Database Drivers

Superset requires a Python [DB-API database driver](https://peps.python.org/pep-0249/)
and a [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/20/dialects/) to be installed for
each database engine you want to connect to.

### Installing Drivers in Docker

For Docker deployments, create a \`requirements-local.txt\` file in the \`docker\` directory:

\`\`\`bash
# Create the requirements file
touch ./docker/requirements-local.txt

# Add your driver (e.g., for PostgreSQL)
echo "psycopg2-binary" >> ./docker/requirements-local.txt
\`\`\`

Then restart your containers. The drivers will be installed automatically.

### Installing Drivers with pip

For non-Docker installations:

\`\`\`bash
pip install <driver-package>
\`\`\`

See individual database pages for the specific driver packages needed.

## Connecting Through the UI

1. Go to **Settings → Data: Database Connections**
2. Click **+ DATABASE**
3. Select your database type or enter a SQLAlchemy URI
4. Click **Test Connection** to verify
5. Click **Connect** to save

## Contributing

To add or update database documentation, add a \`metadata\` attribute to your engine spec class in
\`superset/db_engine_specs/\`. Documentation is auto-generated from these metadata attributes.

See [METADATA_STATUS.md](https://github.com/apache/superset/blob/master/superset/db_engine_specs/METADATA_STATUS.md)
for the current status of database documentation and the [README](https://github.com/apache/superset/blob/master/superset/db_engine_specs/README.md) for the metadata schema.
${fallbackNotice}`;
}
|
||||
|
||||
// Repository-root README.md whose "supported databases" logo section is kept
// in sync by updateReadme().
const README_PATH = path.join(ROOT_DIR, 'README.md');
// HTML comment markers delimiting the auto-generated logo section in README.md.
const README_START_MARKER = '<!-- SUPPORTED_DATABASES_START -->';
const README_END_MARKER = '<!-- SUPPORTED_DATABASES_END -->';
|
||||
|
||||
/**
 * Generate the database logos HTML block for README.md.
 * Only databases whose documentation metadata defines a logo are included,
 * sorted alphabetically by display name.
 *
 * @param {Record<string, object>} databases - Database map keyed by display name.
 * @returns {string} A `<p>` block of `<img>` tags, or '' when no logos exist.
 */
function generateReadmeLogos(databases) {
  // Keep only entries that declare a logo, in alphabetical order.
  const withLogos = Object.entries(databases).filter(
    ([, db]) => db.documentation?.logo,
  );
  withLogos.sort(([nameA], [nameB]) => nameA.localeCompare(nameB));

  if (withLogos.length === 0) {
    return '';
  }

  // One <img> tag per database, served from the docs site.
  const tags = [];
  for (const [name, db] of withLogos) {
    const { logo } = db.documentation;
    const alt = name.toLowerCase().replace(/\s+/g, '-');
    tags.push(
      ` <img src="https://superset.apache.org/img/databases/${logo}" alt="${alt}" border="0" width="80" height="40" class="database-logo" />`,
    );
  }

  return `<p align="center">\n${tags.join('\n')}\n</p>`;
}
|
||||
|
||||
/**
 * Update README.md with the generated database logos section.
 * No-op (returns false) when README.md is missing, the marker comments are
 * absent, or the generated section is already up to date.
 *
 * @param {Record<string, object>} databases - Database map keyed by display name.
 * @returns {boolean} true when README.md was rewritten.
 */
function updateReadme(databases) {
  if (!fs.existsSync(README_PATH)) {
    console.log('README.md not found, skipping update');
    return false;
  }

  const original = fs.readFileSync(README_PATH, 'utf-8');

  const hasMarkers =
    original.includes(README_START_MARKER) &&
    original.includes(README_END_MARKER);
  if (!hasMarkers) {
    console.log('README.md missing database markers, skipping update');
    console.log(` Add ${README_START_MARKER} and ${README_END_MARKER} to enable auto-generation`);
    return false;
  }

  const logosHtml = generateReadmeLogos(databases);

  // Swap everything between (and including) the markers for a fresh section.
  const markerPattern = new RegExp(
    `${README_START_MARKER}[\\s\\S]*?${README_END_MARKER}`,
    'g',
  );
  const updated = original.replace(
    markerPattern,
    `${README_START_MARKER}\n${logosHtml}\n${README_END_MARKER}`,
  );

  if (updated === original) {
    console.log('README.md database logos unchanged');
    return false;
  }

  fs.writeFileSync(README_PATH, updated);
  console.log('Updated README.md database logos');
  return true;
}
|
||||
|
||||
/**
 * Load previously generated database data from databases.json, if present.
 *
 * @returns {object|null} Parsed data, or null when the file is missing or
 *   cannot be parsed.
 */
function loadExistingData() {
  if (!fs.existsSync(DATA_OUTPUT_FILE)) {
    return null;
  }

  try {
    const raw = fs.readFileSync(DATA_OUTPUT_FILE, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    // A corrupt file is not fatal; we simply regenerate from scratch.
    console.log('Could not load existing data:', error.message);
    return null;
  }
}
|
||||
|
||||
/**
 * Merge new documentation with existing diagnostics.
 * When the fresh data lacks scores (fallback extraction), copies score,
 * time_grains, and feature flags from a previous run's data. Mutates and
 * returns newDatabases.
 *
 * @param {Record<string, object>} newDatabases - Freshly extracted databases.
 * @param {object|null} existingData - Previously generated output (may be null).
 * @returns {Record<string, object>} newDatabases with diagnostics merged in.
 */
function mergeWithExistingDiagnostics(newDatabases, existingData) {
  if (!existingData?.databases) return newDatabases;

  const diagnosticFields = [
    'score', 'max_score', 'time_grains', 'joins', 'subqueries',
    'supports_dynamic_schema', 'supports_catalog', 'supports_dynamic_catalog',
    'ssh_tunneling', 'query_cancelation', 'supports_file_upload',
    'user_impersonation', 'query_cost_estimation', 'sql_validation'
  ];

  for (const [name, db] of Object.entries(newDatabases)) {
    const previous = existingData.databases[name];
    // Only trust prior entries that actually carried diagnostics.
    if (!previous || !(previous.score > 0)) continue;
    for (const field of diagnosticFields) {
      if (previous[field] !== undefined) {
        db[field] = previous[field];
      }
    }
  }

  const preserved = Object.values(newDatabases).filter((d) => d.score > 0).length;
  if (preserved > 0) {
    console.log(`Preserved diagnostics for ${preserved} databases from existing data`);
  }

  return newDatabases;
}
|
||||
|
||||
/**
 * Main entry point: gather database metadata (Flask context first, AST
 * fallback second), write src/data/databases.json, and regenerate the MDX
 * pages and category files under docs/databases/.
 */
async function main() {
  console.log('Generating database documentation...\n');

  // Ensure output directories exist
  if (!fs.existsSync(DATA_OUTPUT_DIR)) {
    fs.mkdirSync(DATA_OUTPUT_DIR, { recursive: true });
  }
  if (!fs.existsSync(MDX_OUTPUT_DIR)) {
    fs.mkdirSync(MDX_OUTPUT_DIR, { recursive: true });
  }

  // Load existing data for potential merge
  const existingData = loadExistingData();

  // Try sources in order of preference:
  // 1. Full script with Flask context (richest data with diagnostics)
  // 2. Engine spec metadata files (works in CI without Flask)
  let databases = tryRunFullScript();
  let usedFlaskContext = !!databases;

  if (!databases) {
    // Extract from engine spec metadata (preferred for CI)
    databases = extractEngineSpecMetadata();
  }

  if (!databases || Object.keys(databases).length === 0) {
    console.error('Failed to generate database documentation data.');
    console.error('Could not extract from Flask app or engine spec metadata.');
    process.exit(1);
  }

  console.log(`Processed ${Object.keys(databases).length} databases\n`);

  // Check if new data has scores; if not, preserve existing diagnostics
  // from the previous databases.json so fallback builds don't lose them.
  const hasNewScores = Object.values(databases).some((db) => db.score > 0);
  if (!hasNewScores && existingData) {
    databases = mergeWithExistingDiagnostics(databases, existingData);
  }

  // Build statistics
  const statistics = buildStatistics(databases);

  // Create the final output structure
  const output = {
    generated: new Date().toISOString(),
    statistics,
    databases,
  };

  // Write the JSON file (with trailing newline for POSIX compliance)
  fs.writeFileSync(DATA_OUTPUT_FILE, JSON.stringify(output, null, 2) + '\n');
  console.log(`Generated: ${path.relative(DOCS_DIR, DATA_OUTPUT_FILE)}`);

  // Ensure supported directory exists
  if (!fs.existsSync(MDX_SUPPORTED_DIR)) {
    fs.mkdirSync(MDX_SUPPORTED_DIR, { recursive: true });
  }

  // Clean up old MDX files that are no longer in the database list
  console.log(`\nCleaning up old MDX files in ${path.relative(DOCS_DIR, MDX_SUPPORTED_DIR)}/`);
  const existingMdxFiles = fs.readdirSync(MDX_SUPPORTED_DIR).filter(f => f.endsWith('.mdx'));
  const validSlugs = new Set(Object.keys(databases).map(name => `${toSlug(name)}.mdx`));
  let removedCount = 0;
  for (const file of existingMdxFiles) {
    if (!validSlugs.has(file)) {
      fs.unlinkSync(path.join(MDX_SUPPORTED_DIR, file));
      removedCount++;
    }
  }
  if (removedCount > 0) {
    console.log(` Removed ${removedCount} outdated MDX files`);
  }

  // Generate individual MDX files for each database in supported/ subdirectory
  console.log(`\nGenerating MDX files in ${path.relative(DOCS_DIR, MDX_SUPPORTED_DIR)}/`);

  let mdxCount = 0;
  for (const [name, db] of Object.entries(databases)) {
    const slug = toSlug(name);
    const mdxContent = generateDatabaseMDX(name, db);
    const mdxPath = path.join(MDX_SUPPORTED_DIR, `${slug}.mdx`);
    fs.writeFileSync(mdxPath, mdxContent);
    mdxCount++;
  }
  console.log(` Generated ${mdxCount} database pages`);

  // Generate index page in parent databases/ directory
  const indexContent = generateIndexMDX(statistics, usedFlaskContext);
  const indexPath = path.join(MDX_OUTPUT_DIR, 'index.mdx');
  fs.writeFileSync(indexPath, indexContent);
  console.log(` Generated index page`);

  // Generate _category_.json for databases/ directory
  const categoryJson = {
    label: 'Databases',
    position: 1,
    link: {
      type: 'doc',
      id: 'databases/index',
    },
  };
  fs.writeFileSync(
    path.join(MDX_OUTPUT_DIR, '_category_.json'),
    JSON.stringify(categoryJson, null, 2) + '\n'
  );

  // Generate _category_.json for supported/ subdirectory (collapsible)
  const supportedCategoryJson = {
    label: 'Supported Databases',
    position: 2,
    collapsed: true,
    collapsible: true,
  };
  fs.writeFileSync(
    path.join(MDX_SUPPORTED_DIR, '_category_.json'),
    JSON.stringify(supportedCategoryJson, null, 2) + '\n'
  );
  console.log(` Generated _category_.json files`);

  // Update README.md database logos (only when explicitly requested)
  if (process.env.UPDATE_README === 'true' || process.argv.includes('--update-readme')) {
    console.log('');
    updateReadme(databases);
  }

  console.log(`\nStatistics:`);
  console.log(` Total databases: ${statistics.totalDatabases}`);
  console.log(` With documentation: ${statistics.withDocumentation}`);
  console.log(` With connection strings: ${statistics.withConnectionString}`);
  console.log(` Categories: ${Object.keys(statistics.byCategory).length}`);

  console.log('\nDone!');
}
|
||||
|
||||
// Run the generator. Exit non-zero on failure so CI does not silently publish
// broken docs - the previous `.catch(console.error)` logged the error but left
// the exit code at 0.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user