superset2/docs/scripts/generate-database-docs.mjs

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * This script generates database documentation data from engine spec metadata.
 * It outputs a JSON file that can be imported by React components for rendering.
 *
 * Usage: node scripts/generate-database-docs.mjs
 *
 * The script can run in two modes:
 * 1. With Flask app (full diagnostics) - requires superset to be installed
 * 2. Fallback mode (documentation only) - parses engine spec `metadata` attributes via AST
 */

import { spawnSync } from 'child_process';
import fs from 'fs';
import { createRequire } from 'module';
import path from 'path';
import { fileURLToPath } from 'url';

const require = createRequire(import.meta.url);

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const ROOT_DIR = path.resolve(__dirname, '../..');
const DOCS_DIR = path.resolve(__dirname, '..');
const DATA_OUTPUT_DIR = path.join(DOCS_DIR, 'src/data');
const DATA_OUTPUT_FILE = path.join(DATA_OUTPUT_DIR, 'databases.json');
const MDX_OUTPUT_DIR = path.join(DOCS_DIR, 'docs/databases');
const MDX_SUPPORTED_DIR = path.join(MDX_OUTPUT_DIR, 'supported');
const IMAGES_DIR = path.join(DOCS_DIR, 'static/img/databases');

/**
 * Try to run the full lib.py script with Flask context
 */
function tryRunFullScript() {
  try {
    console.log('Attempting to run lib.py with Flask context...');
    const pythonCode = `
import sys
import json
sys.path.insert(0, '.')
from superset.app import create_app
from superset.db_engine_specs.lib import generate_yaml_docs
app = create_app()
with app.app_context():
    docs = generate_yaml_docs()
    print(json.dumps(docs, default=str))
`;
    const result = spawnSync('python', ['-c', pythonCode], {
      cwd: ROOT_DIR,
      encoding: 'utf-8',
      timeout: 60000,
      maxBuffer: 10 * 1024 * 1024,
      env: { ...process.env, SUPERSET_SECRET_KEY: 'docs-build-key' },
    });

    if (result.error) {
      throw result.error;
    }
    if (result.status !== 0) {
      throw new Error(result.stderr || 'Python script failed');
    }
    return JSON.parse(result.stdout);
  } catch (error) {
    console.log('Full script execution failed, using fallback mode...');
    console.log('  Reason:', error.message?.split('\n')[0] || 'Unknown error');
    return null;
  }
}

/**
 * Extract metadata from individual engine spec files using AST parsing
 * This is the preferred approach - reads directly from spec.metadata attributes
 * Supports metadata inheritance - child classes inherit and merge with parent metadata
 */
function extractEngineSpecMetadata() {
  console.log('Extracting metadata from engine spec files...');
  console.log(`  ROOT_DIR: ${ROOT_DIR}`);

  try {
    const pythonCode = `
import sys
import json
import ast
import os

def eval_node(node):
    """Safely evaluate an AST node as a Python literal."""
    if node is None:
        return None
    if isinstance(node, ast.Constant):
        return node.value
    elif isinstance(node, ast.List):
        return [eval_node(e) for e in node.elts]
    elif isinstance(node, ast.Dict):
        result = {}
        for k, v in zip(node.keys, node.values):
            if k is not None:
                key = eval_node(k)
                if key is not None:
                    result[key] = eval_node(v)
        return result
    elif isinstance(node, ast.Name):
        # Handle True, False, None constants
        if node.id == 'True':
            return True
        elif node.id == 'False':
            return False
        elif node.id == 'None':
            return None
        return node.id
    elif isinstance(node, ast.Attribute):
        # Handle DatabaseCategory.SOMETHING - return just the attribute name
        return node.attr
    elif isinstance(node, ast.BinOp) and isinstance(node.op, ast.Add):
        left, right = eval_node(node.left), eval_node(node.right)
        if isinstance(left, str) and isinstance(right, str):
            return left + right
        return None
    elif isinstance(node, ast.Tuple):
        return tuple(eval_node(e) for e in node.elts)
    elif isinstance(node, ast.JoinedStr):
        # f-strings - just return a placeholder
        return "<f-string>"
    return None

def deep_merge(base, override):
    """Deep merge two dictionaries. Override values take precedence."""
    if base is None:
        return override
    if override is None:
        return base
    if not isinstance(base, dict) or not isinstance(override, dict):
        return override

    # Fields that should NOT be inherited from parent classes
    # - compatible_databases: Each class defines its own compatible DBs
    # - categories: Each class defines its own categories (not extended from parent)
    NON_INHERITABLE_FIELDS = {'compatible_databases', 'categories'}

    result = base.copy()
    # Remove non-inheritable fields from base (they should only come from the class that defines them)
    for field in NON_INHERITABLE_FIELDS:
        result.pop(field, None)

    for key, value in override.items():
        if key in result and isinstance(result[key], dict) and isinstance(value, dict):
            result[key] = deep_merge(result[key], value)
        elif key in result and isinstance(result[key], list) and isinstance(value, list):
            # Extend lists from parent (e.g., drivers)
            result[key] = result[key] + value
        else:
            result[key] = value
    return result

databases = {}
specs_dir = 'superset/db_engine_specs'
errors = []
debug_info = {
    "cwd": os.getcwd(),
    "specs_dir_exists": os.path.isdir(specs_dir),
    "files_checked": 0,
    "classes_found": 0,
    "classes_with_metadata": 0,
    "inherited_metadata": 0,
}

if not os.path.isdir(specs_dir):
    print(json.dumps({"error": f"Directory not found: {specs_dir}", "cwd": os.getcwd()}))
    sys.exit(1)

# First pass: collect all class info (name, bases, metadata)
class_info = {}  # class_name -> {bases: [], metadata: {}, engine_name: str, filename: str}

for filename in sorted(os.listdir(specs_dir)):
    if not filename.endswith('.py') or filename in ('__init__.py', 'lib.py', 'lint_metadata.py'):
        continue

    debug_info["files_checked"] += 1
    filepath = os.path.join(specs_dir, filename)
    try:
        with open(filepath) as f:
            source = f.read()
        tree = ast.parse(source)

        for node in ast.walk(tree):
            if not isinstance(node, ast.ClassDef):
                continue

            # Get base class names
            base_names = []
            for b in node.bases:
                if isinstance(b, ast.Name):
                    base_names.append(b.id)
                elif isinstance(b, ast.Attribute):
                    base_names.append(b.attr)

            is_engine_spec = any('EngineSpec' in name or 'Mixin' in name for name in base_names)
            if not is_engine_spec:
                continue

            # Extract class attributes
            engine_name = None
            metadata = None

            for item in node.body:
                if isinstance(item, ast.Assign):
                    for target in item.targets:
                        if isinstance(target, ast.Name):
                            if target.id == 'engine_name':
                                val = eval_node(item.value)
                                if isinstance(val, str):
                                    engine_name = val
                            elif target.id == 'metadata':
                                metadata = eval_node(item.value)

            # Check for engine attribute with non-empty value to distinguish
            # true base classes from product classes like OceanBaseEngineSpec
            has_non_empty_engine = False
            for item in node.body:
                if isinstance(item, ast.Assign):
                    for target in item.targets:
                        if isinstance(target, ast.Name) and target.id == 'engine':
                            # Check if engine value is non-empty string
                            if isinstance(item.value, ast.Constant):
                                has_non_empty_engine = bool(item.value.value)
                            break

            # True base classes: end with BaseEngineSpec AND don't define engine
            # or have empty engine (like PostgresBaseEngineSpec with engine = "")
            is_true_base = (
                node.name.endswith('BaseEngineSpec') and not has_non_empty_engine
            ) or 'Mixin' in node.name

            # Store class info for inheritance resolution
            class_info[node.name] = {
                'bases': base_names,
                'metadata': metadata,
                'engine_name': engine_name,
                'filename': filename,
                'is_base_or_mixin': is_true_base,
            }
    except Exception as e:
        errors.append(f"{filename}: {str(e)}")

# Second pass: resolve inheritance and build final metadata
def get_inherited_metadata(class_name, visited=None):
    """Recursively get metadata from parent classes."""
    if visited is None:
        visited = set()
    if class_name in visited:
        return {}  # Prevent circular inheritance
    visited.add(class_name)

    info = class_info.get(class_name)
    if not info:
        return {}

    # Start with parent metadata
    inherited = {}
    for base_name in info['bases']:
        parent_metadata = get_inherited_metadata(base_name, visited.copy())
        if parent_metadata:
            inherited = deep_merge(inherited, parent_metadata)

    # Merge with own metadata (own takes precedence)
    if info['metadata']:
        inherited = deep_merge(inherited, info['metadata'])

    return inherited

for class_name, info in class_info.items():
    # Skip base classes and mixins
    if info['is_base_or_mixin']:
        continue

    debug_info["classes_found"] += 1

    # Get final metadata with inheritance
    final_metadata = get_inherited_metadata(class_name)

    # Remove compatible_databases if not defined by this class (it's not inheritable)
    own_metadata = info['metadata'] or {}
    if 'compatible_databases' not in own_metadata and 'compatible_databases' in final_metadata:
        del final_metadata['compatible_databases']

    # Track if we inherited anything
    if final_metadata and final_metadata != own_metadata:
        debug_info["inherited_metadata"] += 1

    # Use class name as fallback for engine_name
    display_name = info['engine_name'] or class_name.replace('EngineSpec', '').replace('_', ' ')

    if final_metadata and isinstance(final_metadata, dict) and display_name:
        debug_info["classes_with_metadata"] += 1
        databases[display_name] = {
            'engine': display_name.lower().replace(' ', '_'),
            'engine_name': display_name,
            'module': info['filename'][:-3],  # Remove .py extension
            'documentation': final_metadata,
            'time_grains': {},
            'score': 0,
            'max_score': 0,
            'joins': True,
            'subqueries': True,
            'supports_dynamic_schema': False,
            'supports_catalog': False,
            'supports_dynamic_catalog': False,
            'ssh_tunneling': False,
            'query_cancelation': False,
            'supports_file_upload': False,
            'user_impersonation': False,
            'query_cost_estimation': False,
            'sql_validation': False,
        }

if errors and not databases:
    print(json.dumps({"error": "Parse errors", "details": errors, "debug": debug_info}), file=sys.stderr)

# Print debug info to stderr for troubleshooting
print(json.dumps(debug_info), file=sys.stderr)

print(json.dumps(databases, default=str))
`;
    const result = spawnSync('python3', ['-c', pythonCode], {
      cwd: ROOT_DIR,
      encoding: 'utf-8',
      timeout: 30000,
      maxBuffer: 10 * 1024 * 1024,
    });

    if (result.error) {
      throw result.error;
    }
    // Log debug info from stderr
    if (result.stderr) {
      console.log('Python debug info:', result.stderr.trim());
    }
    if (result.status !== 0) {
      throw new Error(result.stderr || 'Python script failed');
    }
    const databases = JSON.parse(result.stdout);
    if (Object.keys(databases).length === 0) {
      throw new Error('No metadata found in engine specs');
    }

    console.log(`Extracted metadata from ${Object.keys(databases).length} engine specs`);
    return databases;
  } catch (err) {
    console.log('Engine spec metadata extraction failed:', err.message);
    return null;
  }
}

/**
 * Build statistics from the database data
 */
function buildStatistics(databases) {
  const stats = {
    totalDatabases: Object.keys(databases).length,
    withDocumentation: 0,
    withConnectionString: 0,
    withDrivers: 0,
    withAuthMethods: 0,
    supportsJoins: 0,
    supportsSubqueries: 0,
    supportsDynamicSchema: 0,
    supportsCatalog: 0,
    averageScore: 0,
    maxScore: 0,
    byCategory: {},
  };

  let totalScore = 0;

  for (const [name, db] of Object.entries(databases)) {
    const docs = db.documentation || {};

    if (Object.keys(docs).length > 0) stats.withDocumentation++;
    if (docs.connection_string || docs.drivers?.length > 0)
      stats.withConnectionString++;
    if (docs.drivers?.length > 0) stats.withDrivers++;
    if (docs.authentication_methods?.length > 0) stats.withAuthMethods++;
    if (db.joins) stats.supportsJoins++;
    if (db.subqueries) stats.supportsSubqueries++;
    if (db.supports_dynamic_schema) stats.supportsDynamicSchema++;
    if (db.supports_catalog) stats.supportsCatalog++;

    totalScore += db.score || 0;
    if (db.max_score > stats.maxScore) stats.maxScore = db.max_score;

    // Use categories from documentation metadata (computed by Python)
    // Each database can belong to multiple categories
    const categories = docs.categories || ['OTHER'];
    for (const cat of categories) {
      // Map category constant names to display names
      const categoryDisplayNames = {
        'CLOUD_AWS': 'Cloud - AWS',
        'CLOUD_GCP': 'Cloud - Google',
        'CLOUD_AZURE': 'Cloud - Azure',
        'CLOUD_DATA_WAREHOUSES': 'Cloud Data Warehouses',
        'APACHE_PROJECTS': 'Apache Projects',
        'TRADITIONAL_RDBMS': 'Traditional RDBMS',
        'ANALYTICAL_DATABASES': 'Analytical Databases',
        'SEARCH_NOSQL': 'Search & NoSQL',
        'QUERY_ENGINES': 'Query Engines',
        'TIME_SERIES': 'Time Series Databases',
        'OTHER': 'Other Databases',
        'OPEN_SOURCE': 'Open Source',
        'HOSTED_OPEN_SOURCE': 'Hosted Open Source',
        'PROPRIETARY': 'Proprietary',
      };
      const displayName = categoryDisplayNames[cat] || cat;
      if (!stats.byCategory[displayName]) {
        stats.byCategory[displayName] = [];
      }
      stats.byCategory[displayName].push(name);
    }
  }

  stats.averageScore = Math.round(totalScore / stats.totalDatabases);

  return stats;
}

/**
 * Convert database name to a URL-friendly slug
 */
function toSlug(name) {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}

/**
 * Generate MDX content for a single database page
 */
function generateDatabaseMDX(name, db) {
  const description = db.documentation?.description || `Documentation for ${name} database connection.`;
  const shortDesc = description
    .slice(0, 160)
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"');

  return `---
title: ${name}
sidebar_label: ${name}
description: "${shortDesc}"
hide_title: true
---

{/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
*/}

import { DatabasePage } from '@site/src/components/databases';
import databaseData from '@site/src/data/databases.json';

<DatabasePage name="${name}" database={databaseData.databases["${name}"]} />
`;
}

/**
 * Generate the index MDX for the databases overview
 */
function generateIndexMDX(statistics, usedFlaskContext = true) {
  const fallbackNotice = usedFlaskContext ? '' : `
:::info Developer Note
This documentation was built without Flask context, so feature diagnostics (scores, time grain support, etc.)
may not reflect actual database capabilities. For full diagnostics, build docs locally with:

\`\`\`bash
cd docs && npm run gen-db-docs
\`\`\`

This requires a working Superset development environment.
:::

`;

  return `---
title: Connecting to Databases
sidebar_label: Overview
sidebar_position: 1
---

{/*
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
*/}

import { DatabaseIndex } from '@site/src/components/databases';
import databaseData from '@site/src/data/databases.json';

# Connecting to Databases

Superset does not ship bundled with connectivity to databases. The main step in connecting
Superset to a database is to **install the proper database driver(s)** in your environment.

:::note
You'll need to install the required packages for the database you want to use as your metadata database
as well as the packages needed to connect to the databases you want to access through Superset.
For information about setting up Superset's metadata database, please refer to
installation documentations ([Docker Compose](/docs/installation/docker-compose), [Kubernetes](/docs/installation/kubernetes))
:::

## Supported Databases

Superset supports **${statistics.totalDatabases} databases** with varying levels of feature support.
Click on any database name to see detailed documentation including connection strings,
authentication methods, and configuration options.

<DatabaseIndex data={databaseData} />

## Installing Database Drivers

Superset requires a Python [DB-API database driver](https://peps.python.org/pep-0249/)
and a [SQLAlchemy dialect](https://docs.sqlalchemy.org/en/20/dialects/) to be installed for
each database engine you want to connect to.

### Installing Drivers in Docker

For Docker deployments, create a \`requirements-local.txt\` file in the \`docker\` directory:

\`\`\`bash
# Create the requirements file
touch ./docker/requirements-local.txt

# Add your driver (e.g., for PostgreSQL)
echo "psycopg2-binary" >> ./docker/requirements-local.txt
\`\`\`

Then restart your containers. The drivers will be installed automatically.

### Installing Drivers with pip

For non-Docker installations:

\`\`\`bash
pip install <driver-package>
\`\`\`

See individual database pages for the specific driver packages needed.

## Connecting Through the UI

1. Go to **Settings → Data: Database Connections**
2. Click **+ DATABASE**
3. Select your database type or enter a SQLAlchemy URI
4. Click **Test Connection** to verify
5. Click **Connect** to save

## Contributing

To add or update database documentation, add a \`metadata\` attribute to your engine spec class in
\`superset/db_engine_specs/\`. Documentation is auto-generated from these metadata attributes.

See [METADATA_STATUS.md](https://github.com/apache/superset/blob/master/superset/db_engine_specs/METADATA_STATUS.md)
for the current status of database documentation and the [README](https://github.com/apache/superset/blob/master/superset/db_engine_specs/README.md) for the metadata schema.
${fallbackNotice}`;
}

const README_PATH = path.join(ROOT_DIR, 'README.md');
const README_START_MARKER = '<!-- SUPPORTED_DATABASES_START -->';
const README_END_MARKER = '<!-- SUPPORTED_DATABASES_END -->';

/**
 * Read image dimensions, with fallback SVG viewBox parsing for cases where
 * image-size can't handle SVG width/height attributes (e.g., scientific notation).
 */
function getImageDimensions(imgPath) {
  const sizeOf = require('image-size');
  try {
    const dims = sizeOf(imgPath);
    // image-size may misparse SVG attributes (e.g. width="1e3" → 1).
    // Fall back to viewBox parsing if a dimension looks wrong.
    if (dims.type === 'svg' && (dims.width < 2 || dims.height < 2)) {
      const content = fs.readFileSync(imgPath, 'utf-8');
      const vbMatch = content.match(/viewBox=["']([^"']+)["']/);
      if (vbMatch) {
        const parts = vbMatch[1].trim().split(/[\s,]+/).map(Number);
        if (parts.length >= 4 && parts[2] > 0 && parts[3] > 0) {
          return { width: parts[2], height: parts[3] };
        }
      }
    }
    if (dims.width > 0 && dims.height > 0) {
      return { width: dims.width, height: dims.height };
    }
  } catch { /* fall through */ }
  return null;
}

/**
 * Compute display dimensions that fit within a bounding box while preserving
 * the image's aspect ratio. Enforces a minimum height so very wide logos
 * remain legible.
 */
function fitToBoundingBox(imgWidth, imgHeight, maxWidth, maxHeight, minHeight) {
  const ratio = imgWidth / imgHeight;
  // Start at max height, compute width
  let h = maxHeight;
  let w = h * ratio;
  // If too wide, cap width and reduce height
  if (w > maxWidth) {
    w = maxWidth;
    h = w / ratio;
  }
  // If height fell below minimum, enforce minimum (allow width to exceed max)
  if (h < minHeight) {
    h = minHeight;
    w = h * ratio;
  }
  return { width: Math.round(w), height: Math.round(h) };
}

/**
 * Generate the database logos HTML for README.md
 * Only includes databases that have logos and homepage URLs.
 * Deduplicates by logo filename to match the docs homepage behavior.
 * Reads actual image dimensions to preserve aspect ratios.
 */
function generateReadmeLogos(databases) {
  // Get databases with logos and homepage URLs, sorted alphabetically,
  // deduplicated by logo filename (matches docs homepage logic in index.tsx)
  const seenLogos = new Set();
  const dbsWithLogos = Object.entries(databases)
    .filter(([, db]) => db.documentation?.logo && db.documentation?.homepage_url)
    .sort(([a], [b]) => a.localeCompare(b))
    .filter(([, db]) => {
      const logo = db.documentation.logo;
      if (seenLogos.has(logo)) return false;
      seenLogos.add(logo);
      return true;
    });

  if (dbsWithLogos.length === 0) {
    return '';
  }

  const MAX_WIDTH = 150;
  const MAX_HEIGHT = 40;
  const MIN_HEIGHT = 24;

  const DOCS_BASE = 'https://superset.apache.org/docs/databases/supported';

  // Generate linked logo tags with aspect-ratio-preserving dimensions
  const logoTags = dbsWithLogos.map(([name, db]) => {
    const logo = db.documentation.logo;
    const slug = name.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '');
    const imgPath = path.join(IMAGES_DIR, logo);

    const dims = getImageDimensions(imgPath);
    let sizeAttrs;
    if (dims) {
      const { width, height } = fitToBoundingBox(dims.width, dims.height, MAX_WIDTH, MAX_HEIGHT, MIN_HEIGHT);
      sizeAttrs = `width="${width}" height="${height}"`;
    } else {
      console.warn(`  Could not read dimensions for ${logo}, using height-only fallback`);
      sizeAttrs = `height="${MAX_HEIGHT}"`;
    }

    const img = `<img src="docs/static/img/databases/${logo}" alt="${name}" ${sizeAttrs} />`;
    return `  <a href="${DOCS_BASE}/${slug}" title="${name}">${img}</a>`;
  });

  // Use &nbsp; between logos for spacing (GitHub strips style/class attributes)
  return `<p align="center">
${logoTags.join(' &nbsp;\n')}
</p>`;
}

/**
 * Update the README.md with generated database logos
 */
function updateReadme(databases) {
  if (!fs.existsSync(README_PATH)) {
    console.log('README.md not found, skipping update');
    return false;
  }

  const content = fs.readFileSync(README_PATH, 'utf-8');

  // Check if markers exist
  if (!content.includes(README_START_MARKER) || !content.includes(README_END_MARKER)) {
    console.log('README.md missing database markers, skipping update');
    console.log(`  Add ${README_START_MARKER} and ${README_END_MARKER} to enable auto-generation`);
    return false;
  }

  // Generate new logos section
  const logosHtml = generateReadmeLogos(databases);

  // Replace content between markers
  const pattern = new RegExp(
    `${README_START_MARKER}[\\s\\S]*?${README_END_MARKER}`,
    'g'
  );
  const newContent = content.replace(
    pattern,
    `${README_START_MARKER}\n${logosHtml}\n${README_END_MARKER}`
  );

  if (newContent !== content) {
    fs.writeFileSync(README_PATH, newContent);
    console.log('Updated README.md database logos');
    return true;
  }

  console.log('README.md database logos unchanged');
  return false;
}

/**
 * Extract custom_errors from engine specs for troubleshooting documentation
 * Returns a map of module names to their custom errors
 */
function extractCustomErrors() {
  console.log('Extracting custom_errors from engine specs...');

  try {
    const scriptPath = path.join(__dirname, 'extract_custom_errors.py');
    const result = spawnSync('python3', [scriptPath], {
      cwd: ROOT_DIR,
      encoding: 'utf-8',
      timeout: 30000,
      maxBuffer: 10 * 1024 * 1024,
    });

    if (result.error) {
      throw result.error;
    }
    if (result.status !== 0) {
      throw new Error(result.stderr || 'Python script failed');
    }

    const customErrors = JSON.parse(result.stdout);
    const moduleCount = Object.keys(customErrors).length;
    const errorCount = Object.values(customErrors).reduce((sum, classes) =>
      sum + Object.values(classes).reduce((s, errs) => s + errs.length, 0), 0);
    console.log(`  Found ${errorCount} custom errors across ${moduleCount} modules`);
    return customErrors;
  } catch (err) {
    console.log('  Could not extract custom_errors:', err.message);
    return null;
  }
}

/**
 * Merge custom_errors into database documentation
 * Maps by module name since that's how both datasets are keyed
 */
function mergeCustomErrors(databases, customErrors) {
  if (!customErrors) return;

  let mergedCount = 0;

  for (const [, db] of Object.entries(databases)) {
    if (!db.module) continue;
    // Normalize module name: Flask mode uses full path (superset.db_engine_specs.postgres),
    // but customErrors is keyed by file stem (postgres)
    const moduleName = db.module.split('.').pop();
    if (!customErrors[moduleName]) continue;

    // Get all errors from all classes in this module
    const moduleErrors = customErrors[moduleName];
    const allErrors = [];

    for (const classErrors of Object.values(moduleErrors)) {
      allErrors.push(...classErrors);
    }

    if (allErrors.length > 0) {
      // Add to documentation
      db.documentation = db.documentation || {};
      db.documentation.custom_errors = allErrors;
      mergedCount++;
    }
  }

  if (mergedCount > 0) {
    console.log(`  Merged custom_errors into ${mergedCount} database docs`);
  }
}

/**
 * Load existing database data if available
 */
function loadExistingData() {
  if (!fs.existsSync(DATA_OUTPUT_FILE)) {
    return null;
  }

  try {
    const content = fs.readFileSync(DATA_OUTPUT_FILE, 'utf-8');
    return JSON.parse(content);
  } catch (error) {
    console.log('Could not load existing data:', error.message);
    return null;
  }
}

/**
 * Merge new documentation with existing diagnostics
 * Preserves score, time_grains, and feature flags from existing data
 */
function mergeWithExistingDiagnostics(newDatabases, existingData) {
  if (!existingData?.databases) return newDatabases;

  const diagnosticFields = [
    'score', 'max_score', 'time_grains', 'joins', 'subqueries',
    'supports_dynamic_schema', 'supports_catalog', 'supports_dynamic_catalog',
    'ssh_tunneling', 'query_cancelation', 'supports_file_upload',
    'user_impersonation', 'query_cost_estimation', 'sql_validation'
  ];

  for (const [name, db] of Object.entries(newDatabases)) {
    const existingDb = existingData.databases[name];
    if (existingDb && existingDb.score > 0) {
      // Preserve diagnostics from existing data
      for (const field of diagnosticFields) {
        if (existingDb[field] !== undefined) {
          db[field] = existingDb[field];
        }
      }
    }
  }

  const preserved = Object.values(newDatabases).filter(d => d.score > 0).length;
  if (preserved > 0) {
    console.log(`Preserved diagnostics for ${preserved} databases from existing data`);
  }

  return newDatabases;
}

/**
 * Main function
 */
async function main() {
  console.log('Generating database documentation...\n');

  // Ensure output directories exist
  if (!fs.existsSync(DATA_OUTPUT_DIR)) {
    fs.mkdirSync(DATA_OUTPUT_DIR, { recursive: true });
  }
  if (!fs.existsSync(MDX_OUTPUT_DIR)) {
    fs.mkdirSync(MDX_OUTPUT_DIR, { recursive: true });
  }

  // Load existing data for potential merge
  const existingData = loadExistingData();

  // Try sources in order of preference:
  // 1. Full script with Flask context (richest data with diagnostics)
  // 2. Engine spec metadata files (works in CI without Flask)
  let databases = tryRunFullScript();
  let usedFlaskContext = !!databases;

  if (!databases) {
    // Extract from engine spec metadata (preferred for CI)
    databases = extractEngineSpecMetadata();
  }

  if (!databases || Object.keys(databases).length === 0) {
    console.error('Failed to generate database documentation data.');
    console.error('Could not extract from Flask app or engine spec metadata.');
    process.exit(1);
  }

  console.log(`Processed ${Object.keys(databases).length} databases\n`);

  // Check if new data has scores; if not, preserve existing diagnostics
  const hasNewScores = Object.values(databases).some((db) => db.score > 0);
  if (!hasNewScores && existingData) {
    databases = mergeWithExistingDiagnostics(databases, existingData);
  }

  // Extract and merge custom_errors for troubleshooting documentation
  const customErrors = extractCustomErrors();
  mergeCustomErrors(databases, customErrors);

  // Build statistics
  const statistics = buildStatistics(databases);

  // Create the final output structure
  const output = {
    generated: new Date().toISOString(),
    statistics,
    databases,
  };

  // Write the JSON file (with trailing newline for POSIX compliance)
  fs.writeFileSync(DATA_OUTPUT_FILE, JSON.stringify(output, null, 2) + '\n');
  console.log(`Generated: ${path.relative(DOCS_DIR, DATA_OUTPUT_FILE)}`);


  // Ensure supported directory exists
  if (!fs.existsSync(MDX_SUPPORTED_DIR)) {
    fs.mkdirSync(MDX_SUPPORTED_DIR, { recursive: true });
  }

  // Clean up old MDX files that are no longer in the database list
  console.log(`\nCleaning up old MDX files in ${path.relative(DOCS_DIR, MDX_SUPPORTED_DIR)}/`);
  const existingMdxFiles = fs.readdirSync(MDX_SUPPORTED_DIR).filter(f => f.endsWith('.mdx'));
  const validSlugs = new Set(Object.keys(databases).map(name => `${toSlug(name)}.mdx`));
  let removedCount = 0;
  for (const file of existingMdxFiles) {
    if (!validSlugs.has(file)) {
      fs.unlinkSync(path.join(MDX_SUPPORTED_DIR, file));
      removedCount++;
    }
  }
  if (removedCount > 0) {
    console.log(`  Removed ${removedCount} outdated MDX files`);
  }

  // Generate individual MDX files for each database in supported/ subdirectory
  console.log(`\nGenerating MDX files in ${path.relative(DOCS_DIR, MDX_SUPPORTED_DIR)}/`);

  let mdxCount = 0;
  for (const [name, db] of Object.entries(databases)) {
    const slug = toSlug(name);
    const mdxContent = generateDatabaseMDX(name, db);
    const mdxPath = path.join(MDX_SUPPORTED_DIR, `${slug}.mdx`);
    fs.writeFileSync(mdxPath, mdxContent);
    mdxCount++;
  }
  console.log(`  Generated ${mdxCount} database pages`);

  // Generate index page in parent databases/ directory
  const indexContent = generateIndexMDX(statistics, usedFlaskContext);
  const indexPath = path.join(MDX_OUTPUT_DIR, 'index.mdx');
  fs.writeFileSync(indexPath, indexContent);
  console.log(`  Generated index page`);

  // Generate _category_.json for databases/ directory
  const categoryJson = {
    label: 'Databases',
    position: 1,
    link: {
      type: 'doc',
      id: 'databases/index',
    },
  };
  fs.writeFileSync(
    path.join(MDX_OUTPUT_DIR, '_category_.json'),
    JSON.stringify(categoryJson, null, 2) + '\n'
  );

  // Generate _category_.json for supported/ subdirectory (collapsible)
  const supportedCategoryJson = {
    label: 'Supported Databases',
    position: 2,
    collapsed: true,
    collapsible: true,
  };
  fs.writeFileSync(
    path.join(MDX_SUPPORTED_DIR, '_category_.json'),
    JSON.stringify(supportedCategoryJson, null, 2) + '\n'
  );
  console.log(`  Generated _category_.json files`);

  // Update README.md database logos (only when explicitly requested)
  if (process.env.UPDATE_README === 'true' || process.argv.includes('--update-readme')) {
    console.log('');
    updateReadme(databases);
  }

  console.log(`\nStatistics:`);
  console.log(`  Total databases: ${statistics.totalDatabases}`);
  console.log(`  With documentation: ${statistics.withDocumentation}`);
  console.log(`  With connection strings: ${statistics.withConnectionString}`);
  console.log(`  Categories: ${Object.keys(statistics.byCategory).length}`);

  console.log('\nDone!');
}

main().catch(console.error);