fix(docs): read capability flags from engine specs in database docs generator (#39449)

Co-authored-by: Superset Dev <dev@superset.apache.org> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-10 02:15:50 +00:00 · 2026-05-08 09:13:08 -07:00
parent 69fbbfd7ce
commit 5bde86785f
3 changed files with 7302 additions and 3562 deletions
--- a/docs/scripts/generate-database-docs.mjs
+++ b/docs/scripts/generate-database-docs.mjs
@@ -141,6 +141,47 @@ def eval_node(node):
        return "<f-string>"
    return None

+def static_return_bool(func_node):
+    """
+    Statically resolve a method's return value to a bool when possible.
+
+    Returns True/False for functions whose body is (effectively) a single
+    \`return True\` / \`return False\` — allowing a leading docstring and
+    ignoring pure-comment/pass statements. Returns None for anything more
+    complex (conditional returns, computed values, no return, etc.).
+
+    Used by \`has_implicit_cancel\` handling: \`diagnose()\` in lib.py calls
+    the method and checks the return value, so an override that explicitly
+    returns False must NOT be treated as enabling query cancelation.
+    """
+    returns = []
+    other_logic = False
+    docstring_skipped = False
+    for stmt in func_node.body:
+        # Skip docstring (only the FIRST expression statement that is a
+        # string constant — later bare string literals are not docstrings
+        # and should count as non-trivial logic).
+        if (not docstring_skipped
+                and isinstance(stmt, ast.Expr)
+                and isinstance(stmt.value, ast.Constant)
+                and isinstance(stmt.value.value, str)):
+            docstring_skipped = True
+            continue
+        if isinstance(stmt, ast.Pass):
+            continue
+        if isinstance(stmt, ast.Return):
+            returns.append(stmt)
+            continue
+        # Any other statement (if/for/assign/etc.) means control flow is
+        # non-trivial; bail out to be conservative.
+        other_logic = True
+        break
+    if other_logic or len(returns) != 1:
+        return None
+    val = eval_node(returns[0].value)
+    return val if isinstance(val, bool) else None
+
+
 def deep_merge(base, override):
    """Deep merge two dictionaries. Override values take precedence."""
    if base is None:
@@ -186,8 +227,55 @@ if not os.path.isdir(specs_dir):
    print(json.dumps({"error": f"Directory not found: {specs_dir}", "cwd": os.getcwd()}))
    sys.exit(1)

-# First pass: collect all class info (name, bases, metadata)
-class_info = {}  # class_name -> {bases: [], metadata: {}, engine_name: str, filename: str}
+# Capability flag attributes with their defaults from BaseEngineSpec
+CAP_ATTR_DEFAULTS = {
+    'supports_dynamic_schema': False,
+    'supports_catalog': False,
+    'supports_dynamic_catalog': False,
+    'disable_ssh_tunneling': False,
+    'supports_file_upload': True,
+    'allows_joins': True,
+    'allows_subqueries': True,
+}
+
+# Maps source capability attribute -> output field name used in databases.json.
+# When a cap attr is assigned an unevaluable expression (e.g.
+# allows_joins = is_feature_enabled("DRUID_JOINS")), the JS layer uses this
+# mapping to preserve the corresponding field from the previously-generated
+# JSON rather than silently inheriting an incorrect parent default.
+CAP_ATTR_TO_OUTPUT_FIELD = {
+    'allows_joins': 'joins',
+    'allows_subqueries': 'subqueries',
+    'supports_dynamic_schema': 'supports_dynamic_schema',
+    'supports_catalog': 'supports_catalog',
+    'supports_dynamic_catalog': 'supports_dynamic_catalog',
+    'disable_ssh_tunneling': 'ssh_tunneling',
+    'supports_file_upload': 'supports_file_upload',
+}
+
+# Methods that indicate a capability when overridden by a non-BaseEngineSpec class.
+# Mirrors the has_custom_method checks in superset/db_engine_specs/lib.py.
+# cancel_query / has_implicit_cancel -> query_cancelation
+#   (diagnose() checks cancel_query override OR has_implicit_cancel() == True;
+#    base has_implicit_cancel returns False, so overriding it is the static
+#    equivalent of that method returning True. get_cancel_query_id is NOT
+#    part of the diagnose() heuristic and is intentionally excluded.)
+# estimate_statement_cost / estimate_query_cost -> query_cost_estimation
+# impersonate_user / update_impersonation_config / get_url_for_impersonation -> user_impersonation
+# validate_sql -> sql_validation (not used yet; validation is engine-based)
+CAP_METHODS = {
+    'cancel_query', 'has_implicit_cancel',
+    'estimate_statement_cost', 'estimate_query_cost',
+    'impersonate_user', 'update_impersonation_config', 'get_url_for_impersonation',
+    'validate_sql',
+}
+
+# Only the literal BaseEngineSpec is excluded from method-override tracking.
+# Intermediate base classes (e.g. PrestoBaseEngineSpec) do count as overrides.
+TRUE_BASE_CLASS = 'BaseEngineSpec'
+
+# First pass: collect all class info (name, bases, metadata, cap_attrs, direct_methods)
+class_info = {}  # class_name -> {bases: [], metadata: {}, engine_name: str, filename: str, ...}

 for filename in sorted(os.listdir(specs_dir)):
    if not filename.endswith('.py') or filename in ('__init__.py', 'lib.py', 'lint_metadata.py'):
@@ -218,30 +306,54 @@ for filename in sorted(os.listdir(specs_dir)):

            # Extract class attributes
            engine_name = None
+            engine_attr = None
            metadata = None
+            cap_attrs = {}   # capability flag attributes defined directly in this class
+            # Cap attrs assigned via expressions we can't statically resolve
+            # (e.g. is_feature_enabled("FLAG")). Tracked so the JS layer can
+            # fall back to the previously-generated databases.json value
+            # rather than inherit a parent default that would be wrong.
+            unresolved_cap_attrs = set()
+            direct_methods = set()  # capability methods defined directly in this class

            for item in node.body:
                if isinstance(item, ast.Assign):
                    for target in item.targets:
-                        if isinstance(target, ast.Name):
-                            if target.id == 'engine_name':
-                                val = eval_node(item.value)
-                                if isinstance(val, str):
-                                    engine_name = val
-                            elif target.id == 'metadata':
-                                metadata = eval_node(item.value)
+                        if not isinstance(target, ast.Name):
+                            continue
+                        if target.id == 'engine_name':
+                            val = eval_node(item.value)
+                            if isinstance(val, str):
+                                engine_name = val
+                        elif target.id == 'engine':
+                            val = eval_node(item.value)
+                            if isinstance(val, str):
+                                engine_attr = val
+                        elif target.id == 'metadata':
+                            metadata = eval_node(item.value)
+                        elif target.id in CAP_ATTR_DEFAULTS:
+                            val = eval_node(item.value)
+                            if isinstance(val, bool):
+                                cap_attrs[target.id] = val
+                            else:
+                                # Unevaluable expression — defer to JS fallback.
+                                unresolved_cap_attrs.add(target.id)
+                elif isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                    if item.name in CAP_METHODS:
+                        # has_implicit_cancel is special: diagnose() uses the
+                        # method's RETURN VALUE, not just its presence. If the
+                        # override statically returns False, treat it as if
+                        # the method weren't overridden so query_cancelation
+                        # matches diagnose(). Unresolvable / True / anything
+                        # else falls through as an override (conservative).
+                        if item.name == 'has_implicit_cancel':
+                            if static_return_bool(item) is False:
+                                continue
+                        direct_methods.add(item.name)

            # Check for engine attribute with non-empty value to distinguish
            # true base classes from product classes like OceanBaseEngineSpec
-            has_non_empty_engine = False
-            for item in node.body:
-                if isinstance(item, ast.Assign):
-                    for target in item.targets:
-                        if isinstance(target, ast.Name) and target.id == 'engine':
-                            # Check if engine value is non-empty string
-                            if isinstance(item.value, ast.Constant):
-                                has_non_empty_engine = bool(item.value.value)
-                            break
+            has_non_empty_engine = engine_attr is not None and bool(engine_attr)

            # True base classes: end with BaseEngineSpec AND don't define engine
            # or have empty engine (like PostgresBaseEngineSpec with engine = "")
@@ -254,13 +366,18 @@ for filename in sorted(os.listdir(specs_dir)):
                'bases': base_names,
                'metadata': metadata,
                'engine_name': engine_name,
+                'engine': engine_attr,
                'filename': filename,
                'is_base_or_mixin': is_true_base,
+                'cap_attrs': cap_attrs,
+                'unresolved_cap_attrs': unresolved_cap_attrs,
+                'direct_methods': direct_methods,
            }
    except Exception as e:
        errors.append(f"{filename}: {str(e)}")

-# Second pass: resolve inheritance and build final metadata
+# Second pass: resolve inheritance and build final metadata + capability flags
+
 def get_inherited_metadata(class_name, visited=None):
    """Recursively get metadata from parent classes."""
    if visited is None:
@@ -286,6 +403,64 @@ def get_inherited_metadata(class_name, visited=None):

    return inherited

+def get_resolved_caps(class_name, visited=None):
+    """
+    Resolve capability flags and method overrides with inheritance.
+
+    Returns (attr_values, unresolved, methods):
+      - attr_values: {attr: bool} for attrs where the nearest MRO assignment
+        was a literal bool. Defaults are applied at the call site.
+      - unresolved: attrs where the nearest MRO assignment was an unevaluable
+        expression (e.g. is_feature_enabled("FLAG")). The JS layer falls
+        back to the previously-generated JSON value for these.
+      - methods: capability methods defined directly in some non-base ancestor,
+        matching the has_custom_method() logic in db_engine_specs/lib.py.
+
+    attr_values and unresolved are disjoint — an attr is in at most one.
+    """
+    if visited is None:
+        visited = set()
+    if class_name in visited:
+        return {}, set(), set()
+    visited.add(class_name)
+
+    info = class_info.get(class_name)
+    if not info:
+        return {}, set(), set()
+
+    attr_values = {}
+    unresolved = set()
+    resolved_methods = set()
+
+    # Collect from parents, iterating right-to-left so leftmost bases win
+    # (matches Python MRO: for class C(A, B), A's attributes take precedence).
+    for base_name in reversed(info['bases']):
+        p_vals, p_unres, p_meth = get_resolved_caps(base_name, visited.copy())
+        # A parent's literal assignments overwrite whatever we inherited so far.
+        for attr, val in p_vals.items():
+            attr_values[attr] = val
+            unresolved.discard(attr)
+        # A parent's unresolved assignments likewise take precedence.
+        for attr in p_unres:
+            unresolved.add(attr)
+            attr_values.pop(attr, None)
+        resolved_methods.update(p_meth)
+
+    # Apply this class's own assignments (override parents).
+    for attr, val in info['cap_attrs'].items():
+        attr_values[attr] = val
+        unresolved.discard(attr)
+    for attr in info['unresolved_cap_attrs']:
+        unresolved.add(attr)
+        attr_values.pop(attr, None)
+
+    # Accumulate method overrides, but skip the literal BaseEngineSpec
+    # (its implementations are stubs; only non-base overrides count).
+    if class_name != TRUE_BASE_CLASS:
+        resolved_methods.update(info['direct_methods'])
+
+    return attr_values, unresolved, resolved_methods
+
 for class_name, info in class_info.items():
    # Skip base classes and mixins
    if info['is_base_or_mixin']:
@@ -310,7 +485,14 @@ for class_name, info in class_info.items():

    if final_metadata and isinstance(final_metadata, dict) and display_name:
        debug_info["classes_with_metadata"] += 1
-        databases[display_name] = {
+
+        # Resolve capability flags from Python source
+        attr_values, unresolved_caps, cap_methods = get_resolved_caps(class_name)
+        cap_attrs = dict(CAP_ATTR_DEFAULTS)
+        cap_attrs.update(attr_values)
+        engine_attr = info.get('engine') or ''
+
+        entry = {
            'engine': display_name.lower().replace(' ', '_'),
            'engine_name': display_name,
            'module': info['filename'][:-3],  # Remove .py extension
@@ -318,19 +500,40 @@ for class_name, info in class_info.items():
            'time_grains': {},
            'score': 0,
            'max_score': 0,
-            'joins': True,
-            'subqueries': True,
-            'supports_dynamic_schema': False,
-            'supports_catalog': False,
-            'supports_dynamic_catalog': False,
-            'ssh_tunneling': False,
-            'query_cancelation': False,
-            'supports_file_upload': False,
-            'user_impersonation': False,
-            'query_cost_estimation': False,
-            'sql_validation': False,
+            # Capability flags read from engine spec class attributes/methods
+            'joins': cap_attrs['allows_joins'],
+            'subqueries': cap_attrs['allows_subqueries'],
+            'supports_dynamic_schema': cap_attrs['supports_dynamic_schema'],
+            'supports_catalog': cap_attrs['supports_catalog'],
+            'supports_dynamic_catalog': cap_attrs['supports_dynamic_catalog'],
+            'ssh_tunneling': not cap_attrs['disable_ssh_tunneling'],
+            'supports_file_upload': cap_attrs['supports_file_upload'],
+            # Method-based flags: True only when a non-base class overrides them.
+            # Matches diagnose() in lib.py: cancel_query override OR
+            # has_implicit_cancel() returning True (which, given the base
+            # returns False, is equivalent to overriding has_implicit_cancel).
+            'query_cancelation': bool({'cancel_query', 'has_implicit_cancel'} & cap_methods),
+            'query_cost_estimation': bool({'estimate_statement_cost', 'estimate_query_cost'} & cap_methods),
+            # SQL validation is implemented in external validator classes keyed by engine name
+            'sql_validation': engine_attr in {'presto', 'postgresql'},
+            'user_impersonation': bool(
+                {'impersonate_user', 'update_impersonation_config', 'get_url_for_impersonation'} & cap_methods
+            ),
        }

+        # Tell the JS layer which output fields were populated from the
+        # BaseEngineSpec default because the source assignment was an
+        # unevaluable expression; those get overridden from existing JSON.
+        unresolved_fields = sorted(
+            CAP_ATTR_TO_OUTPUT_FIELD[attr]
+            for attr in unresolved_caps
+            if attr in CAP_ATTR_TO_OUTPUT_FIELD
+        )
+        if unresolved_fields:
+            entry['_unresolved_cap_fields'] = unresolved_fields
+
+        databases[display_name] = entry
+
 if errors and not databases:
    print(json.dumps({"error": "Parse errors", "details": errors, "debug": debug_info}), file=sys.stderr)

@@ -851,24 +1054,52 @@ function loadExistingData() {
  }
 }

+/**
+ * Fall back to the previously-generated databases.json for capability flags
+ * whose source assignment couldn't be statically resolved (e.g.
+ * `allows_joins = is_feature_enabled("DRUID_JOINS")`). The Python extractor
+ * flags these via the internal `_unresolved_cap_fields` marker; without this
+ * fallback those fields would silently inherit the BaseEngineSpec default
+ * and disagree with runtime behavior. The marker is stripped before output.
+ */
+function fallbackUnresolvedCaps(newDatabases, existingData) {
+  for (const [name, db] of Object.entries(newDatabases)) {
+    const unresolved = db._unresolved_cap_fields;
+    if (!unresolved || unresolved.length === 0) {
+      delete db._unresolved_cap_fields;
+      continue;
+    }
+    const existingDb = existingData?.databases?.[name];
+    if (existingDb) {
+      for (const field of unresolved) {
+        if (existingDb[field] !== undefined) {
+          db[field] = existingDb[field];
+        }
+      }
+    }
+    delete db._unresolved_cap_fields;
+  }
+  return newDatabases;
+}
+
 /**
 * Merge new documentation with existing diagnostics
- * Preserves score, time_grains, and feature flags from existing data
+ * Preserves score, max_score, and time_grains from existing data (these require
+ * Flask context to generate and cannot be derived from static source analysis).
+ * Capability flags (joins, supports_catalog, etc.) are NOT preserved here — they
+ * are read fresh from the Python engine spec source by extractEngineSpecMetadata(),
+ * with a separate fallback for expression-based assignments (see fallbackUnresolvedCaps).
 */
 function mergeWithExistingDiagnostics(newDatabases, existingData) {
  if (!existingData?.databases) return newDatabases;

-  const diagnosticFields = [
-    'score', 'max_score', 'time_grains', 'joins', 'subqueries',
-    'supports_dynamic_schema', 'supports_catalog', 'supports_dynamic_catalog',
-    'ssh_tunneling', 'query_cancelation', 'supports_file_upload',
-    'user_impersonation', 'query_cost_estimation', 'sql_validation'
-  ];
+  // Only preserve fields that require Flask/runtime context to generate
+  const diagnosticFields = ['score', 'max_score', 'time_grains'];

  for (const [name, db] of Object.entries(newDatabases)) {
    const existingDb = existingData.databases[name];
    if (existingDb && existingDb.score > 0) {
-      // Preserve diagnostics from existing data
+      // Preserve score/time_grain diagnostics from existing data
      for (const field of diagnosticFields) {
        if (existingDb[field] !== undefined) {
          db[field] = existingDb[field];
@@ -879,7 +1110,7 @@ function mergeWithExistingDiagnostics(newDatabases, existingData) {

  const preserved = Object.values(newDatabases).filter(d => d.score > 0).length;
  if (preserved > 0) {
-    console.log(`Preserved diagnostics for ${preserved} databases from existing data`);
+    console.log(`Preserved score/time_grains for ${preserved} databases from existing data`);
  }

  return newDatabases;
@@ -927,6 +1158,12 @@ async function main() {
    databases = mergeWithExistingDiagnostics(databases, existingData);
  }

+  // For cap flags assigned via unevaluable expressions (e.g.
+  // `is_feature_enabled(...)`), prefer the value from a previously-generated
+  // JSON. Runs regardless of scores since it addresses static-analysis gaps,
+  // not missing Flask diagnostics. Always strips the internal marker.
+  databases = fallbackUnresolvedCaps(databases, existingData);
+
  // Extract and merge custom_errors for troubleshooting documentation
  const customErrors = extractCustomErrors();
  mergeCustomErrors(databases, customErrors);