feat: add global task framework (#36368)

2026-04-20 00:24:38 +00:00 · 2026-02-09 10:45:56 -08:00
parent 6984e93171
commit 59dd2fa385
89 changed files with 15535 additions and 291 deletions
--- a/superset/tasks/context.py
+++ b/superset/tasks/context.py
@@ -0,0 +1,673 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Concrete TaskContext implementation for GTF"""
+
+import logging
+import threading
+import time
+import traceback
+from typing import Any, Callable, cast, TYPE_CHECKING, TypeVar
+
+from flask import current_app
+from superset_core.api.tasks import (
+    TaskContext as CoreTaskContext,
+    TaskProperties,
+    TaskStatus,
+)
+
+from superset.stats_logger import BaseStatsLogger
+from superset.tasks.constants import ABORT_STATES
+from superset.tasks.utils import progress_update
+
+if TYPE_CHECKING:
+    from superset.models.tasks import Task
+    from superset.tasks.manager import AbortListener
+
+logger = logging.getLogger(__name__)
+
+T = TypeVar("T")
+
+
+class TaskContext(CoreTaskContext):
+    """
+    Concrete implementation of TaskContext for the Global Async Task Framework.
+
+    Provides write-only access to task state. Tasks use this context to update
+    their progress and payload, and check for cancellation. Tasks should not
+    need to read their own state - they are the source of state, not consumers.
+    """
+
+    # Type alias for handler failures: (handler_type, exception, stack_trace)
+    HandlerFailure = tuple[str, Exception, str]
+
+    def __init__(self, task: "Task") -> None:
+        """
+        Build a context around a Task entity that the caller already loaded.
+
+        The executor is expected to fetch the task once and pass it in, so the
+        context starts from a single initial read instead of querying again.
+
+        :param task: Pre-fetched Task entity (required)
+        """
+        # Identity plus the cached entity. The entity reference is replaced
+        # only by _refresh_task() when external state has to be observed.
+        self._task_uuid = task.uuid
+        self._task: "Task" = task
+
+        # Handler registries
+        self._cleanup_handlers: list[Callable[[], None]] = []
+        self._abort_handlers: list[Callable[[], None]] = []
+
+        # Failures raised by handlers, collected for one unified report
+        self._handler_failures: list[TaskContext.HandlerFailure] = []
+
+        # Abort bookkeeping
+        self._abort_listener: "AbortListener | None" = None
+        self._abort_detected = False
+        self._abort_handlers_completed = False  # True once no abort handler failed
+        self._execution_completed = False  # Flipped by mark_execution_completed()
+
+        # Timeout timer bookkeeping
+        self._timeout_timer: threading.Timer | None = None
+        self._timeout_triggered = False
+
+        # Throttling bookkeeping for update_task(): enforces a minimum
+        # interval between DB writes.
+        self._throttle_lock = threading.Lock()
+        self._last_db_write_time: float | None = None
+        self._has_pending_updates: bool = False
+        self._deferred_flush_timer: threading.Timer | None = None
+
+        # Working copies of properties/payload, authoritative while the task
+        # runs. They are shallow copies so that local edits never mutate the
+        # dict instances cached on the Task entity.
+        self._properties_cache: TaskProperties = cast(
+            TaskProperties, {**task.properties_dict}
+        )
+        self._payload_cache: dict[str, Any] = {**task.payload_dict}
+
+        # Keep the real Flask app (not the proxy) so background threads can
+        # push their own app context, and cache the stats logger so config is
+        # not consulted on every metric.
+        try:
+            app = current_app._get_current_object()
+            stats_logger: BaseStatsLogger = current_app.config.get(
+                "STATS_LOGGER", BaseStatsLogger()
+            )
+        except RuntimeError:
+            # No application context is active (e.g., tests)
+            app = None
+            stats_logger = BaseStatsLogger()
+
+        self._app = app
+        self._stats_logger: BaseStatsLogger = stats_logger
+
+    def _refresh_task(self) -> "Task":
+        """
+        Reload the task entity from the database and resync local state.
+
+        Intended for the moments when state changed by someone else must be
+        observed (for example an abort issued by a concurrent operation).
+
+        Effects:
+        - Looks the task up again by UUID through TaskDAO
+        - Replaces the cached _task reference with the result
+        - Rebuilds the properties/payload caches from the reloaded entity
+
+        :returns: The freshly loaded task entity
+        :raises ValueError: If no task with this UUID exists
+        """
+        from superset.daos.tasks import TaskDAO
+
+        reloaded = TaskDAO.find_one_or_none(uuid=self._task_uuid)
+        if not reloaded:
+            raise ValueError(f"Task {self._task_uuid} not found")
+
+        # Shallow-copy the dicts so later local edits do not touch the
+        # instances cached on the entity.
+        properties = cast(TaskProperties, {**reloaded.properties_dict})
+        payload = {**reloaded.payload_dict}
+
+        self._task = reloaded
+        self._properties_cache = properties
+        self._payload_cache = payload
+
+        return self._task
+
+    def update_task(
+        self,
+        progress: float | int | tuple[int, int] | None = None,
+        payload: dict[str, object] | None = None,
+    ) -> None:
+        """
+        Record new progress and/or payload for the task.
+
+        Both arguments are optional. The in-memory caches are updated right
+        away; the database write is throttled by
+        TASK_PROGRESS_UPDATE_THROTTLE_INTERVAL so chatty tasks do not flood
+        the database. Payload is merged into the cached payload.
+
+        Accepted progress forms:
+        - float (0.0-1.0): percentage only, e.g., 0.5 means 50%
+        - int: count only (total unknown), e.g., 42 means "42 items processed"
+        - tuple[int, int]: count and total, e.g., (3, 100) means "3 of 100";
+          the percentage is derived from count/total.
+
+        :param progress: Progress value, or None to leave unchanged
+        :param payload: Payload data to merge (dict), or None to leave unchanged
+        """
+        dirty = False
+
+        if progress is not None:
+            derived_props = progress_update(progress)
+            if not derived_props:
+                # progress_update returns an empty dict for unsupported input
+                logger.warning(
+                    "Invalid progress value for task %s: %s "
+                    "(expected float, int, or tuple[int, int])",
+                    self._task_uuid,
+                    progress,
+                )
+            else:
+                self._properties_cache.update(derived_props)
+                dirty = True
+
+        if payload is not None:
+            self._payload_cache.update(payload)
+            dirty = True
+
+        if not dirty:
+            return
+
+        interval = current_app.config["TASK_PROGRESS_UPDATE_THROTTLE_INTERVAL"]
+
+        # A non-positive interval disables throttling entirely
+        if interval <= 0:
+            self._write_to_db()
+            return
+
+        with self._throttle_lock:
+            now = time.time()
+            last_write = self._last_db_write_time
+
+            if last_write is None:
+                # Very first update is persisted immediately
+                self._write_to_db()
+                self._last_db_write_time = now
+                return
+
+            elapsed = now - last_write
+            if elapsed >= interval:
+                # Throttle window is over: drop any scheduled flush and write
+                self._cancel_deferred_flush_timer()
+                self._write_to_db()
+                self._last_db_write_time = now
+                self._has_pending_updates = False
+                return
+
+            # Still inside the throttle window: remember there is unsaved
+            # state and make sure a flush is scheduled for the window's end.
+            self._has_pending_updates = True
+            self._stats_logger.incr("gtf.task.update_deferred")
+
+            if self._deferred_flush_timer is None:
+                flush_timer = threading.Timer(
+                    interval - elapsed, self._deferred_flush
+                )
+                self._deferred_flush_timer = flush_timer
+                flush_timer.daemon = True
+                flush_timer.start()
+
+    def _write_to_db(self) -> None:
+        """
+        Persist the cached properties and payload.
+
+        Runs InternalUpdateTaskCommand with whatever the caches contain at
+        the moment of the call and bumps the "gtf.task.update_write" counter.
+        """
+        from superset.commands.tasks.internal_update import InternalUpdateTaskCommand
+
+        self._stats_logger.incr("gtf.task.update_write")
+
+        command = InternalUpdateTaskCommand(
+            task_uuid=self._task_uuid,
+            properties=self._properties_cache,
+            payload=self._payload_cache,
+        )
+        command.run()
+
+    def _deferred_flush(self) -> None:
+        """
+        Timer callback that flushes pending updates at end of throttle window.
+
+        This ensures the UI never shows stale progress for longer than the
+        throttle interval.
+
+        The callback runs on a ``threading.Timer`` thread, so an exception
+        raised by the write would otherwise surface only through the default
+        thread excepthook. Failures are therefore logged here, and the pending
+        flag is left set so that the next update_task() call or the flush in
+        _run_cleanup() writes the state again.
+        """
+        with self._throttle_lock:
+            # The timer has fired; clear the slot so a new one can be scheduled
+            self._deferred_flush_timer = None
+
+            if not self._has_pending_updates:
+                return
+
+            try:
+                # Need app context for DB operations in timer thread
+                if self._app:
+                    with self._app.app_context():
+                        self._write_to_db()
+                else:
+                    self._write_to_db()
+            except Exception:  # keep the timer thread from dying silently
+                logger.exception(
+                    "Deferred flush failed for task %s; update remains pending",
+                    self._task_uuid,
+                )
+                return
+
+            self._last_db_write_time = time.time()
+            self._has_pending_updates = False
+
+    def _cancel_deferred_flush_timer(self) -> None:
+        """Cancel a scheduled deferred flush, if any, and clear the slot."""
+        scheduled = self._deferred_flush_timer
+        if scheduled is None:
+            return
+        scheduled.cancel()
+        self._deferred_flush_timer = None
+
+    def on_cleanup(self, handler: Callable[[], None]) -> Callable[[], None]:
+        """
+        Add a callback to run once the task is over.
+
+        Registered callbacks are invoked by _run_cleanup() regardless of how
+        the task ended (success, error, or abort). They run in LIFO order,
+        i.e. the most recently registered callback runs first.
+
+        Decorator form:
+            @ctx.on_cleanup
+            def cleanup():
+                logger.info("Task ended")
+
+        Direct call:
+            ctx.on_cleanup(lambda: logger.info("Task ended"))
+
+        :param handler: Cleanup function to register
+        :returns: The same handler, so the method works as a decorator
+        """
+        registry = self._cleanup_handlers
+        registry.append(handler)
+        return handler
+
+    def on_abort(self, handler: Callable[[], None]) -> Callable[[], None]:
+        """
+        Add a callback to run when an abort of this task is detected.
+
+        Registering the very first callback additionally:
+        1. Marks the task as abortable (is_abortable=true is written to the DB)
+        2. Starts the background abort listener, using
+           TASK_ABORT_POLLING_DEFAULT_INTERVAL as its polling interval
+
+        :param handler: Callback function to execute when abort is detected
+        :returns: The same handler, so the method works as a decorator
+
+        Example:
+            @ctx.on_abort
+            def handle_abort():
+                logger.info("Task was aborted!")
+                cleanup_partial_work()
+
+        Note:
+            The handler executes in a background thread when abort is detected.
+            The task code continues running unless the handler does something
+            to stop it (e.g., raises an exception, modifies shared state, etc.)
+        """
+        already_had_handlers = bool(self._abort_handlers)
+        self._abort_handlers.append(handler)
+
+        if already_had_handlers:
+            # Abortable flag and listener were set up by the first registration
+            return handler
+
+        self._set_abortable()
+        poll_interval = current_app.config["TASK_ABORT_POLLING_DEFAULT_INTERVAL"]
+        self._start_abort_listener(poll_interval)
+        return handler
+
+    def _set_abortable(self) -> None:
+        """Flag the task as abortable in the local cache and in the database."""
+        from superset.commands.tasks.internal_update import InternalUpdateTaskCommand
+
+        # Cache first, then persist the cached properties (payload is not sent)
+        self._properties_cache["is_abortable"] = True
+        command = InternalUpdateTaskCommand(
+            task_uuid=self._task_uuid,
+            properties=self._properties_cache,
+        )
+        command.run()
+
+    def _start_abort_listener(self, interval: float) -> None:
+        """
+        Ask TaskManager to watch for an abort of this task.
+
+        Does nothing when a listener is already attached. The transport
+        (the original notes Redis pub/sub with a database-polling fallback)
+        is chosen inside TaskManager.listen_for_abort; this method only
+        supplies the callback, polling interval, and Flask app.
+
+        :param interval: Polling interval in seconds handed to TaskManager
+        """
+        if self._abort_listener is None:
+            from superset.tasks.manager import TaskManager
+
+            listener = TaskManager.listen_for_abort(
+                task_uuid=self._task_uuid,
+                callback=self._on_abort_detected,
+                poll_interval=interval,
+                app=self._app,
+            )
+            self._abort_listener = listener
+
+    def _on_abort_detected(self) -> None:
+        """
+        React to an abort notification.
+
+        Passed to TaskManager as the listener callback and also invoked by the
+        timeout timer. Runs the registered abort handlers at most once, and
+        not at all when the task's work has already finished.
+        """
+        if self._abort_detected:
+            # A previous notification has been processed already
+            return
+
+        if not self._execution_completed:
+            self._abort_detected = True
+            logger.info("Abort detected for task %s", self._task_uuid)
+            self._trigger_abort_handlers()
+            return
+
+        # Late abort race: mark_execution_completed() was called before this
+        # notification arrived, so the handlers are skipped.
+        logger.info(
+            "Abort detected for task %s but execution already completed",
+            self._task_uuid,
+        )
+
+    def mark_execution_completed(self) -> None:
+        """
+        Record that the task function itself has finished.
+
+        Meant to be called by the executor once the task function has returned
+        or raised. From then on _on_abort_detected() ignores abort
+        notifications instead of running abort handlers; cleanup handlers are
+        unaffected by this flag.
+        """
+        self._execution_completed = True
+
+    def start_abort_polling(self, interval: float | None = None) -> None:
+        """
+        Start the background abort listener.
+
+        Retained for backwards compatibility; the work is done by
+        _start_abort_listener, which goes through TaskManager.
+
+        :param interval: Polling interval in seconds; when None the value of
+            TASK_ABORT_POLLING_DEFAULT_INTERVAL from the app config is used
+        """
+        effective_interval = (
+            current_app.config["TASK_ABORT_POLLING_DEFAULT_INTERVAL"]
+            if interval is None
+            else interval
+        )
+        self._start_abort_listener(effective_interval)
+
+    def _trigger_abort_handlers(self) -> None:
+        """
+        Run every registered abort handler, newest first.
+
+        Each handler is attempted even when an earlier one raised. A failure
+        is logged and recorded in self._handler_failures as
+        ("abort", exception, stack trace); nothing is written to the database
+        here. The collected failures are persisted later by _run_cleanup(),
+        together with any cleanup handler failures.
+
+        Sets _abort_handlers_completed when no abort failure is on record
+        after the run.
+        """
+        for callback in reversed(self._abort_handlers):
+            try:
+                callback()
+            except Exception as ex:
+                formatted_trace = traceback.format_exc()
+                logger.error(
+                    "Abort handler failed for task %s: %s",
+                    self._task_uuid,
+                    str(ex),
+                    exc_info=True,
+                )
+                self._handler_failures.append(("abort", ex, formatted_trace))
+
+        # Success means the failure list holds no entry tagged "abort"
+        has_abort_failure = any(
+            failure[0] == "abort" for failure in self._handler_failures
+        )
+        if not has_abort_failure:
+            self._abort_handlers_completed = True
+
+    def _write_handler_failures_to_db(self) -> None:
+        """
+        Write collected handler failures to the database.
+
+        Combines all failures (abort + cleanup) into a single error record
+        and marks the task as FAILURE. If the task already carries an error
+        (e.g., the task function threw an exception), the handler failures
+        are APPENDED to it so the original error context is preserved.
+
+        Without a stored Flask app nothing can be persisted; in that case the
+        failures are reported through the logger before the list is cleared,
+        instead of being dropped silently. The failure list is cleared after
+        a successful write; if the write raises, the exception propagates and
+        the list is left intact.
+        """
+        failures = self._handler_failures
+        if not failures:
+            return
+
+        # Build error message from all handler failures
+        error_messages = [str(ex) for _, ex, _ in failures]
+        handler_types = {htype for htype, _, _ in failures}
+
+        if len(failures) == 1:
+            htype, ex, handler_stack_trace = failures[0]
+            handler_error_msg = (
+                f"{htype.capitalize()} handler failed: {error_messages[0]}"
+            )
+            handler_exception_type = type(ex).__name__
+        else:
+            # Multiple failures
+            handler_error_msg = f"Handler(s) failed: {'; '.join(error_messages)}"
+            if handler_types == {"abort"}:
+                handler_exception_type = "MultipleAbortHandlerFailures"
+            elif handler_types == {"cleanup"}:
+                handler_exception_type = "MultipleCleanupHandlerFailures"
+            else:
+                handler_exception_type = "MultipleHandlerFailures"
+
+            # Combine stack traces with clear separators
+            handler_stack_trace = "\n--- Next handler failure ---\n".join(
+                f"[{htype}:{type(ex).__name__}]\n{trace}"
+                for htype, ex, trace in failures
+            )
+
+        if self._app:
+            # Imported lazily, only on the path that actually writes
+            from superset.commands.tasks.update import UpdateTaskCommand
+
+            with self._app.app_context():
+                # Check if task already has an error (preserve original context)
+                # NOTE(review): this reads the cached entity; it only sees an
+                # error recorded by the executor if the entity was refreshed
+                # after that write - confirm against the caller.
+                task = self._task
+                original_error = task.properties_dict.get("error_message")
+                original_type = task.properties_dict.get("exception_type")
+                original_trace = task.properties_dict.get("stack_trace")
+
+                if original_error:
+                    # Append handler failures to original error
+                    error_msg = f"{original_error} | {handler_error_msg}"
+                    exception_type = (
+                        f"{original_type}+{handler_exception_type}"
+                        if original_type
+                        else handler_exception_type
+                    )
+                    stack_trace = (
+                        f"{original_trace}\n\n"
+                        f"=== Handler failures during cleanup ===\n\n"
+                        f"{handler_stack_trace}"
+                        if original_trace
+                        else handler_stack_trace
+                    )
+                else:
+                    # No original error, just use handler failures
+                    error_msg = handler_error_msg
+                    exception_type = handler_exception_type
+                    stack_trace = handler_stack_trace
+
+                # Update task with combined error info
+                UpdateTaskCommand(
+                    self._task_uuid,
+                    status=TaskStatus.FAILURE.value,
+                    properties={
+                        "error_message": error_msg,
+                        "exception_type": exception_type,
+                        "stack_trace": stack_trace,
+                    },
+                    skip_security_check=True,
+                ).run()
+        else:
+            # No app means no way to persist; at least leave a trace in logs
+            logger.warning(
+                "Handler failures for task %s could not be persisted "
+                "(no app available): %s",
+                self._task_uuid,
+                handler_error_msg,
+            )
+
+        # Clear failures after writing (or after reporting them via the log)
+        self._handler_failures = []
+
+    def stop_abort_polling(self) -> None:
+        """Stop the background abort listener, if one is attached."""
+        listener = self._abort_listener
+        if listener is None:
+            return
+        listener.stop()
+        self._abort_listener = None
+
+    def start_timeout_timer(self, timeout_seconds: int) -> None:
+        """
+        Start a timeout timer that triggers abort when elapsed.
+
+        Called by execute_task when task transitions to IN_PROGRESS.
+        The timer only moves the task to ABORTING and triggers abort handlers
+        if the task is abortable; otherwise a warning is logged and the task
+        keeps running. Calling this again while a timer exists is a no-op.
+
+        :param timeout_seconds: Timeout duration in seconds
+        """
+        if self._timeout_timer is not None:
+            return  # Already started
+
+        def on_timeout() -> None:
+            if self._abort_detected:
+                return  # Already aborting
+
+            # Late timeout race: the timer may fire after the task function
+            # has finished but before stop_timeout_timer() runs. Writing
+            # ABORTING for finished work would be wrong, and
+            # _on_abort_detected() would refuse to run handlers anyway.
+            if self._execution_completed:
+                return
+
+            self._timeout_triggered = True
+
+            # DB writes below require an app context
+            if not self._app:
+                logger.error(
+                    "Timeout fired for task %s but no app context available",
+                    self._task_uuid,
+                )
+                return
+
+            with self._app.app_context():
+                from superset.commands.tasks.update import UpdateTaskCommand
+
+                # Read the flag from the in-memory properties cache: that is
+                # where _set_abortable() records it. The cached Task entity
+                # is not touched by _set_abortable(), so it would miss
+                # handlers registered while the task is running.
+                if self._properties_cache.get("is_abortable", False):
+                    logger.info(
+                        "Timeout reached for task %s after %d seconds - "
+                        "transitioning to ABORTING and triggering abort handlers",
+                        self._task_uuid,
+                        timeout_seconds,
+                    )
+                    # Set status to ABORTING (same as user abort)
+                    # The executor will determine TIMED_OUT vs FAILURE based on
+                    # whether handlers complete successfully
+                    UpdateTaskCommand(
+                        self._task_uuid,
+                        status=TaskStatus.ABORTING.value,
+                        properties={"error_message": "Task timed out"},
+                        skip_security_check=True,
+                    ).run()
+
+                    # Trigger abort handlers for cleanup
+                    self._on_abort_detected()
+                else:
+                    # No abort handler - just log warning
+                    logger.warning(
+                        "Timeout reached for task %s after %d seconds, but no "
+                        "abort handler is registered. Task will continue running.",
+                        self._task_uuid,
+                        timeout_seconds,
+                    )
+
+        self._timeout_timer = threading.Timer(timeout_seconds, on_timeout)
+        # Timer is daemon so it won't prevent process exit. If the worker dies,
+        # the task is already in an inconsistent state (stuck IN_PROGRESS) that
+        # requires external recovery (orphan detection). A non-daemon timer with
+        # long timeouts (hours) would block graceful worker shutdown.
+        self._timeout_timer.daemon = True
+        self._timeout_timer.start()
+        logger.debug(
+            "Started timeout timer for task %s: %d seconds",
+            self._task_uuid,
+            timeout_seconds,
+        )
+
+    def stop_timeout_timer(self) -> None:
+        """Cancel the timeout timer, if one was started, and forget it."""
+        timer = self._timeout_timer
+        if timer is None:
+            return
+        timer.cancel()
+        self._timeout_timer = None
+
+    @property
+    def timeout_triggered(self) -> bool:
+        """Whether the timeout timer callback has marked the task as timed out."""
+        return self._timeout_triggered
+
+    @property
+    def abort_handlers_completed(self) -> bool:
+        """Whether the abort handlers ran with no abort failure recorded."""
+        return self._abort_handlers_completed
+
+    def _run_cleanup(self) -> None:
+        """
+        Run cleanup handlers (called by executor in finally block).
+
+        Steps, in order:
+        1. Flush any pending throttled update so the final state is persisted.
+           A failing flush is logged and does not stop the remaining steps.
+        2. Stop the abort listener and the timeout timer.
+        3. If no abort was detected during execution, reload the task and run
+           the abort handlers when its status is one of ABORT_STATES (the
+           task may have ended before the listener noticed the abort).
+        4. Run all cleanup handlers in LIFO order (always).
+
+        All handler failures (abort + cleanup) are collected and written to DB
+        as a unified error record at the end.
+        """
+        # Flush any pending throttled updates before cleanup. The flush is
+        # best-effort: if it raised out of here, the listener/timer would be
+        # left running and no cleanup handler would execute.
+        with self._throttle_lock:
+            self._cancel_deferred_flush_timer()
+            if self._has_pending_updates:
+                try:
+                    self._write_to_db()
+                    self._has_pending_updates = False
+                except Exception:
+                    logger.exception(
+                        "Could not flush pending updates during cleanup "
+                        "for task %s",
+                        self._task_uuid,
+                    )
+
+        # Stop abort listener and timeout timer
+        self.stop_abort_polling()
+        self.stop_timeout_timer()
+
+        # If aborting/aborted but handlers haven't run yet, run them now
+        # (This catches the case where task ended before listener detected abort)
+        if not self._abort_detected:
+            if self._app:
+                with self._app.app_context():
+                    if self._abort_requested_externally():
+                        self._trigger_abort_handlers()
+            elif self._abort_requested_externally():
+                # Fallback without app context
+                self._trigger_abort_handlers()
+
+        # Always run cleanup handlers, collecting failures
+        for handler in reversed(self._cleanup_handlers):
+            try:
+                handler()
+            except Exception as ex:
+                stack_trace = traceback.format_exc()
+                logger.error(
+                    "Cleanup handler failed for task %s: %s",
+                    self._task_uuid,
+                    str(ex),
+                    exc_info=True,
+                )
+                self._handler_failures.append(("cleanup", ex, stack_trace))
+
+        # Write all collected failures (abort + cleanup) to DB as unified record
+        if self._handler_failures:
+            self._write_handler_failures_to_db()
+
+    def _abort_requested_externally(self) -> bool:
+        """
+        Tell whether the task's current status is one of ABORT_STATES.
+
+        The abort status is set outside this context, so the entity is
+        reloaded via _refresh_task() rather than trusting the copy cached at
+        construction time. If the reload fails, a warning is logged and the
+        cached entity's status is used instead.
+        """
+        try:
+            task = self._refresh_task()
+        except Exception as ex:
+            logger.warning(
+                "Could not check abort status during cleanup for task %s: %s",
+                self._task_uuid,
+                str(ex),
+            )
+            task = self._task
+        return task.status in ABORT_STATES