feat(SIP-39): Async query support for charts (#11499)

* Generate JWT in Flask app

* Refactor chart data API query logic, add JWT validation and async worker

* Add redis stream implementation, refactoring

* Add chart data cache endpoint, refactor QueryContext caching

* Typing, linting, refactoring

* pytest fixes and openapi schema update

* Enforce that caching is configured for async query init

* Async query processing for explore_json endpoint

* Add /api/v1/async_event endpoint

* Async frontend for dashboards [WIP]

* Chart async error message support, refactoring

* Abstract asyncEvent middleware

* Async chart loading for Explore

* Pylint fixes

* asyncEvent middleware -> TypeScript, JS linting

* Chart data API: enforce forced_cache, add tests

* Add tests for explore_json endpoints

* Add test for chart data cache endpoint (no login)

* Consolidate set_and_log_cache and add STORE_CACHE_KEYS_IN_METADATA_DB flag

* Add tests for tasks/async_queries and address PR comments

* Bypass non-JSON result formats for async queries

* Add tests for redux middleware

* Remove debug statement

Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com>

* Skip force_cached if no queryObj

* SunburstViz: don't modify self.form_data

* Fix failing annotation test

* Resolve merge/lint issues

* Reduce polling delay

* Fix new getClientErrorObject reference

* Fix flaky unit tests

* /api/v1/async_event: increment redis stream ID, add tests

* PR feedback: refactoring, configuration

* Fixup: remove debugging

* Fix typescript errors due to redux upgrade

* Update UPDATING.md

* Fix failing py tests

* asyncEvent_spec.js -> asyncEvent_spec.ts

* Refactor flaky Python 3.7 mock assertions

* Fix another shared state issue in Py tests

* Use 'sub' claim in JWT for user_id

* Refactor async middleware config

* Fixup: restore FeatureFlag boolean type

Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com>
Author: Rob DiCiuccio
Date: 2020-12-10 20:21:56 -08:00
Committed by: GitHub
Parent: 0fdf026cbc
Commit: 4d329071a1

64 changed files with 2219 additions and 197 deletions
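
Per the UPDATING.md note in this change, async query execution is opt-in and requires both a configured cache backend and Redis. A minimal superset_config.py sketch (option names follow superset/config.py as introduced around this commit; hosts and secrets are placeholders, so verify before relying on them):

# Sketch: enabling SIP-39 async queries.
FEATURE_FLAGS = {"GLOBAL_ASYNC_QUERIES": True}

# Async events are brokered through Redis streams.
GLOBAL_ASYNC_QUERIES_REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "db": 0,
    "password": "",
}

# JWTs scope the per-user event channel (the 'sub' claim carries the user_id).
GLOBAL_ASYNC_QUERIES_JWT_SECRET = "change-me-to-a-long-random-secret"

# Async init is enforced to fail fast unless real (non-null) caches are configured.
CACHE_CONFIG = {"CACHE_TYPE": "redis", "CACHE_REDIS_URL": "redis://127.0.0.1:6379/1"}
DATA_CACHE_CONFIG = {"CACHE_TYPE": "redis", "CACHE_REDIS_URL": "redis://127.0.0.1:6379/2"}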

superset/common/query_context.py

@@ -17,7 +17,7 @@
 import copy
 import logging
 import math
-from datetime import datetime, timedelta
+from datetime import timedelta
 from typing import Any, cast, ClassVar, Dict, List, Optional, Union
 
 import numpy as np
@@ -30,13 +30,17 @@ from superset.charts.dao import ChartDAO
 from superset.common.query_object import QueryObject
 from superset.connectors.base.models import BaseDatasource
 from superset.connectors.connector_registry import ConnectorRegistry
-from superset.exceptions import QueryObjectValidationError, SupersetException
+from superset.exceptions import (
+    CacheLoadError,
+    QueryObjectValidationError,
+    SupersetException,
+)
 from superset.extensions import cache_manager, security_manager
 from superset.stats_logger import BaseStatsLogger
 from superset.utils import core as utils
+from superset.utils.cache import generate_cache_key, set_and_log_cache
 from superset.utils.core import DTTM_ALIAS
 from superset.views.utils import get_viz
-from superset.viz import set_and_log_cache
 
 config = app.config
 stats_logger: BaseStatsLogger = config["STATS_LOGGER"]
@@ -78,6 +82,13 @@ class QueryContext:
         self.custom_cache_timeout = custom_cache_timeout
         self.result_type = result_type or utils.ChartDataResultType.FULL
         self.result_format = result_format or utils.ChartDataResultFormat.JSON
+        self.cache_values = {
+            "datasource": datasource,
+            "queries": queries,
+            "force": force,
+            "result_type": result_type,
+            "result_format": result_format,
+        }
 
     def get_query_result(self, query_object: QueryObject) -> Dict[str, Any]:
         """Returns a pandas dataframe based on the query object"""
@@ -142,8 +153,11 @@
         return df.to_dict(orient="records")
 
-    def get_single_payload(self, query_obj: QueryObject) -> Dict[str, Any]:
+    def get_single_payload(
+        self, query_obj: QueryObject, **kwargs: Any
+    ) -> Dict[str, Any]:
         """Returns a payload of metadata and data"""
+        force_cached = kwargs.get("force_cached", False)
         if self.result_type == utils.ChartDataResultType.QUERY:
             return {
                 "query": self.datasource.get_query_str(query_obj.to_dict()),
@@ -159,8 +173,7 @@
             query_obj.row_limit = min(row_limit, config["SAMPLES_ROW_LIMIT"])
             query_obj.row_offset = 0
             query_obj.columns = [o.column_name for o in self.datasource.columns]
-
-        payload = self.get_df_payload(query_obj)
+        payload = self.get_df_payload(query_obj, force_cached=force_cached)
         df = payload["df"]
         status = payload["status"]
         if status != utils.QueryStatus.FAILED:
@@ -186,9 +199,28 @@
             return {"data": payload["data"]}
         return payload
 
-    def get_payload(self) -> List[Dict[str, Any]]:
-        """Get all the payloads from the QueryObjects"""
-        return [self.get_single_payload(query_object) for query_object in self.queries]
+    def get_payload(self, **kwargs: Any) -> Dict[str, Any]:
+        cache_query_context = kwargs.get("cache_query_context", False)
+        force_cached = kwargs.get("force_cached", False)
+
+        # Get all the payloads from the QueryObjects
+        query_results = [
+            self.get_single_payload(query_object, force_cached=force_cached)
+            for query_object in self.queries
+        ]
+        return_value = {"queries": query_results}
+
+        if cache_query_context:
+            cache_key = self.cache_key()
+            set_and_log_cache(
+                cache_manager.cache,
+                cache_key,
+                {"data": self.cache_values},
+                self.cache_timeout,
+            )
+            return_value["cache_key"] = cache_key  # type: ignore
+
+        return return_value
 
     @property
     def cache_timeout(self) -> int:
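
Note the contract change: get_payload() now returns a dict rather than a bare list. A usage sketch (variable names illustrative):

# After this change, callers unpack the payload dict:
payload = query_context.get_payload(cache_query_context=True)
results = payload["queries"]        # per-QueryObject payloads (the old return value)
context_key = payload["cache_key"]  # "qc-..." key where cache_values was stored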
@@ -203,7 +235,22 @@
             return self.datasource.database.cache_timeout
         return config["CACHE_DEFAULT_TIMEOUT"]
 
-    def cache_key(self, query_obj: QueryObject, **kwargs: Any) -> Optional[str]:
+    def cache_key(self, **extra: Any) -> str:
+        """
+        The QueryContext cache key is made out of the key/values from
+        self.cache_values, plus any other key/values in `extra`. It includes only data
+        required to rehydrate a QueryContext object.
+        """
+        key_prefix = "qc-"
+        cache_dict = self.cache_values.copy()
+        cache_dict.update(extra)
+        return generate_cache_key(cache_dict, key_prefix)
+
+    def query_cache_key(self, query_obj: QueryObject, **kwargs: Any) -> Optional[str]:
+        """
+        Returns a QueryObject cache key for objects in self.queries
+        """
         extra_cache_keys = self.datasource.get_extra_cache_keys(query_obj.to_dict())
         cache_key = (
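
Caching is thus split into two levels; a short sketch of the distinction (hash suffixes invented):

# cache_key()        -> key for the QueryContext itself: enough metadata to
#                       rehydrate and re-run the whole context ("qc-" prefix)
# query_cache_key(q) -> key for a single QueryObject's result dataframe
context_key = query_context.cache_key()                   # e.g. "qc-5d41402abc..."
result_key = query_context.query_cache_key(query_object)  # None when query_object is falsy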
@@ -215,7 +262,7 @@
                 and self.datasource.is_rls_supported
                 else [],
                 changed_on=self.datasource.changed_on,
-                **kwargs
+                **kwargs,
             )
             if query_obj
             else None
@@ -298,12 +345,12 @@
         self, query_obj: QueryObject, **kwargs: Any
     ) -> Dict[str, Any]:
         """Handles caching around the df payload retrieval"""
-        cache_key = self.cache_key(query_obj, **kwargs)
+        force_cached = kwargs.get("force_cached", False)
+        cache_key = self.query_cache_key(query_obj)
         logger.info("Cache key: %s", cache_key)
         is_loaded = False
         stacktrace = None
         df = pd.DataFrame()
-        cached_dttm = datetime.utcnow().isoformat().split(".")[0]
         cache_value = None
         status = None
         query = ""
@@ -327,6 +374,12 @@
                 )
                 logger.info("Serving from cache")
 
+        if force_cached and not is_loaded:
+            logger.warning(
+                "force_cached (QueryContext): value not found for key %s", cache_key
+            )
+            raise CacheLoadError("Error loading data from cache")
+
         if query_obj and not is_loaded:
             try:
                 invalid_columns = [
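
CacheLoadError gives callers that must never fall through to the database a typed failure to catch. An illustrative consumer (the handling shown is a sketch, not necessarily the PR's exact endpoint code):

# Illustrative: serve only pre-computed async results; a cache miss becomes
# a client-visible error instead of a synchronous re-query.
from superset.exceptions import CacheLoadError

try:
    payload = query_context.get_payload(force_cached=True)
except CacheLoadError:
    # The async worker has not written this result yet, or it expired.
    payload = {"error": "Cached data not found"}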
@@ -367,13 +420,11 @@
         if is_loaded and cache_key and status != utils.QueryStatus.FAILED:
             set_and_log_cache(
-                cache_key=cache_key,
-                df=df,
-                query=query,
-                annotation_data=annotation_data,
-                cached_dttm=cached_dttm,
-                cache_timeout=self.cache_timeout,
-                datasource_uid=self.datasource.uid,
+                cache_manager.data_cache,
+                cache_key,
+                {"df": df, "query": query, "annotation_data": annotation_data},
+                self.cache_timeout,
+                self.datasource.uid,
             )
 
         return {
             "cache_key": cache_key,