feat(SIP-39): Async query support for charts (#11499)

* Generate JWT in Flask app

* Refactor chart data API query logic, add JWT validation and async worker

* Add redis stream implementation, refactoring

* Add chart data cache endpoint, refactor QueryContext caching

* Typing, linting, refactoring

* pytest fixes and openapi schema update

* Enforce that caching is configured for async query init

* Async query processing for explore_json endpoint

* Add /api/v1/async_event endpoint

* Async frontend for dashboards [WIP]

* Chart async error message support, refactoring

* Abstract asyncEvent middleware

* Async chart loading for Explore

* Pylint fixes

* asyncEvent middleware -> TypeScript, JS linting

* Chart data API: enforce forced_cache, add tests

* Add tests for explore_json endpoints

* Add test for chart data cache endpoint (no login)

* Consolidate set_and_log_cache and add STORE_CACHE_KEYS_IN_METADATA_DB flag

* Add tests for tasks/async_queries and address PR comments

* Bypass non-JSON result formats for async queries

* Add tests for redux middleware

* Remove debug statement

Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com>

* Skip force_cached if no queryObj

* SunburstViz: don't modify self.form_data

* Fix failing annotation test

* Resolve merge/lint issues

* Reduce polling delay

* Fix new getClientErrorObject reference

* Fix flaky unit tests

* /api/v1/async_event: increment redis stream ID, add tests

* PR feedback: refactoring, configuration

* Fixup: remove debugging

* Fix typescript errors due to redux upgrade

* Update UPDATING.md

* Fix failing py tests

* asyncEvent_spec.js -> asyncEvent_spec.ts

* Refactor flaky Python 3.7 mock assertions

* Fix another shared state issue in Py tests

* Use 'sub' claim in JWT for user_id

* Refactor async middleware config

* Fixup: restore FeatureFlag boolean type

Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com>
Author: Rob DiCiuccio
Date: 2020-12-10 20:21:56 -08:00
Committed by: GitHub
Parent: 0fdf026cbc
Commit: 4d329071a1

64 changed files with 2219 additions and 197 deletions
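
Per the UPDATING.md note in this change, async query execution is opt-in and requires both a configured cache backend and Redis. A minimal superset_config.py sketch (option names follow superset/config.py as introduced around this commit; hosts and secrets are placeholders, so verify before relying on them):

# Sketch: enabling SIP-39 async queries.
FEATURE_FLAGS = {"GLOBAL_ASYNC_QUERIES": True}

# Async events are brokered through Redis streams.
GLOBAL_ASYNC_QUERIES_REDIS_CONFIG = {
    "host": "127.0.0.1",
    "port": 6379,
    "db": 0,
    "password": "",
}

# JWTs scope the per-user event channel (the 'sub' claim carries the user_id).
GLOBAL_ASYNC_QUERIES_JWT_SECRET = "change-me-to-a-long-random-secret"

# Async init is enforced to fail fast unless real (non-null) caches are configured.
CACHE_CONFIG = {"CACHE_TYPE": "redis", "CACHE_REDIS_URL": "redis://127.0.0.1:6379/1"}
DATA_CACHE_CONFIG = {"CACHE_TYPE": "redis", "CACHE_REDIS_URL": "redis://127.0.0.1:6379/2"}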

superset/common/query_context.py

@@ -17,7 +17,7 @@
 import copy
 import logging
 import math
-from datetime import datetime, timedelta
+from datetime import timedelta
 from typing import Any, cast, ClassVar, Dict, List, Optional, Union
 
 import numpy as np
@@ -30,13 +30,17 @@ from superset.charts.dao import ChartDAO
 from superset.common.query_object import QueryObject
 from superset.connectors.base.models import BaseDatasource
 from superset.connectors.connector_registry import ConnectorRegistry
-from superset.exceptions import QueryObjectValidationError, SupersetException
+from superset.exceptions import (
+    CacheLoadError,
+    QueryObjectValidationError,
+    SupersetException,
+)
 from superset.extensions import cache_manager, security_manager
 from superset.stats_logger import BaseStatsLogger
 from superset.utils import core as utils
+from superset.utils.cache import generate_cache_key, set_and_log_cache
 from superset.utils.core import DTTM_ALIAS
 from superset.views.utils import get_viz
-from superset.viz import set_and_log_cache
 
 config = app.config
 stats_logger: BaseStatsLogger = config["STATS_LOGGER"]
@@ -78,6 +82,13 @@ class QueryContext:
         self.custom_cache_timeout = custom_cache_timeout
         self.result_type = result_type or utils.ChartDataResultType.FULL
         self.result_format = result_format or utils.ChartDataResultFormat.JSON
+        self.cache_values = {
+            "datasource": datasource,
+            "queries": queries,
+            "force": force,
+            "result_type": result_type,
+            "result_format": result_format,
+        }
 
     def get_query_result(self, query_object: QueryObject) -> Dict[str, Any]:
         """Returns a pandas dataframe based on the query object"""
@@ -142,8 +153,11 @@
         return df.to_dict(orient="records")
 
-    def get_single_payload(self, query_obj: QueryObject) -> Dict[str, Any]:
+    def get_single_payload(
+        self, query_obj: QueryObject, **kwargs: Any
+    ) -> Dict[str, Any]:
         """Returns a payload of metadata and data"""
+        force_cached = kwargs.get("force_cached", False)
         if self.result_type == utils.ChartDataResultType.QUERY:
             return {
                 "query": self.datasource.get_query_str(query_obj.to_dict()),
@@ -159,8 +173,7 @@
             query_obj.row_limit = min(row_limit, config["SAMPLES_ROW_LIMIT"])
             query_obj.row_offset = 0
             query_obj.columns = [o.column_name for o in self.datasource.columns]
-
-        payload = self.get_df_payload(query_obj)
+        payload = self.get_df_payload(query_obj, force_cached=force_cached)
         df = payload["df"]
         status = payload["status"]
         if status != utils.QueryStatus.FAILED:
@@ -186,9 +199,28 @@
             return {"data": payload["data"]}
         return payload
 
-    def get_payload(self) -> List[Dict[str, Any]]:
-        """Get all the payloads from the QueryObjects"""
-        return [self.get_single_payload(query_object) for query_object in self.queries]
+    def get_payload(self, **kwargs: Any) -> Dict[str, Any]:
+        cache_query_context = kwargs.get("cache_query_context", False)
+        force_cached = kwargs.get("force_cached", False)
+
+        # Get all the payloads from the QueryObjects
+        query_results = [
+            self.get_single_payload(query_object, force_cached=force_cached)
+            for query_object in self.queries
+        ]
+        return_value = {"queries": query_results}
+
+        if cache_query_context:
+            cache_key = self.cache_key()
+            set_and_log_cache(
+                cache_manager.cache,
+                cache_key,
+                {"data": self.cache_values},
+                self.cache_timeout,
+            )
+            return_value["cache_key"] = cache_key  # type: ignore
+
+        return return_value
 
     @property
     def cache_timeout(self) -> int:
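
Note the contract change: get_payload() now returns a dict rather than a bare list. A usage sketch (variable names illustrative):

# After this change, callers unpack the payload dict:
payload = query_context.get_payload(cache_query_context=True)
results = payload["queries"]        # per-QueryObject payloads (the old return value)
context_key = payload["cache_key"]  # "qc-..." key where cache_values was stored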
@@ -203,7 +235,22 @@
             return self.datasource.database.cache_timeout
         return config["CACHE_DEFAULT_TIMEOUT"]
 
-    def cache_key(self, query_obj: QueryObject, **kwargs: Any) -> Optional[str]:
+    def cache_key(self, **extra: Any) -> str:
+        """
+        The QueryContext cache key is made out of the key/values from
+        self.cache_values, plus any other key/values in `extra`. It includes only data
+        required to rehydrate a QueryContext object.
+        """
+        key_prefix = "qc-"
+        cache_dict = self.cache_values.copy()
+        cache_dict.update(extra)
+        return generate_cache_key(cache_dict, key_prefix)
+
+    def query_cache_key(self, query_obj: QueryObject, **kwargs: Any) -> Optional[str]:
+        """
+        Returns a QueryObject cache key for objects in self.queries
+        """
         extra_cache_keys = self.datasource.get_extra_cache_keys(query_obj.to_dict())
         cache_key = (
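
Caching is thus split into two levels; a short sketch of the distinction (hash suffixes invented):

# cache_key()        -> key for the QueryContext itself: enough metadata to
#                       rehydrate and re-run the whole context ("qc-" prefix)
# query_cache_key(q) -> key for a single QueryObject's result dataframe
context_key = query_context.cache_key()                   # e.g. "qc-5d41402abc..."
result_key = query_context.query_cache_key(query_object)  # None when query_object is falsy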
@@ -215,7 +262,7 @@
                 and self.datasource.is_rls_supported
                 else [],
                 changed_on=self.datasource.changed_on,
-                **kwargs
+                **kwargs,
             )
             if query_obj
             else None
@@ -298,12 +345,12 @@
         self, query_obj: QueryObject, **kwargs: Any
     ) -> Dict[str, Any]:
         """Handles caching around the df payload retrieval"""
-        cache_key = self.cache_key(query_obj, **kwargs)
+        force_cached = kwargs.get("force_cached", False)
+        cache_key = self.query_cache_key(query_obj)
         logger.info("Cache key: %s", cache_key)
         is_loaded = False
         stacktrace = None
         df = pd.DataFrame()
-        cached_dttm = datetime.utcnow().isoformat().split(".")[0]
         cache_value = None
         status = None
         query = ""
@@ -327,6 +374,12 @@
                 )
                 logger.info("Serving from cache")
 
+        if force_cached and not is_loaded:
+            logger.warning(
+                "force_cached (QueryContext): value not found for key %s", cache_key
+            )
+            raise CacheLoadError("Error loading data from cache")
+
         if query_obj and not is_loaded:
             try:
                 invalid_columns = [
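
CacheLoadError gives callers that must never fall through to the database a typed failure to catch. An illustrative consumer (the handling shown is a sketch, not necessarily the PR's exact endpoint code):

# Illustrative: serve only pre-computed async results; a cache miss becomes
# a client-visible error instead of a synchronous re-query.
from superset.exceptions import CacheLoadError

try:
    payload = query_context.get_payload(force_cached=True)
except CacheLoadError:
    # The async worker has not written this result yet, or it expired.
    payload = {"error": "Cached data not found"}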
@@ -367,13 +420,11 @@
         if is_loaded and cache_key and status != utils.QueryStatus.FAILED:
             set_and_log_cache(
-                cache_key=cache_key,
-                df=df,
-                query=query,
-                annotation_data=annotation_data,
-                cached_dttm=cached_dttm,
-                cache_timeout=self.cache_timeout,
-                datasource_uid=self.datasource.uid,
+                cache_manager.data_cache,
+                cache_key,
+                {"df": df, "query": query, "annotation_data": annotation_data},
+                self.cache_timeout,
+                self.datasource.uid,
             )
 
         return {
             "cache_key": cache_key,