feat: Add post processing to QueryObject (#9427)

* Add post processing to QueryObject

* Simplify sort signature and require explicit sort order

* Add new operations and unit tests

* linting

* Address comments

* Simplify test method names

* Address comments

* Linting

* remove unnecessary logic

* Apply strict whitelisting to all getattr calls

* Add checking of rolling_type_options and add/improve docs
This commit is contained in:
Ville Brofeldt
2020-04-10 20:50:11 +03:00
committed by GitHub
parent 5ec0192bcc
commit a8ce3bccdf
9 changed files with 899 additions and 12 deletions

View File

@@ -51,7 +51,7 @@ class QueryContext:
custom_cache_timeout: Optional[int]
# TODO: Type datasource and query_object dictionary with TypedDict when it becomes
# a vanilla python type https://github.com/python/mypy/issues/5288
# a vanilla python type https://github.com/python/mypy/issues/5288
def __init__(
self,
datasource: Dict[str, Any],
@@ -70,8 +70,8 @@ class QueryContext:
"""Returns a pandas dataframe based on the query object"""
# Here, we assume that all the queries will use the same datasource, which is
# is a valid assumption for current setting. In a long term, we may or maynot
# support multiple queries from different data source.
# a valid assumption for current setting. In the long term, we may
# support multiple queries from different data sources.
timestamp_format = None
if self.datasource.type == "table":
@@ -105,6 +105,9 @@ class QueryContext:
self.df_metrics_to_num(df, query_object)
df.replace([np.inf, -np.inf], np.nan)
df = query_object.exec_post_processing(df)
return {
"query": result.query,
"status": result.status,

View File

@@ -20,13 +20,16 @@ from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional, Union
import simplejson as json
from flask_babel import gettext as _
from pandas import DataFrame
from superset import app
from superset.utils import core as utils
from superset.exceptions import QueryObjectValidationError
from superset.utils import core as utils, pandas_postprocessing
from superset.views.utils import get_time_range_endpoints
# TODO: Type Metrics dictionary with TypedDict when it becomes a vanilla python type
# https://github.com/python/mypy/issues/5288
# https://github.com/python/mypy/issues/5288
class QueryObject:
@@ -50,6 +53,7 @@ class QueryObject:
extras: Dict
columns: List[str]
orderby: List[List]
post_processing: List[Dict[str, Any]]
def __init__(
self,
@@ -67,6 +71,7 @@ class QueryObject:
extras: Optional[Dict] = None,
columns: Optional[List[str]] = None,
orderby: Optional[List[List]] = None,
post_processing: Optional[List[Dict[str, Any]]] = None,
relative_start: str = app.config["DEFAULT_RELATIVE_START_TIME"],
relative_end: str = app.config["DEFAULT_RELATIVE_END_TIME"],
):
@@ -81,8 +86,9 @@ class QueryObject:
self.time_range = time_range
self.time_shift = utils.parse_human_timedelta(time_shift)
self.groupby = groupby or []
self.post_processing = post_processing or []
# Temporal solution for backward compatability issue due the new format of
# Temporary solution for backward compatibility issue due to the new format of
# non-ad-hoc metric which needs to adhere to superset-ui per
# https://git.io/Jvm7P.
self.metrics = [
@@ -138,9 +144,37 @@ class QueryObject:
if self.time_range:
cache_dict["time_range"] = self.time_range
json_data = self.json_dumps(cache_dict, sort_keys=True)
if self.post_processing:
cache_dict["post_processing"] = self.post_processing
return hashlib.md5(json_data.encode("utf-8")).hexdigest()
def json_dumps(self, obj: Any, sort_keys: bool = False) -> str:
    """
    Serialize an object to a JSON string.

    :param obj: object to serialize
    :param sort_keys: forwarded to ``json.dumps`` as its ``sort_keys`` argument
    :return: the string produced by ``json.dumps``
    """
    # `utils.json_int_dttm_ser` is handed to the encoder as its `default`
    # hook and `ignore_nan` is always enabled; only key ordering is
    # caller-controlled.
    dump_kwargs = {
        "default": utils.json_int_dttm_ser,
        "ignore_nan": True,
        "sort_keys": sort_keys,
    }
    return json.dumps(obj, **dump_kwargs)
def exec_post_processing(self, df: DataFrame) -> DataFrame:
    """
    Perform post processing operations on DataFrame.

    Each entry of ``self.post_processing`` is a dict with an ``operation``
    key naming an attribute of the ``pandas_postprocessing`` module and an
    optional ``options`` dict that is passed to it as keyword arguments.
    Operations are applied in list order, each one receiving the result of
    the previous one.

    :param df: DataFrame returned from database model.
    :return: new DataFrame to which all post processing operations have been
             applied
    :raises QueryObjectValidationError: If ``operation`` is missing/empty or
            does not exist in ``pandas_postprocessing``
    """
    for post_process in self.post_processing:
        operation = post_process.get("operation")
        if not operation:
            raise QueryObjectValidationError(
                _("`operation` property of post processing object undefined")
            )
        # NOTE(review): any attribute present on the module passes this
        # check, not only intended operations - consider an explicit
        # allow-list of operation names.
        if not hasattr(pandas_postprocessing, operation):
            raise QueryObjectValidationError(
                _(
                    # The keyword must match the %(operation)s placeholder,
                    # otherwise interpolating the message itself fails.
                    "Unsupported post processing operation: %(operation)s",
                    operation=operation,
                )
            )
        # Treat a missing or explicitly null `options` as "no options" so
        # the keyword expansion below never receives None.
        options = post_process.get("options") or {}
        df = getattr(pandas_postprocessing, operation)(df, **options)
    return df