feat: Add post processing to QueryObject (#9427)

* Add post processing to QueryObject * Simplify sort signature and require explicit sort order * Add new operations and unit tests * linting * Address comments * Simplify test method names * Address comments * Linting * remove unnecessary logic * Apply strict whitelisting to all getattr calls * Add checking of rolling_type_options and add/improve docs
2026-04-21 00:54:44 +00:00 · 2020-04-10 20:50:11 +03:00
parent 5ec0192bcc
commit a8ce3bccdf
9 changed files with 899 additions and 12 deletions
--- a/superset/common/query_context.py
+++ b/superset/common/query_context.py
@@ -51,7 +51,7 @@ class QueryContext:
    custom_cache_timeout: Optional[int]

    # TODO: Type datasource and query_object dictionary with TypedDict when it becomes
-    # a vanilla python type https://github.com/python/mypy/issues/5288
+    #  a vanilla python type https://github.com/python/mypy/issues/5288
    def __init__(
        self,
        datasource: Dict[str, Any],
@@ -70,8 +70,8 @@ class QueryContext:
        """Returns a pandas dataframe based on the query object"""

        # Here, we assume that all the queries will use the same datasource, which is
-        # is a valid assumption for current setting. In a long term, we may or maynot
-        # support multiple queries from different data source.
+        # a valid assumption for current setting. In the long term, we may
+        # support multiple queries from different data sources.

        timestamp_format = None
        if self.datasource.type == "table":
@@ -105,6 +105,9 @@ class QueryContext:
                self.df_metrics_to_num(df, query_object)

            df.replace([np.inf, -np.inf], np.nan)
+
+        df = query_object.exec_post_processing(df)
+
        return {
            "query": result.query,
            "status": result.status,
--- a/superset/common/query_object.py
+++ b/superset/common/query_object.py
@@ -20,13 +20,16 @@ from datetime import datetime, timedelta
 from typing import Any, Dict, List, Optional, Union

 import simplejson as json
+from flask_babel import gettext as _
+from pandas import DataFrame

 from superset import app
-from superset.utils import core as utils
+from superset.exceptions import QueryObjectValidationError
+from superset.utils import core as utils, pandas_postprocessing
 from superset.views.utils import get_time_range_endpoints

 # TODO: Type Metrics dictionary with TypedDict when it becomes a vanilla python type
-# https://github.com/python/mypy/issues/5288
+#  https://github.com/python/mypy/issues/5288


 class QueryObject:
@@ -50,6 +53,7 @@ class QueryObject:
    extras: Dict
    columns: List[str]
    orderby: List[List]
+    post_processing: List[Dict[str, Any]]

    def __init__(
        self,
@@ -67,6 +71,7 @@ class QueryObject:
        extras: Optional[Dict] = None,
        columns: Optional[List[str]] = None,
        orderby: Optional[List[List]] = None,
+        post_processing: Optional[List[Dict[str, Any]]] = None,
        relative_start: str = app.config["DEFAULT_RELATIVE_START_TIME"],
        relative_end: str = app.config["DEFAULT_RELATIVE_END_TIME"],
    ):
@@ -81,8 +86,9 @@ class QueryObject:
        self.time_range = time_range
        self.time_shift = utils.parse_human_timedelta(time_shift)
        self.groupby = groupby or []
+        self.post_processing = post_processing or []

-        # Temporal solution for backward compatability issue due the new format of
+        # Temporary solution for backward compatibility issue due to the new format of
        # non-ad-hoc metric which needs to adhere to superset-ui per
        # https://git.io/Jvm7P.
        self.metrics = [
@@ -138,9 +144,37 @@ class QueryObject:
        if self.time_range:
            cache_dict["time_range"] = self.time_range
        json_data = self.json_dumps(cache_dict, sort_keys=True)
+        if self.post_processing:
+            cache_dict["post_processing"] = self.post_processing
        return hashlib.md5(json_data.encode("utf-8")).hexdigest()

    def json_dumps(self, obj: Any, sort_keys: bool = False) -> str:
        """
        Serialize an object to a JSON string.

        NaN values are emitted as null (``ignore_nan=True``) and objects the
        encoder cannot handle natively are passed to
        ``utils.json_int_dttm_ser``.

        :param obj: object to serialize
        :param sort_keys: whether dictionary keys are written in sorted order
        :return: JSON string representation of ``obj``
        """
        serialized: str = json.dumps(
            obj,
            sort_keys=sort_keys,
            ignore_nan=True,
            default=utils.json_int_dttm_ser,
        )
        return serialized
+
+    def exec_post_processing(self, df: DataFrame) -> DataFrame:
+        """
+        Perform post processing operations on DataFrame.
+
+        Each entry in ``self.post_processing`` is a dict with an ``operation``
+        key naming an attribute of ``pandas_postprocessing`` and an optional
+        ``options`` dict that is passed to it as keyword arguments. Operations
+        are applied in list order, each one receiving the output of the
+        previous one.
+
+        :param df: DataFrame returned from database model.
+        :return: DataFrame to which all post processing operations have been
+                 applied (the input itself when no operations are configured)
+        :raises QueryObjectValidationError: If the post processing operation is
+                 missing or unsupported
+        """
+        for post_process in self.post_processing:
+            operation = post_process.get("operation")
+            if not operation:
+                raise QueryObjectValidationError(
+                    _("`operation` property of post processing object undefined")
+                )
+            # `hasattr` raises TypeError for non-string names, so reject those
+            # with the same validation error as an unknown operation.
+            # NOTE(review): this accepts any attribute of the module, not only
+            # the intended public operations - consider an explicit whitelist.
+            if not isinstance(operation, str) or not hasattr(
+                pandas_postprocessing, operation
+            ):
+                # The keyword must match the `%(operation)s` placeholder,
+                # otherwise formatting the message raises KeyError.
+                raise QueryObjectValidationError(
+                    _(
+                        "Unsupported post processing operation: %(operation)s",
+                        operation=operation,
+                    )
+                )
+            # Treat a missing or null `options` value as "no options".
+            options = post_process.get("options") or {}
+            df = getattr(pandas_postprocessing, operation)(df, **options)
+        return df