mirror of
https://github.com/apache/superset.git
synced 2026-04-14 13:44:46 +00:00
refactor: decouple pandas postprocessing operator (#18710)
This commit is contained in:
201
superset/utils/pandas_postprocessing/utils.py
Normal file
201
superset/utils/pandas_postprocessing/utils.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from functools import partial, wraps
from typing import Any, Callable, Dict, Tuple, Union

import numpy as np
from flask_babel import gettext as _
from pandas import DataFrame, NamedAgg, Timestamp

from superset.exceptions import QueryObjectValidationError
|
||||
|
||||
# Mapping from operator names accepted in aggregate configs to the numpy
# callables that evaluate them (resolved in _get_aggregate_funcs).
NUMPY_FUNCTIONS = {
    "average": np.average,
    "argmin": np.argmin,
    "argmax": np.argmax,
    # np.ma.count works on both masked arrays and plain sequences
    "count": np.ma.count,
    "count_nonzero": np.count_nonzero,
    "cumsum": np.cumsum,
    "cumprod": np.cumprod,
    "max": np.max,
    "mean": np.mean,
    "median": np.median,
    "nansum": np.nansum,
    "nanmin": np.nanmin,
    "nanmax": np.nanmax,
    "nanmean": np.nanmean,
    "nanmedian": np.nanmedian,
    "nanpercentile": np.nanpercentile,
    "min": np.min,
    "percentile": np.percentile,
    "prod": np.prod,
    # np.product is a deprecated alias of np.prod (removed in numpy 2.0);
    # keep the public "product" operator name but bind it to np.prod.
    "product": np.prod,
    "std": np.std,
    "sum": np.sum,
    "var": np.var,
}
|
||||
|
||||
# Names of rolling-window aggregation methods as exposed by pandas
# ``Rolling`` objects.
# NOTE(review): despite the "denylist" name these are the standard pandas
# rolling aggregations — the consumer of this tuple is outside this module,
# so confirm at the call site whether membership is accepted or rejected.
DENYLIST_ROLLING_FUNCTIONS = (
    "count",
    "corr",
    "cov",
    "kurt",
    "max",
    "mean",
    "median",
    "min",
    "std",
    "skew",
    "sum",
    "var",
    "quantile",
)
|
||||
|
||||
# Cumulative aggregation method names matching pandas ``DataFrame`` methods
# (cummax/cummin/cumprod/cumsum).
# NOTE(review): presumably dispatched by name (e.g. getattr) by the
# cumulative postprocessing operation — confirm at the call site.
ALLOWLIST_CUMULATIVE_FUNCTIONS = (
    "cummax",
    "cummin",
    "cumprod",
    "cumsum",
)
|
||||
|
||||
# Maps ISO 8601 duration/interval time-grain identifiers to pandas frequency
# alias strings, used when resampling time series for Prophet forecasts.
# NOTE(review): the "S", "H", "M" and "A" aliases are deprecated in
# pandas >= 2.2 (renamed to "s", "h", "ME", "YE") — verify against the
# pandas version pinned by the project before upgrading.
PROPHET_TIME_GRAIN_MAP = {
    "PT1S": "S",
    "PT1M": "min",
    "PT5M": "5min",
    "PT10M": "10min",
    "PT15M": "15min",
    "PT30M": "30min",
    "PT1H": "H",
    "P1D": "D",
    "P1W": "W",
    "P1M": "M",
    "P3M": "Q",
    "P1Y": "A",
    # week grains anchored to specific epoch boundaries (different week-start
    # conventions) all resample at weekly frequency
    "1969-12-28T00:00:00Z/P1W": "W",
    "1969-12-29T00:00:00Z/P1W": "W",
    "P1W/1970-01-03T00:00:00Z": "W",
    "P1W/1970-01-04T00:00:00Z": "W",
}
|
||||
|
||||
|
||||
def _flatten_column_after_pivot(
|
||||
column: Union[float, Timestamp, str, Tuple[str, ...]],
|
||||
aggregates: Dict[str, Dict[str, Any]],
|
||||
) -> str:
|
||||
"""
|
||||
Function for flattening column names into a single string. This step is necessary
|
||||
to be able to properly serialize a DataFrame. If the column is a string, return
|
||||
element unchanged. For multi-element columns, join column elements with a comma,
|
||||
with the exception of pivots made with a single aggregate, in which case the
|
||||
aggregate column name is omitted.
|
||||
|
||||
:param column: single element from `DataFrame.columns`
|
||||
:param aggregates: aggregates
|
||||
:return:
|
||||
"""
|
||||
if not isinstance(column, tuple):
|
||||
column = (column,)
|
||||
if len(aggregates) == 1 and len(column) > 1:
|
||||
# drop aggregate for single aggregate pivots with multiple groupings
|
||||
# from column name (aggregates always come first in column name)
|
||||
column = column[1:]
|
||||
return ", ".join([str(col) for col in column])
|
||||
|
||||
|
||||
def validate_column_args(*argnames: str) -> Callable[..., Any]:
    """
    Decorator factory validating column references in a postprocessing
    operation. Each name in `argnames` identifies a keyword argument whose
    value is an iterable of column names; every referenced column must be
    present in the DataFrame passed to the wrapped function.

    :param argnames: keyword-argument names whose values reference columns
    :return: decorator for a function taking `(df, **options)`
    :raises QueryObjectValidationError: (from the wrapped call) if any
        referenced column is missing from the DataFrame
    """

    def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
        # preserve the wrapped function's name/docstring for introspection
        @wraps(func)
        def wrapped(df: DataFrame, **options: Any) -> Any:
            if options.get("is_pivot_df"):
                # skip validation when pivot Dataframe
                return func(df, **options)
            columns = df.columns.tolist()
            for name in argnames:
                if name in options and not all(
                    elem in columns for elem in options.get(name) or []
                ):
                    raise QueryObjectValidationError(
                        _("Referenced columns not available in DataFrame.")
                    )
            return func(df, **options)

        return wrapped

    return wrapper
|
||||
|
||||
|
||||
def _get_aggregate_funcs(
|
||||
df: DataFrame, aggregates: Dict[str, Dict[str, Any]],
|
||||
) -> Dict[str, NamedAgg]:
|
||||
"""
|
||||
Converts a set of aggregate config objects into functions that pandas can use as
|
||||
aggregators. Currently only numpy aggregators are supported.
|
||||
|
||||
:param df: DataFrame on which to perform aggregate operation.
|
||||
:param aggregates: Mapping from column name to aggregate config.
|
||||
:return: Mapping from metric name to function that takes a single input argument.
|
||||
"""
|
||||
agg_funcs: Dict[str, NamedAgg] = {}
|
||||
for name, agg_obj in aggregates.items():
|
||||
column = agg_obj.get("column", name)
|
||||
if column not in df:
|
||||
raise QueryObjectValidationError(
|
||||
_(
|
||||
"Column referenced by aggregate is undefined: %(column)s",
|
||||
column=column,
|
||||
)
|
||||
)
|
||||
if "operator" not in agg_obj:
|
||||
raise QueryObjectValidationError(
|
||||
_("Operator undefined for aggregator: %(name)s", name=name,)
|
||||
)
|
||||
operator = agg_obj["operator"]
|
||||
if callable(operator):
|
||||
aggfunc = operator
|
||||
else:
|
||||
func = NUMPY_FUNCTIONS.get(operator)
|
||||
if not func:
|
||||
raise QueryObjectValidationError(
|
||||
_("Invalid numpy function: %(operator)s", operator=operator,)
|
||||
)
|
||||
options = agg_obj.get("options", {})
|
||||
aggfunc = partial(func, **options)
|
||||
agg_funcs[name] = NamedAgg(column=column, aggfunc=aggfunc)
|
||||
|
||||
return agg_funcs
|
||||
|
||||
|
||||
def _append_columns(
|
||||
base_df: DataFrame, append_df: DataFrame, columns: Dict[str, str]
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Function for adding columns from one DataFrame to another DataFrame. Calls the
|
||||
assign method, which overwrites the original column in `base_df` if the column
|
||||
already exists, and appends the column if the name is not defined.
|
||||
|
||||
:param base_df: DataFrame which to use as the base
|
||||
:param append_df: DataFrame from which to select data.
|
||||
:param columns: columns on which to append, mapping source column to
|
||||
target column. For instance, `{'y': 'y'}` will replace the values in
|
||||
column `y` in `base_df` with the values in `y` in `append_df`,
|
||||
while `{'y': 'y2'}` will add a column `y2` to `base_df` based
|
||||
on values in column `y` in `append_df`, leaving the original column `y`
|
||||
in `base_df` unchanged.
|
||||
:return: new DataFrame with combined data from `base_df` and `append_df`
|
||||
"""
|
||||
return base_df.assign(
|
||||
**{target: append_df[source] for source, target in columns.items()}
|
||||
)
|
||||
Reference in New Issue
Block a user