mirror of
https://github.com/apache/superset.git
synced 2026-04-14 13:44:46 +00:00
refactor: decouple pandas postprocessing operator (#18710)
This commit is contained in:
201
superset/utils/pandas_postprocessing/utils.py
Normal file
201
superset/utils/pandas_postprocessing/utils.py
Normal file
@@ -0,0 +1,201 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
from functools import partial, wraps
from typing import Any, Callable, Dict, Tuple, Union

import numpy as np
from flask_babel import gettext as _
from pandas import DataFrame, NamedAgg, Timestamp

from superset.exceptions import QueryObjectValidationError
|
||||
|
||||
# Mapping from operator names accepted in aggregate configs to the numpy
# callables that evaluate them (resolved in _get_aggregate_funcs).
NUMPY_FUNCTIONS = {
    "average": np.average,
    "argmin": np.argmin,
    "argmax": np.argmax,
    # np.ma.count works on both masked arrays and plain sequences
    "count": np.ma.count,
    "count_nonzero": np.count_nonzero,
    "cumsum": np.cumsum,
    "cumprod": np.cumprod,
    "max": np.max,
    "mean": np.mean,
    "median": np.median,
    "nansum": np.nansum,
    "nanmin": np.nanmin,
    "nanmax": np.nanmax,
    "nanmean": np.nanmean,
    "nanmedian": np.nanmedian,
    "nanpercentile": np.nanpercentile,
    "min": np.min,
    "percentile": np.percentile,
    "prod": np.prod,
    # np.product is a deprecated alias of np.prod (removed in numpy 2.0);
    # keep the public "product" operator name but bind it to np.prod.
    "product": np.prod,
    "std": np.std,
    "sum": np.sum,
    "var": np.var,
}
|
||||
|
||||
# Names of rolling-window aggregation methods as exposed by pandas
# ``Rolling`` objects.
# NOTE(review): despite the "denylist" name these are the standard pandas
# rolling aggregations — the consumer of this tuple is outside this module,
# so confirm at the call site whether membership is accepted or rejected.
DENYLIST_ROLLING_FUNCTIONS = (
    "count",
    "corr",
    "cov",
    "kurt",
    "max",
    "mean",
    "median",
    "min",
    "std",
    "skew",
    "sum",
    "var",
    "quantile",
)
|
||||
|
||||
# Cumulative aggregation method names matching pandas ``DataFrame`` methods
# (cummax/cummin/cumprod/cumsum).
# NOTE(review): presumably dispatched by name (e.g. getattr) by the
# cumulative postprocessing operation — confirm at the call site.
ALLOWLIST_CUMULATIVE_FUNCTIONS = (
    "cummax",
    "cummin",
    "cumprod",
    "cumsum",
)
|
||||
|
||||
# Maps ISO 8601 duration/interval time-grain identifiers to pandas frequency
# alias strings, used when resampling time series for Prophet forecasts.
# NOTE(review): the "S", "H", "M" and "A" aliases are deprecated in
# pandas >= 2.2 (renamed to "s", "h", "ME", "YE") — verify against the
# pandas version pinned by the project before upgrading.
PROPHET_TIME_GRAIN_MAP = {
    "PT1S": "S",
    "PT1M": "min",
    "PT5M": "5min",
    "PT10M": "10min",
    "PT15M": "15min",
    "PT30M": "30min",
    "PT1H": "H",
    "P1D": "D",
    "P1W": "W",
    "P1M": "M",
    "P3M": "Q",
    "P1Y": "A",
    # week grains anchored to specific epoch boundaries (different week-start
    # conventions) all resample at weekly frequency
    "1969-12-28T00:00:00Z/P1W": "W",
    "1969-12-29T00:00:00Z/P1W": "W",
    "P1W/1970-01-03T00:00:00Z": "W",
    "P1W/1970-01-04T00:00:00Z": "W",
}
|
||||
|
||||
|
||||
def _flatten_column_after_pivot(
|
||||
column: Union[float, Timestamp, str, Tuple[str, ...]],
|
||||
aggregates: Dict[str, Dict[str, Any]],
|
||||
) -> str:
|
||||
"""
|
||||
Function for flattening column names into a single string. This step is necessary
|
||||
to be able to properly serialize a DataFrame. If the column is a string, return
|
||||
element unchanged. For multi-element columns, join column elements with a comma,
|
||||
with the exception of pivots made with a single aggregate, in which case the
|
||||
aggregate column name is omitted.
|
||||
|
||||
:param column: single element from `DataFrame.columns`
|
||||
:param aggregates: aggregates
|
||||
:return:
|
||||
"""
|
||||
if not isinstance(column, tuple):
|
||||
column = (column,)
|
||||
if len(aggregates) == 1 and len(column) > 1:
|
||||
# drop aggregate for single aggregate pivots with multiple groupings
|
||||
# from column name (aggregates always come first in column name)
|
||||
column = column[1:]
|
||||
return ", ".join([str(col) for col in column])
|
||||
|
||||
|
||||
def validate_column_args(*argnames: str) -> Callable[..., Any]:
    """
    Decorator factory validating column references in a postprocessing
    operation. Each name in `argnames` identifies a keyword argument whose
    value is an iterable of column names; every referenced column must be
    present in the DataFrame passed to the wrapped function.

    :param argnames: keyword-argument names whose values reference columns
    :return: decorator for a function taking `(df, **options)`
    :raises QueryObjectValidationError: (from the wrapped call) if any
        referenced column is missing from the DataFrame
    """

    def wrapper(func: Callable[..., Any]) -> Callable[..., Any]:
        # preserve the wrapped function's name/docstring for introspection
        @wraps(func)
        def wrapped(df: DataFrame, **options: Any) -> Any:
            if options.get("is_pivot_df"):
                # skip validation when pivot Dataframe
                return func(df, **options)
            columns = df.columns.tolist()
            for name in argnames:
                if name in options and not all(
                    elem in columns for elem in options.get(name) or []
                ):
                    raise QueryObjectValidationError(
                        _("Referenced columns not available in DataFrame.")
                    )
            return func(df, **options)

        return wrapped

    return wrapper
|
||||
|
||||
|
||||
def _get_aggregate_funcs(
|
||||
df: DataFrame, aggregates: Dict[str, Dict[str, Any]],
|
||||
) -> Dict[str, NamedAgg]:
|
||||
"""
|
||||
Converts a set of aggregate config objects into functions that pandas can use as
|
||||
aggregators. Currently only numpy aggregators are supported.
|
||||
|
||||
:param df: DataFrame on which to perform aggregate operation.
|
||||
:param aggregates: Mapping from column name to aggregate config.
|
||||
:return: Mapping from metric name to function that takes a single input argument.
|
||||
"""
|
||||
agg_funcs: Dict[str, NamedAgg] = {}
|
||||
for name, agg_obj in aggregates.items():
|
||||
column = agg_obj.get("column", name)
|
||||
if column not in df:
|
||||
raise QueryObjectValidationError(
|
||||
_(
|
||||
"Column referenced by aggregate is undefined: %(column)s",
|
||||
column=column,
|
||||
)
|
||||
)
|
||||
if "operator" not in agg_obj:
|
||||
raise QueryObjectValidationError(
|
||||
_("Operator undefined for aggregator: %(name)s", name=name,)
|
||||
)
|
||||
operator = agg_obj["operator"]
|
||||
if callable(operator):
|
||||
aggfunc = operator
|
||||
else:
|
||||
func = NUMPY_FUNCTIONS.get(operator)
|
||||
if not func:
|
||||
raise QueryObjectValidationError(
|
||||
_("Invalid numpy function: %(operator)s", operator=operator,)
|
||||
)
|
||||
options = agg_obj.get("options", {})
|
||||
aggfunc = partial(func, **options)
|
||||
agg_funcs[name] = NamedAgg(column=column, aggfunc=aggfunc)
|
||||
|
||||
return agg_funcs
|
||||
|
||||
|
||||
def _append_columns(
|
||||
base_df: DataFrame, append_df: DataFrame, columns: Dict[str, str]
|
||||
) -> DataFrame:
|
||||
"""
|
||||
Function for adding columns from one DataFrame to another DataFrame. Calls the
|
||||
assign method, which overwrites the original column in `base_df` if the column
|
||||
already exists, and appends the column if the name is not defined.
|
||||
|
||||
:param base_df: DataFrame which to use as the base
|
||||
:param append_df: DataFrame from which to select data.
|
||||
:param columns: columns on which to append, mapping source column to
|
||||
target column. For instance, `{'y': 'y'}` will replace the values in
|
||||
column `y` in `base_df` with the values in `y` in `append_df`,
|
||||
while `{'y': 'y2'}` will add a column `y2` to `base_df` based
|
||||
on values in column `y` in `append_df`, leaving the original column `y`
|
||||
in `base_df` unchanged.
|
||||
:return: new DataFrame with combined data from `base_df` and `append_df`
|
||||
"""
|
||||
return base_df.assign(
|
||||
**{target: append_df[source] for source, target in columns.items()}
|
||||
)
|
||||
Reference in New Issue
Block a user