feat(advanced analysis): support MultiIndex column in post processing stage (#19116)

This commit is contained in:
Yongjie Zhao
2022-03-23 13:46:28 +08:00
committed by Ville Brofeldt
parent f8a92de75c
commit 9bc76337cf
55 changed files with 1272 additions and 772 deletions

View File

@@ -14,48 +14,35 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import Optional, Tuple, Union
from typing import Optional, Union
from pandas import DataFrame
import pandas as pd
from flask_babel import gettext as _
from superset.utils.pandas_postprocessing.utils import validate_column_args
from superset.exceptions import InvalidPostProcessingError
def resample(
    df: pd.DataFrame,
    rule: str,
    method: str,
    fill_value: Optional[Union[float, int]] = None,
) -> pd.DataFrame:
    """
    Resample a time-indexed DataFrame to a new frequency (supports upsampling).

    The frame is resampled on its own index, so the caller must already have
    moved the temporal column into the index before calling this function.

    :param df: DataFrame to resample; its index must be a DatetimeIndex.
    :param rule: The offset string representing target conversion.
    :param method: Name of the resampler method that produces the values of
           the resampled rows (for example "asfreq" or "ffill").
    :param fill_value: Value written into the slots created by the resample.
           Only honoured when ``method`` is "asfreq"; ignored otherwise.
    :return: DataFrame after resample, still indexed by the DatetimeIndex
    :raises InvalidPostProcessingError: If the DataFrame index is not a
            DatetimeIndex
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise InvalidPostProcessingError(_("Resample operation requires DatetimeIndex"))

    resampler = df.resample(rule)
    if method == "asfreq" and fill_value is not None:
        # asfreq is the only resampler method that accepts a fill value, so
        # it is dispatched explicitly instead of through getattr below.
        return resampler.asfreq(fill_value=fill_value)
    return getattr(resampler, method)()