superset2/superset/utils/pandas_postprocessing/pivot.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from typing import Any, Optional

from flask_babel import gettext as _
from pandas import DataFrame

from superset.constants import NULL_STRING, PandasAxis
from superset.exceptions import InvalidPostProcessingError
from superset.utils.pandas_postprocessing.utils import (
    _get_aggregate_funcs,
    validate_column_args,
)


def _restore_dropped_metric_columns(
    df: DataFrame,
    expected_metrics: list[str],
    orig_columns: Optional[DataFrame],
) -> DataFrame:
    """Re-add metric columns that pivot_table dropped due to all-NaN values.

    When drop_missing_columns=True, pandas pivot_table silently removes columns
    whose entries are all NaN. This breaks downstream post-processing steps
    (rename, rolling) that use validate_column_args to assert the columns exist.
    Restoring the columns as all-NaN preserves the expected schema while still
    allowing sparse category combinations to be dropped — only metric-level
    absences are restored.

    Note: this intentionally changes the visible output of drop_missing_columns=True
    for all-NaN metrics: they are kept as empty series rather than dropped. This is
    necessary for chart-rendering post-processing to maintain schema stability.

    Restored columns are appended after the existing ones, grouped by metric in
    the order given by ``expected_metrics``; within a metric, the group-by
    combinations follow their first appearance in ``orig_columns``.

    :param df: Post-pivot DataFrame (may have MultiIndex or flat columns). It is
           modified in place and also returned.
    :param expected_metrics: Metric column names that should exist at level 0.
    :param orig_columns: Pre-pivot slice of the groupby column(s), used to
           lazily compute (metric, *col_vals) restoration keys for only the
           metrics that were entirely absent after pivoting. None for flat pivots.
    :return: ``df`` with an all-NaN column added for every restored key.
    """
    if orig_columns is None:
        # Flat case (no groupby columns): restore simple metric columns.
        for metric in expected_metrics:
            if metric not in df.columns:
                df[metric] = float("nan")
        return df

    # MultiIndex case. Only metrics that are entirely absent from level 0 are
    # restored; a metric that kept at least one column is left untouched.
    existing_metrics = (
        set(df.columns.get_level_values(0)) if len(df.columns) > 0 else set()
    )
    # An ordered, de-duplicated list (not a set): iterating a set of strings
    # depends on per-process hash randomisation, which would make the order of
    # the restored columns vary between runs.
    missing = list(
        dict.fromkeys(m for m in expected_metrics if m not in existing_metrics)
    )
    if not missing:
        # Nothing was dropped: skip scanning the pre-pivot rows altogether.
        return df

    # De-duplicate inside pandas so the Python-level loop below runs once per
    # unique group-by combination rather than once per source row.
    # drop_duplicates keeps the first occurrence, preserving data order.
    combos: list[tuple[Any, ...]] = list(
        orig_columns.drop_duplicates().itertuples(index=False, name=None)
    )
    for metric in missing:
        for combo in combos:
            df[(metric, *combo)] = float("nan")
    return df


@validate_column_args("index", "columns")
def pivot(  # pylint: disable=too-many-arguments
    df: DataFrame,
    index: list[str],
    aggregates: dict[str, dict[str, Any]],
    columns: Optional[list[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = NULL_STRING,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param aggregates: A mapping from aggregate column name to the aggregate
           config.
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with. By default
           replaces missing values with "<NULL>". Set to `None` to remove columns
           with missing values.
    :param drop_missing_columns: Do not include columns whose entries are all missing.
           Note: metric columns entirely absent after pivoting (the whole metric is
           all-NaN) are restored as empty series so that downstream post-processing
           (rename, rolling) can reference them. Sparse category combinations where
           only some (metric, category) pairs are all-NaN may still be dropped.
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param marginal_distributions: Add totals for row/column. Default to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Default to 'All'.
    :return: A pivot table
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    if not index:
        raise InvalidPostProcessingError(
            _("Pivot operation requires at least one index")
        )
    if not aggregates:
        raise InvalidPostProcessingError(
            _("Pivot operation must include at least one aggregate")
        )

    if columns and column_fill_value:
        # NOTE: this assigns into the DataFrame object passed in by the caller.
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    # For drop_missing_columns=False: pre-compute all (metric, *col_vals) tuples
    # to filter Cartesian-product columns after pivoting. The rows are
    # de-duplicated first so the Python-level loop runs once per unique
    # combination instead of once per source row.
    # For drop_missing_columns=True: save a slice of the groupby column data so
    # that _restore_dropped_metric_columns can build keys lazily — only for metrics
    # that were actually dropped, avoiding O(n_rows × n_metrics) upfront work in
    # the common case where no metric is entirely all-NaN.
    # https://github.com/apache/superset/issues/15956
    # https://github.com/pandas-dev/pandas/issues/18030
    pivot_key_set: set[tuple[Any, ...]] = set()
    orig_columns_df: Optional[DataFrame] = None
    if columns:
        if drop_missing_columns:
            orig_columns_df = df[columns]
        else:
            unique_combos = (
                df[columns].drop_duplicates().itertuples(index=False, name=None)
            )
            pivot_key_set = {
                (metric, *combo) for combo in unique_combos for metric in aggfunc
            }

    # pandas requires margins_name to be a string whenever margins are enabled,
    # so apply the documented 'All' default here instead of forwarding None.
    margins_name = (
        "All" if marginal_distribution_name is None else marginal_distribution_name
    )

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=margins_name,
    )

    if drop_missing_columns:
        # Re-add metrics that pivot_table removed because they were all-NaN.
        df = _restore_dropped_metric_columns(df, list(aggfunc.keys()), orig_columns_df)
    elif pivot_key_set and not df.empty:
        # dropna=False yields the full Cartesian product of column values; keep
        # only the (metric, *col_vals) combinations present in the source data.
        df = df.drop(df.columns.difference(pivot_key_set), axis=PandasAxis.COLUMN)

    if combine_value_with_metric:
        # dropna=False preserves restored all-NaN metric rows that would otherwise
        # be silently dropped by stack's default dropna=True behavior.
        # NOTE(review): newer pandas releases rework `stack` (the `future_stack`
        # implementation does not accept `dropna`) — confirm against the pandas
        # version this project pins.
        df = df.stack(level=0, dropna=False).unstack()

    return df