From 01224007dafd123f9722fd722cf34a7cf3c5e70f Mon Sep 17 00:00:00 2001 From: Mafi Date: Thu, 14 May 2026 20:46:34 +1000 Subject: [PATCH] fix(mixed-timeseries): preserve all-NaN metric columns after pivot when Jinja evaluates to NULL (#40005) Co-authored-by: Matt Fitzgerald Co-authored-by: Claude Sonnet 4.6 --- superset/utils/pandas_postprocessing/pivot.py | 78 +++++- .../pandas_postprocessing/test_pivot.py | 243 ++++++++++++++++++ 2 files changed, 313 insertions(+), 8 deletions(-) diff --git a/superset/utils/pandas_postprocessing/pivot.py b/superset/utils/pandas_postprocessing/pivot.py index aadde058b21..3f8883bfb56 100644 --- a/superset/utils/pandas_postprocessing/pivot.py +++ b/superset/utils/pandas_postprocessing/pivot.py @@ -27,6 +27,55 @@ from superset.utils.pandas_postprocessing.utils import ( ) +def _restore_dropped_metric_columns( + df: DataFrame, + expected_metrics: list[str], + orig_columns: Optional[DataFrame], +) -> DataFrame: + """Re-add metric columns that pivot_table dropped due to all-NaN values. + + When drop_missing_columns=True, pandas pivot_table silently removes columns + whose entries are all NaN. This breaks downstream post-processing steps + (rename, rolling) that use validate_column_args to assert the columns exist. + Restoring the columns as all-NaN preserves the expected schema while still + allowing sparse category combinations to be dropped — only metric-level + absences are restored. + + Note: this intentionally changes the visible output of drop_missing_columns=True + for all-NaN metrics: they are kept as empty series rather than dropped. This is + necessary for chart-rendering post-processing to maintain schema stability. + + :param df: Post-pivot DataFrame (may have MultiIndex or flat columns). + :param expected_metrics: Metric column names that should exist at level 0. + :param orig_columns: Pre-pivot slice of the groupby column(s), used to + lazily compute (metric, *col_vals) restoration keys for only the + metrics that were entirely absent after pivoting. None for flat pivots. + """ + if orig_columns is not None: + # MultiIndex case. Only compute keys for metrics that were entirely + # dropped — skips metrics still present, avoiding O(n_rows × n_metrics) + # upfront work when no all-NaN drop occurred. + existing_metrics = ( + set(df.columns.get_level_values(0)) if len(df.columns) > 0 else set() + ) + missing = {m for m in expected_metrics if m not in existing_metrics} + if missing: + # Dict preserves data-insertion order and deduplicates, so restored + # columns appear in deterministic order. + keys_dict: dict[tuple[Any, ...], None] = {} + for row in orig_columns.itertuples(): + for metric in missing: + keys_dict[(metric, *row[1:])] = None + for key in keys_dict: + df[key] = float("nan") + else: + # Flat case (no groupby columns): restore simple metric columns. + for metric in expected_metrics: + if metric not in df.columns: + df[metric] = float("nan") + return df + + @validate_column_args("index", "columns") def pivot( # pylint: disable=too-many-arguments df: DataFrame, @@ -50,7 +99,11 @@ def pivot( # pylint: disable=too-many-arguments :param column_fill_value: Value to replace missing pivot columns with. By default replaces missing values with "". Set to `None` to remove columns with missing values. - :param drop_missing_columns: Do not include columns whose entries are all missing + :param drop_missing_columns: Do not include columns whose entries are all missing. + Note: metric columns entirely absent after pivoting (the whole metric is + all-NaN) are restored as empty series so that downstream post-processing + (rename, rolling) can reference them. Sparse category combinations where + only some (metric, category) pairs are all-NaN may still be dropped. :param combine_value_with_metric: Display metrics side by side within each column, as opposed to each column being displayed side by side for each metric. :param aggregates: A mapping from aggregate column name to the aggregate @@ -79,15 +132,20 @@ def pivot( # pylint: disable=too-many-arguments # Remove once/if support is added. aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()} - # When dropna = False, the pivot_table function will calculate cartesian-product - # for MultiIndex. + # For drop_missing_columns=False: pre-compute all (metric, *col_vals) tuples + # to filter Cartesian-product columns after pivoting. + # For drop_missing_columns=True: save a slice of the groupby column data so + # that _restore_dropped_metric_columns can build keys lazily — only for metrics + # that were actually dropped, avoiding O(n_rows × n_metrics) upfront work in + # the common case where no metric is entirely all-NaN. # https://github.com/apache/superset/issues/15956 # https://github.com/pandas-dev/pandas/issues/18030 - series_set = set() + pivot_key_set: set[tuple[Any, ...]] = set() if not drop_missing_columns and columns: for row in df[columns].itertuples(): for metric in aggfunc.keys(): - series_set.add(tuple([metric]) + tuple(row[1:])) # noqa: C409 + pivot_key_set.add((metric, *row[1:])) + orig_columns_df = df[columns] if columns else None df = df.pivot_table( values=aggfunc.keys(), @@ -100,10 +158,14 @@ def pivot( # pylint: disable=too-many-arguments margins_name=marginal_distribution_name, ) - if not drop_missing_columns and len(series_set) > 0 and not df.empty: - df = df.drop(df.columns.difference(series_set), axis=PandasAxis.COLUMN) + if drop_missing_columns: + df = _restore_dropped_metric_columns(df, list(aggfunc.keys()), orig_columns_df) + elif pivot_key_set and not df.empty: + df = df.drop(df.columns.difference(pivot_key_set), axis=PandasAxis.COLUMN) if combine_value_with_metric: - df = df.stack(0).unstack() + # dropna=False preserves restored all-NaN metric rows that would otherwise + # be silently dropped by stack's default dropna=True behavior. + df = df.stack(level=0, dropna=False).unstack() return df diff --git a/tests/unit_tests/pandas_postprocessing/test_pivot.py b/tests/unit_tests/pandas_postprocessing/test_pivot.py index 5b05b9a3eab..616f1c44316 100644 --- a/tests/unit_tests/pandas_postprocessing/test_pivot.py +++ b/tests/unit_tests/pandas_postprocessing/test_pivot.py @@ -16,6 +16,7 @@ # under the License. import numpy as np +import pandas as pd import pytest from pandas import DataFrame, to_datetime @@ -203,3 +204,245 @@ def test_pivot_eliminate_cartesian_product_columns(): "metric2, 1, 1", ] assert np.isnan(df["metric, 1, 1"][0]) + + +def test_pivot_preserves_all_nan_metric_flat(): + """ + Pivot with drop_missing_columns=True must not drop metric columns whose entries + are all NaN. This prevents downstream post-processing (e.g. rename) from failing + with "Referenced columns not available in DataFrame" when a Jinja metric + expression evaluates to NULL for every row (SC-100398). + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"]), + "metric": [np.nan, np.nan, np.nan], + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + aggregates={"metric": {"operator": "mean"}}, + drop_missing_columns=True, + ) + + assert "metric" in df.columns + assert df["metric"].isna().all() + + +def test_pivot_preserves_all_nan_metric_with_columns(): + """ + Pivot with groupby columns and drop_missing_columns=True must restore the + exact (metric, category_val) MultiIndex keys when all values for that metric + are NaN. The restored keys must use the actual category values from the input + data so that downstream rename/rolling validation and flatten produce the + correct column names. + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-01"]), + "category": ["A", "B"], + "metric": [np.nan, np.nan], + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["category"], + aggregates={"metric": {"operator": "mean"}}, + drop_missing_columns=True, + ) + + assert isinstance(df.columns, pd.MultiIndex) + assert "metric" in df.columns.get_level_values(0) + # Exact keys must reflect the real category values, not placeholders. + assert ("metric", "A") in df.columns + assert ("metric", "B") in df.columns + + df = flatten(df) + assert "metric, A" in df.columns + assert "metric, B" in df.columns + assert df["metric, A"].isna().all() + assert df["metric, B"].isna().all() + + +def test_pivot_preserves_all_nan_metric_multi_column(): + """ + Pivot with multiple groupby columns and an all-NaN metric restores the full + multi-level (metric, col_val_1, col_val_2) key, not a truncated or placeholder + version. Exercises the case where columns=["country", "category"]. + """ + mock_df = DataFrame( + { + "dttm": to_datetime( + ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"] + ), + "country": ["US", "US", "EU", "EU"], + "category": ["A", "B", "A", "B"], + "metric": [np.nan, np.nan, np.nan, np.nan], + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["country", "category"], + aggregates={"metric": {"operator": "mean"}}, + drop_missing_columns=True, + ) + + assert isinstance(df.columns, pd.MultiIndex) + assert "metric" in df.columns.get_level_values(0) + # All four combinations must be restored with correct full tuple keys. + assert ("metric", "US", "A") in df.columns + assert ("metric", "US", "B") in df.columns + assert ("metric", "EU", "A") in df.columns + assert ("metric", "EU", "B") in df.columns + + df = flatten(df) + assert "metric, US, A" in df.columns + assert "metric, EU, B" in df.columns + assert df["metric, US, A"].isna().all() + + +def test_pivot_restored_nan_metric_column_order_is_deterministic(): + """ + Restored all-NaN metric columns must appear in data-insertion order, not + in nondeterministic hash-set iteration order. This prevents column ordering + from varying across Python processes (which randomize hash seeds by default). + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-01", "2019-01-01"]), + "category": ["C", "A", "B"], + "metric": [np.nan, np.nan, np.nan], + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["category"], + aggregates={"metric": {"operator": "mean"}}, + drop_missing_columns=True, + ) + + # Columns restored in data-insertion order: C, A, B (not alphabetical or random). + assert list(df.columns.get_level_values(1)) == ["C", "A", "B"] + + +def test_pivot_preserves_all_nan_metric_combine_value_with_metric(): + """ + When combine_value_with_metric=True, a stack()/unstack() is applied after + column restoration. stack() drops all-NaN rows by default, which would remove + the restored metric before downstream post-processing can reference it. + Using dropna=False on stack() ensures restored all-NaN metrics survive. + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-01"]), + "category": ["A", "B"], + "metric": [np.nan, np.nan], + "metric2": [1.0, 2.0], + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["category"], + aggregates={ + "metric": {"operator": "mean"}, + "metric2": {"operator": "mean"}, + }, + drop_missing_columns=True, + combine_value_with_metric=True, + ) + + # After stack()/unstack(), columns are (category_val, metric_name) tuples. + # The all-NaN metric must appear in level 1 alongside metric2. + assert isinstance(df.columns, pd.MultiIndex) + metric_names = df.columns.get_level_values(1).tolist() + assert "metric" in metric_names + assert "metric2" in metric_names + + +def test_pivot_combine_sparse_metrics_no_spurious_extra_columns(): + """ + With drop_missing_columns=True and combine_value_with_metric=True, using + stack(dropna=False) to preserve restored all-NaN metrics must not alter output + shape for sparse-but-not-all-NaN metric/category pairs. stack(dropna=False) only + changes behaviour for rows that are entirely NaN (a restored metric); sparse rows + with at least one non-NaN value are unaffected — same result as dropna=True. + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-01"]), + "category": ["A", "B"], + "metric1": [1.0, np.nan], # data only for category A + "metric2": [np.nan, 2.0], # data only for category B + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["category"], + aggregates={ + "metric1": {"operator": "mean"}, + "metric2": {"operator": "mean"}, + }, + drop_missing_columns=True, + combine_value_with_metric=True, + ) + + # After combine, columns are (category_val, metric_name) tuples. + # Neither metric is entirely absent after pivoting, so _restore adds nothing. + # stack(dropna=False) does not change results for sparse rows with mixed NaN/data. + assert isinstance(df.columns, pd.MultiIndex) + assert sorted(df.columns.get_level_values(0).unique()) == ["A", "B"] + assert sorted(df.columns.get_level_values(1).unique()) == ["metric1", "metric2"] + # Sparse NaN cells are present but the data cells must retain their values. + assert df[("A", "metric1")].iloc[0] == 1.0 + assert df[("B", "metric2")].iloc[0] == 2.0 + + +def test_pivot_only_entirely_absent_metrics_are_restored(): + """ + Only metrics with zero surviving columns after pivoting are restored. + A metric with partial NaN — data for some categories but not all — must not + be touched: its present columns are unchanged and its absent sparse combinations + remain dropped. This makes the restoration invariant explicit. + """ + mock_df = DataFrame( + { + "dttm": to_datetime(["2019-01-01", "2019-01-01"]), + "category": ["A", "B"], + "metric_all_nan": [np.nan, np.nan], # entirely absent → restored + "metric_partial": [1.0, np.nan], # partially present → not restored + } + ) + + df = pivot( + df=mock_df, + index=["dttm"], + columns=["category"], + aggregates={ + "metric_all_nan": {"operator": "mean"}, + "metric_partial": {"operator": "mean"}, + }, + drop_missing_columns=True, + ) + + # metric_all_nan was entirely absent: both category columns are restored as NaN. + assert ("metric_all_nan", "A") in df.columns + assert ("metric_all_nan", "B") in df.columns + assert df[("metric_all_nan", "A")].isna().all() + assert df[("metric_all_nan", "B")].isna().all() + + # metric_partial has data for A: present column is unchanged, sparse B dropped. + assert ("metric_partial", "A") in df.columns + assert ("metric_partial", "B") not in df.columns + assert df[("metric_partial", "A")].iloc[0] == 1.0