From 01224007dafd123f9722fd722cf34a7cf3c5e70f Mon Sep 17 00:00:00 2001
From: Mafi <matt.fitzgerald@gmail.com>
Date: Thu, 14 May 2026 20:46:34 +1000
Subject: [PATCH] fix(mixed-timeseries): preserve all-NaN metric columns after
 pivot when Jinja evaluates to NULL (#40005)

Co-authored-by: Matt Fitzgerald <matt.fitzgerald@preset.io>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 superset/utils/pandas_postprocessing/pivot.py |  78 +++++-
 .../pandas_postprocessing/test_pivot.py       | 243 ++++++++++++++++++
 2 files changed, 313 insertions(+), 8 deletions(-)
diff --git a/superset/utils/pandas_postprocessing/pivot.py b/superset/utils/pandas_postprocessing/pivot.py
index aadde058b21..3f8883bfb56 100644
--- a/superset/utils/pandas_postprocessing/pivot.py
+++ b/superset/utils/pandas_postprocessing/pivot.py
@@ -27,6 +27,55 @@ from superset.utils.pandas_postprocessing.utils import (
 )
 
 
+def _restore_dropped_metric_columns(
+    df: DataFrame,
+    expected_metrics: list[str],
+    orig_columns: Optional[DataFrame],
+) -> DataFrame:
+    """Re-add metric columns that pivot_table dropped due to all-NaN values.
+
+    When drop_missing_columns=True, pandas pivot_table silently removes columns
+    whose entries are all NaN. This breaks downstream post-processing steps
+    (rename, rolling) that use validate_column_args to assert the columns exist.
+    Restoring the columns as all-NaN preserves the expected schema while still
+    allowing sparse category combinations to be dropped — only metric-level
+    absences are restored.
+
+    Note: this intentionally changes the visible output of drop_missing_columns=True
+    for all-NaN metrics: they are kept as empty series rather than dropped. This is
+    necessary for chart-rendering post-processing to maintain schema stability.
+
+    :param df: Post-pivot DataFrame (may have MultiIndex or flat columns).
+    :param expected_metrics: Metric column names that should exist at level 0.
+    :param orig_columns: Pre-pivot slice of the groupby column(s), used to
+           lazily compute (metric, *col_vals) restoration keys for only the
+           metrics that were entirely absent after pivoting. None for flat pivots.
+    """
+    if orig_columns is not None:
+        # MultiIndex case. Only compute keys for metrics that were entirely
+        # dropped — skips metrics still present, avoiding O(n_rows × n_metrics)
+        # upfront work when no all-NaN drop occurred.
+        existing_metrics = (
+            set(df.columns.get_level_values(0)) if len(df.columns) > 0 else set()
+        )
+        missing = {m for m in expected_metrics if m not in existing_metrics}
+        if missing:
+            # Dict preserves data-insertion order and deduplicates, so restored
+            # columns appear in deterministic order.
+            keys_dict: dict[tuple[Any, ...], None] = {}
+            for row in orig_columns.itertuples():
+                for metric in missing:
+                    keys_dict[(metric, *row[1:])] = None
+            for key in keys_dict:
+                df[key] = float("nan")
+    else:
+        # Flat case (no groupby columns): restore simple metric columns.
+        for metric in expected_metrics:
+            if metric not in df.columns:
+                df[metric] = float("nan")
+    return df
+
+
 @validate_column_args("index", "columns")
 def pivot(  # pylint: disable=too-many-arguments
     df: DataFrame,
@@ -50,7 +99,11 @@ def pivot(  # pylint: disable=too-many-arguments
     :param column_fill_value: Value to replace missing pivot columns with. By default
            replaces missing values with "<NULL>". Set to `None` to remove columns
            with missing values.
-    :param drop_missing_columns: Do not include columns whose entries are all missing
+    :param drop_missing_columns: Do not include columns whose entries are all missing.
+           Note: metric columns entirely absent after pivoting (the whole metric is
+           all-NaN) are restored as empty series so that downstream post-processing
+           (rename, rolling) can reference them. Sparse category combinations where
+           only some (metric, category) pairs are all-NaN may still be dropped.
     :param combine_value_with_metric: Display metrics side by side within each column,
            as opposed to each column being displayed side by side for each metric.
     :param aggregates: A mapping from aggregate column name to the aggregate
@@ -79,15 +132,20 @@ def pivot(  # pylint: disable=too-many-arguments
     #  Remove once/if support is added.
     aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}
 
-    # When dropna = False, the pivot_table function will calculate cartesian-product
-    # for MultiIndex.
+    # For drop_missing_columns=False: pre-compute all (metric, *col_vals) tuples
+    # to filter Cartesian-product columns after pivoting.
+    # For drop_missing_columns=True: save a slice of the groupby column data so
+    # that _restore_dropped_metric_columns can build keys lazily — only for metrics
+    # that were actually dropped, avoiding O(n_rows × n_metrics) upfront work in
+    # the common case where no metric is entirely all-NaN.
     # https://github.com/apache/superset/issues/15956
     # https://github.com/pandas-dev/pandas/issues/18030
-    series_set = set()
+    pivot_key_set: set[tuple[Any, ...]] = set()
     if not drop_missing_columns and columns:
         for row in df[columns].itertuples():
             for metric in aggfunc.keys():
-                series_set.add(tuple([metric]) + tuple(row[1:]))  # noqa: C409
+                pivot_key_set.add((metric, *row[1:]))
+    orig_columns_df = df[columns] if columns else None
 
     df = df.pivot_table(
         values=aggfunc.keys(),
@@ -100,10 +158,14 @@ def pivot(  # pylint: disable=too-many-arguments
         margins_name=marginal_distribution_name,
     )
 
-    if not drop_missing_columns and len(series_set) > 0 and not df.empty:
-        df = df.drop(df.columns.difference(series_set), axis=PandasAxis.COLUMN)
+    if drop_missing_columns:
+        df = _restore_dropped_metric_columns(df, list(aggfunc.keys()), orig_columns_df)
+    elif pivot_key_set and not df.empty:
+        df = df.drop(df.columns.difference(pivot_key_set), axis=PandasAxis.COLUMN)
 
     if combine_value_with_metric:
-        df = df.stack(0).unstack()
+        # dropna=False preserves restored all-NaN metric rows that would otherwise
+        # be silently dropped by stack's default dropna=True behavior.
+        df = df.stack(level=0, dropna=False).unstack()
 
     return df
diff --git a/tests/unit_tests/pandas_postprocessing/test_pivot.py b/tests/unit_tests/pandas_postprocessing/test_pivot.py
index 5b05b9a3eab..616f1c44316 100644
--- a/tests/unit_tests/pandas_postprocessing/test_pivot.py
+++ b/tests/unit_tests/pandas_postprocessing/test_pivot.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import numpy as np
+import pandas as pd
 import pytest
 from pandas import DataFrame, to_datetime
 
@@ -203,3 +204,245 @@ def test_pivot_eliminate_cartesian_product_columns():
         "metric2, 1, 1",
     ]
     assert np.isnan(df["metric, 1, 1"][0])
+
+
+def test_pivot_preserves_all_nan_metric_flat():
+    """
+    Pivot with drop_missing_columns=True must not drop metric columns whose entries
+    are all NaN. This prevents downstream post-processing (e.g. rename) from failing
+    with "Referenced columns not available in DataFrame" when a Jinja metric
+    expression evaluates to NULL for every row (SC-100398).
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"]),
+            "metric": [np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert "metric" in df.columns
+    assert df["metric"].isna().all()
+
+
+def test_pivot_preserves_all_nan_metric_with_columns():
+    """
+    Pivot with groupby columns and drop_missing_columns=True must restore the
+    exact (metric, category_val) MultiIndex keys when all values for that metric
+    are NaN. The restored keys must use the actual category values from the input
+    data so that downstream rename/rolling validation and flatten produce the
+    correct column names.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric": [np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert "metric" in df.columns.get_level_values(0)
+    # Exact keys must reflect the real category values, not placeholders.
+    assert ("metric", "A") in df.columns
+    assert ("metric", "B") in df.columns
+
+    df = flatten(df)
+    assert "metric, A" in df.columns
+    assert "metric, B" in df.columns
+    assert df["metric, A"].isna().all()
+    assert df["metric, B"].isna().all()
+
+
+def test_pivot_preserves_all_nan_metric_multi_column():
+    """
+    Pivot with multiple groupby columns and an all-NaN metric restores the full
+    multi-level (metric, col_val_1, col_val_2) key, not a truncated or placeholder
+    version. Exercises the case where columns=["country", "category"].
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(
+                ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"]
+            ),
+            "country": ["US", "US", "EU", "EU"],
+            "category": ["A", "B", "A", "B"],
+            "metric": [np.nan, np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["country", "category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert "metric" in df.columns.get_level_values(0)
+    # All four combinations must be restored with correct full tuple keys.
+    assert ("metric", "US", "A") in df.columns
+    assert ("metric", "US", "B") in df.columns
+    assert ("metric", "EU", "A") in df.columns
+    assert ("metric", "EU", "B") in df.columns
+
+    df = flatten(df)
+    assert "metric, US, A" in df.columns
+    assert "metric, EU, B" in df.columns
+    assert df["metric, US, A"].isna().all()
+
+
+def test_pivot_restored_nan_metric_column_order_is_deterministic():
+    """
+    Restored all-NaN metric columns must appear in data-insertion order, not
+    in nondeterministic hash-set iteration order. This prevents column ordering
+    from varying across Python processes (which randomize hash seeds by default).
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01", "2019-01-01"]),
+            "category": ["C", "A", "B"],
+            "metric": [np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    # Columns restored in data-insertion order: C, A, B (not alphabetical or random).
+    assert list(df.columns.get_level_values(1)) == ["C", "A", "B"]
+
+
+def test_pivot_preserves_all_nan_metric_combine_value_with_metric():
+    """
+    When combine_value_with_metric=True, a stack()/unstack() is applied after
+    column restoration. stack() drops all-NaN rows by default, which would remove
+    the restored metric before downstream post-processing can reference it.
+    Using dropna=False on stack() ensures restored all-NaN metrics survive.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric": [np.nan, np.nan],
+            "metric2": [1.0, 2.0],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric": {"operator": "mean"},
+            "metric2": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+        combine_value_with_metric=True,
+    )
+
+    # After stack()/unstack(), columns are (category_val, metric_name) tuples.
+    # The all-NaN metric must appear in level 1 alongside metric2.
+    assert isinstance(df.columns, pd.MultiIndex)
+    metric_names = df.columns.get_level_values(1).tolist()
+    assert "metric" in metric_names
+    assert "metric2" in metric_names
+
+
+def test_pivot_combine_sparse_metrics_no_spurious_extra_columns():
+    """
+    With drop_missing_columns=True and combine_value_with_metric=True, using
+    stack(dropna=False) to preserve restored all-NaN metrics must not alter output
+    shape for sparse-but-not-all-NaN metric/category pairs. stack(dropna=False) only
+    changes behaviour for rows that are entirely NaN (a restored metric); sparse rows
+    with at least one non-NaN value are unaffected — same result as dropna=True.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric1": [1.0, np.nan],  # data only for category A
+            "metric2": [np.nan, 2.0],  # data only for category B
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric1": {"operator": "mean"},
+            "metric2": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+        combine_value_with_metric=True,
+    )
+
+    # After combine, columns are (category_val, metric_name) tuples.
+    # Neither metric is entirely absent after pivoting, so _restore adds nothing.
+    # stack(dropna=False) does not change results for sparse rows with mixed NaN/data.
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert sorted(df.columns.get_level_values(0).unique()) == ["A", "B"]
+    assert sorted(df.columns.get_level_values(1).unique()) == ["metric1", "metric2"]
+    # Sparse NaN cells are present but the data cells must retain their values.
+    assert df[("A", "metric1")].iloc[0] == 1.0
+    assert df[("B", "metric2")].iloc[0] == 2.0
+
+
+def test_pivot_only_entirely_absent_metrics_are_restored():
+    """
+    Only metrics with zero surviving columns after pivoting are restored.
+    A metric with partial NaN — data for some categories but not all — must not
+    be touched: its present columns are unchanged and its absent sparse combinations
+    remain dropped. This makes the restoration invariant explicit.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric_all_nan": [np.nan, np.nan],  # entirely absent → restored
+            "metric_partial": [1.0, np.nan],  # partially present → not restored
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric_all_nan": {"operator": "mean"},
+            "metric_partial": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+    )
+
+    # metric_all_nan was entirely absent: both category columns are restored as NaN.
+    assert ("metric_all_nan", "A") in df.columns
+    assert ("metric_all_nan", "B") in df.columns
+    assert df[("metric_all_nan", "A")].isna().all()
+    assert df[("metric_all_nan", "B")].isna().all()
+
+    # metric_partial has data for A: present column is unchanged, sparse B dropped.
+    assert ("metric_partial", "A") in df.columns
+    assert ("metric_partial", "B") not in df.columns
+    assert df[("metric_partial", "A")].iloc[0] == 1.0