fix(mixed-timeseries): preserve all-NaN metric columns after pivot when Jinja evaluates to NULL (#40005)

Co-authored-by: Matt Fitzgerald <matt.fitzgerald@preset.io>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 11:15:24 +00:00 · 2026-05-14 20:46:34 +10:00
parent d1e9a5df06
commit 01224007da
2 changed files with 313 additions and 8 deletions
--- a/tests/unit_tests/pandas_postprocessing/test_pivot.py
+++ b/tests/unit_tests/pandas_postprocessing/test_pivot.py
@@ -16,6 +16,7 @@
 # under the License.

 import numpy as np
+import pandas as pd
 import pytest
 from pandas import DataFrame, to_datetime

@@ -203,3 +204,245 @@ def test_pivot_eliminate_cartesian_product_columns():
        "metric2, 1, 1",
    ]
    assert np.isnan(df["metric, 1, 1"][0])
+
+
+def test_pivot_preserves_all_nan_metric_flat():
+    """
+    Pivot with drop_missing_columns=True must not drop metric columns whose entries
+    are all NaN. This prevents downstream post-processing (e.g. rename) from failing
+    with "Referenced columns not available in DataFrame" when a Jinja metric
+    expression evaluates to NULL for every row (SC-100398).
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"]),
+            "metric": [np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert "metric" in df.columns
+    assert df["metric"].isna().all()
+
+
+def test_pivot_preserves_all_nan_metric_with_columns():
+    """
+    Pivot with groupby columns and drop_missing_columns=True must restore the
+    exact (metric, category_val) MultiIndex keys when all values for that metric
+    are NaN. The restored keys must use the actual category values from the input
+    data so that downstream rename/rolling validation and flatten produce the
+    correct column names.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric": [np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert "metric" in df.columns.get_level_values(0)
+    # Exact keys must reflect the real category values, not placeholders.
+    assert ("metric", "A") in df.columns
+    assert ("metric", "B") in df.columns
+
+    df = flatten(df)
+    assert "metric, A" in df.columns
+    assert "metric, B" in df.columns
+    assert df["metric, A"].isna().all()
+    assert df["metric, B"].isna().all()
+
+
+def test_pivot_preserves_all_nan_metric_multi_column():
+    """
+    Pivot with multiple groupby columns and an all-NaN metric restores the full
+    multi-level (metric, col_val_1, col_val_2) key, not a truncated or placeholder
+    version. Exercises the case where columns=["country", "category"].
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(
+                ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"]
+            ),
+            "country": ["US", "US", "EU", "EU"],
+            "category": ["A", "B", "A", "B"],
+            "metric": [np.nan, np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["country", "category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert "metric" in df.columns.get_level_values(0)
+    # All four combinations must be restored with correct full tuple keys.
+    assert ("metric", "US", "A") in df.columns
+    assert ("metric", "US", "B") in df.columns
+    assert ("metric", "EU", "A") in df.columns
+    assert ("metric", "EU", "B") in df.columns
+
+    df = flatten(df)
+    assert "metric, US, A" in df.columns
+    assert "metric, EU, B" in df.columns
+    assert df["metric, US, A"].isna().all()
+
+
+def test_pivot_restored_nan_metric_column_order_is_deterministic():
+    """
+    Restored all-NaN metric columns must appear in data-insertion order, not
+    in nondeterministic hash-set iteration order. This prevents column ordering
+    from varying across Python processes (which randomize hash seeds by default).
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01", "2019-01-01"]),
+            "category": ["C", "A", "B"],
+            "metric": [np.nan, np.nan, np.nan],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={"metric": {"operator": "mean"}},
+        drop_missing_columns=True,
+    )
+
+    # Columns restored in data-insertion order: C, A, B (not alphabetical or random).
+    assert list(df.columns.get_level_values(1)) == ["C", "A", "B"]
+
+
+def test_pivot_preserves_all_nan_metric_combine_value_with_metric():
+    """
+    When combine_value_with_metric=True, a stack()/unstack() is applied after
+    column restoration. stack() drops all-NaN rows by default, which would remove
+    the restored metric before downstream post-processing can reference it.
+    Using dropna=False on stack() ensures restored all-NaN metrics survive.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric": [np.nan, np.nan],
+            "metric2": [1.0, 2.0],
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric": {"operator": "mean"},
+            "metric2": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+        combine_value_with_metric=True,
+    )
+
+    # After stack()/unstack(), columns are (category_val, metric_name) tuples.
+    # The all-NaN metric must appear in level 1 alongside metric2.
+    assert isinstance(df.columns, pd.MultiIndex)
+    metric_names = df.columns.get_level_values(1).tolist()
+    assert "metric" in metric_names
+    assert "metric2" in metric_names
+
+
+def test_pivot_combine_sparse_metrics_no_spurious_extra_columns():
+    """
+    With drop_missing_columns=True and combine_value_with_metric=True, using
+    stack(dropna=False) to preserve restored all-NaN metrics must not alter output
+    shape for sparse-but-not-all-NaN metric/category pairs. stack(dropna=False) only
+    changes behaviour for rows that are entirely NaN (a restored metric); sparse rows
+    with at least one non-NaN value are unaffected — same result as dropna=True.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric1": [1.0, np.nan],  # data only for category A
+            "metric2": [np.nan, 2.0],  # data only for category B
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric1": {"operator": "mean"},
+            "metric2": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+        combine_value_with_metric=True,
+    )
+
+    # After combine, columns are (category_val, metric_name) tuples.
+    # Neither metric is entirely absent after pivoting, so nothing is restored.
+    # stack(dropna=False) does not change results for sparse rows with mixed NaN/data.
+    assert isinstance(df.columns, pd.MultiIndex)
+    assert sorted(df.columns.get_level_values(0).unique()) == ["A", "B"]
+    assert sorted(df.columns.get_level_values(1).unique()) == ["metric1", "metric2"]
+    # Sparse NaN cells are present but the data cells must retain their values.
+    assert df[("A", "metric1")].iloc[0] == 1.0
+    assert df[("B", "metric2")].iloc[0] == 2.0
+
+
+def test_pivot_only_entirely_absent_metrics_are_restored():
+    """
+    Only metrics with zero surviving columns after pivoting are restored.
+    A metric with partial NaN — data for some categories but not all — must not
+    be touched: its present columns are unchanged and its absent sparse combinations
+    remain dropped. This makes the restoration invariant explicit.
+    """
+    mock_df = DataFrame(
+        {
+            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
+            "category": ["A", "B"],
+            "metric_all_nan": [np.nan, np.nan],  # entirely absent → restored
+            "metric_partial": [1.0, np.nan],  # partially present → not restored
+        }
+    )
+
+    df = pivot(
+        df=mock_df,
+        index=["dttm"],
+        columns=["category"],
+        aggregates={
+            "metric_all_nan": {"operator": "mean"},
+            "metric_partial": {"operator": "mean"},
+        },
+        drop_missing_columns=True,
+    )
+
+    # metric_all_nan was entirely absent: both category columns are restored as NaN.
+    assert ("metric_all_nan", "A") in df.columns
+    assert ("metric_all_nan", "B") in df.columns
+    assert df[("metric_all_nan", "A")].isna().all()
+    assert df[("metric_all_nan", "B")].isna().all()
+
+    # metric_partial has data for A: present column is unchanged, sparse B dropped.
+    assert ("metric_partial", "A") in df.columns
+    assert ("metric_partial", "B") not in df.columns
+    assert df[("metric_partial", "A")].iloc[0] == 1.0