fix(mixed-timeseries): preserve all-NaN metric columns after pivot when Jinja evaluates to NULL (#40005)

Co-authored-by: Matt Fitzgerald <matt.fitzgerald@preset.io>
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mafi
2026-05-14 20:46:34 +10:00
committed by GitHub
parent d1e9a5df06
commit 01224007da
2 changed files with 313 additions and 8 deletions

View File

@@ -16,6 +16,7 @@
# under the License.
import numpy as np
import pandas as pd
import pytest
from pandas import DataFrame, to_datetime
@@ -203,3 +204,245 @@ def test_pivot_eliminate_cartesian_product_columns():
"metric2, 1, 1",
]
assert np.isnan(df["metric, 1, 1"][0])
def test_pivot_preserves_all_nan_metric_flat():
    """
    An all-NaN metric must survive a pivot that has no groupby columns.

    With drop_missing_columns=True the metric column has to remain in the
    result even though every one of its entries is NaN. Otherwise later
    post-processing steps (e.g. rename) fail with "Referenced columns not
    available in DataFrame" whenever a Jinja metric expression evaluates to
    NULL for every row (SC-100398).
    """
    timestamps = to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"])
    source = DataFrame({"dttm": timestamps, "metric": [np.nan] * 3})

    result = pivot(
        df=source,
        index=["dttm"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    # The column is kept, and it still carries nothing but NaN.
    assert "metric" in result.columns
    assert result["metric"].isna().all()
def test_pivot_preserves_all_nan_metric_with_columns():
    """
    An all-NaN metric pivoted over one groupby column keeps its real keys.

    With drop_missing_columns=True the restored MultiIndex entries must be the
    exact (metric, category_val) pairs taken from the input data, so that
    downstream rename/rolling validation and flatten end up with the correct
    column names.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric": [np.nan, np.nan],
        }
    )

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert isinstance(pivoted.columns, pd.MultiIndex)
    assert "metric" in pivoted.columns.get_level_values(0)
    # Keys must carry the actual category values rather than placeholders.
    for category in ("A", "B"):
        assert ("metric", category) in pivoted.columns

    flattened = flatten(pivoted)
    flat_names = ("metric, A", "metric, B")
    for flat_name in flat_names:
        assert flat_name in flattened.columns
    for flat_name in flat_names:
        assert flattened[flat_name].isna().all()
def test_pivot_preserves_all_nan_metric_multi_column():
    """
    Pivot with multiple groupby columns and an all-NaN metric restores the full
    multi-level (metric, col_val_1, col_val_2) key, not a truncated or placeholder
    version. Exercises the case where columns=["country", "category"].

    Every one of the four (country, category) combinations is verified both as
    a MultiIndex key after pivot and as a flattened, all-NaN column after
    flatten, so a partially restored result cannot slip through.
    """
    mock_df = DataFrame(
        {
            "dttm": to_datetime(
                ["2019-01-01", "2019-01-01", "2019-01-01", "2019-01-01"]
            ),
            "country": ["US", "US", "EU", "EU"],
            "category": ["A", "B", "A", "B"],
            "metric": [np.nan, np.nan, np.nan, np.nan],
        }
    )
    df = pivot(
        df=mock_df,
        index=["dttm"],
        columns=["country", "category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )
    assert isinstance(df.columns, pd.MultiIndex)
    assert "metric" in df.columns.get_level_values(0)

    # All four combinations must be restored with correct full tuple keys.
    expected_keys = [
        ("metric", "US", "A"),
        ("metric", "US", "B"),
        ("metric", "EU", "A"),
        ("metric", "EU", "B"),
    ]
    for key in expected_keys:
        assert key in df.columns

    df = flatten(df)
    # Check each restored combination after flattening, not just a sample:
    # the flattened name is the tuple joined with ", " and must be all NaN.
    for key in expected_keys:
        flat_name = ", ".join(key)
        assert flat_name in df.columns
        assert df[flat_name].isna().all()
def test_pivot_restored_nan_metric_column_order_is_deterministic():
    """
    Restored all-NaN metric columns follow data-insertion order.

    The order must not come from hash-set iteration, which is nondeterministic
    across Python processes because hash seeds are randomized by default.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01", "2019-01-01"]),
            # Deliberately unsorted so insertion order differs from alphabetical.
            "category": ["C", "A", "B"],
            "metric": [np.nan, np.nan, np.nan],
        }
    )

    result = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    # Expect C, A, B exactly as they appear in the input, neither sorted nor random.
    restored_categories = list(result.columns.get_level_values(1))
    assert restored_categories == ["C", "A", "B"]
def test_pivot_preserves_all_nan_metric_combine_value_with_metric():
    """
    A restored all-NaN metric must survive combine_value_with_metric=True.

    That option applies a stack()/unstack() after column restoration. stack()
    drops all-NaN rows by default, which would discard the restored metric
    before downstream post-processing can reference it; passing dropna=False to
    stack() is what keeps the restored metric in place.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric": [np.nan, np.nan],
            "metric2": [1.0, 2.0],
        }
    )
    mean_of = {"operator": "mean"}

    result = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": mean_of, "metric2": mean_of},
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # Columns are now (category_val, metric_name) tuples, so the metric names
    # live on level 1; the all-NaN metric must sit there next to metric2.
    assert isinstance(result.columns, pd.MultiIndex)
    level_one_names = result.columns.get_level_values(1).tolist()
    for expected_metric in ("metric", "metric2"):
        assert expected_metric in level_one_names
def test_pivot_combine_sparse_metrics_no_spurious_extra_columns():
    """
    Sparse (but not all-NaN) metrics keep their usual shape when combined.

    With drop_missing_columns=True and combine_value_with_metric=True, the
    stack(dropna=False) used to preserve restored all-NaN metrics must leave
    sparse metric/category pairs alone: it only changes behaviour for rows that
    are entirely NaN (a restored metric), while rows holding at least one
    non-NaN value come out the same as with dropna=True.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric1": [1.0, np.nan],  # populated for category A only
            "metric2": [np.nan, 2.0],  # populated for category B only
        }
    )
    mean_of = {"operator": "mean"}

    result = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric1": mean_of, "metric2": mean_of},
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # Columns are (category_val, metric_name) tuples after combining. Neither
    # metric vanished entirely during the pivot, so nothing is restored here.
    assert isinstance(result.columns, pd.MultiIndex)
    category_level = result.columns.get_level_values(0).unique()
    metric_level = result.columns.get_level_values(1).unique()
    assert sorted(category_level) == ["A", "B"]
    assert sorted(metric_level) == ["metric1", "metric2"]

    # Sparse NaN cells may exist, but the populated cells keep their values.
    assert result[("A", "metric1")].iloc[0] == 1.0
    assert result[("B", "metric2")].iloc[0] == 2.0
def test_pivot_only_entirely_absent_metrics_are_restored():
    """
    Restoration applies solely to metrics with no surviving column at all.

    A partially NaN metric (data for some categories, none for others) is left
    untouched: the columns it still has are unchanged and its missing sparse
    combinations stay dropped. This test pins that invariant explicitly.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric_all_nan": [np.nan, np.nan],  # nothing survives -> restored
            "metric_partial": [1.0, np.nan],  # A survives -> left alone
        }
    )
    mean_of = {"operator": "mean"}

    result = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric_all_nan": mean_of, "metric_partial": mean_of},
        drop_missing_columns=True,
    )

    # The entirely absent metric gets both category columns back, filled with NaN.
    restored_keys = (("metric_all_nan", "A"), ("metric_all_nan", "B"))
    for key in restored_keys:
        assert key in result.columns
    for key in restored_keys:
        assert result[key].isna().all()

    # The partial metric keeps its A column as-is; the sparse B column stays dropped.
    assert ("metric_partial", "A") in result.columns
    assert ("metric_partial", "B") not in result.columns
    assert result[("metric_partial", "A")].iloc[0] == 1.0