mirror of
https://github.com/apache/superset.git
synced 2026-05-28 11:15:24 +00:00
fix(mixed-timeseries): preserve all-NaN metric columns after pivot when Jinja evaluates to NULL (#40005)
Co-authored-by: Matt Fitzgerald <matt.fitzgerald@preset.io> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@
|
||||
# under the License.
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pandas import DataFrame, to_datetime
|
||||
|
||||
@@ -203,3 +204,245 @@ def test_pivot_eliminate_cartesian_product_columns():
|
||||
"metric2, 1, 1",
|
||||
]
|
||||
assert np.isnan(df["metric, 1, 1"][0])
|
||||
|
||||
|
||||
def test_pivot_preserves_all_nan_metric_flat():
    """
    An all-NaN metric must survive a flat pivot with drop_missing_columns=True.

    If the column were dropped, downstream post-processing (e.g. rename) would
    fail with "Referenced columns not available in DataFrame" whenever a Jinja
    metric expression evaluates to NULL for every row (SC-100398).
    """
    timestamps = to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"])
    source = DataFrame({"dttm": timestamps, "metric": [np.nan] * 3})

    pivoted = pivot(
        df=source,
        index=["dttm"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    # The metric column is still there, and it carries nothing but NaN.
    assert "metric" in pivoted.columns
    assert pivoted["metric"].isna().all()
|
||||
|
||||
|
||||
def test_pivot_preserves_all_nan_metric_with_columns():
    """
    An all-NaN metric pivoted over a groupby column keeps its real MultiIndex keys.

    With drop_missing_columns=True the exact (metric, category_val) keys must be
    restored using the category values found in the input data, so that
    downstream rename/rolling validation and flatten yield the right column
    names.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 2),
            "category": ["A", "B"],
            "metric": [np.nan] * 2,
        }
    )

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert isinstance(pivoted.columns, pd.MultiIndex)
    assert "metric" in pivoted.columns.get_level_values(0)
    # Keys must carry the actual category values rather than placeholders.
    for key in (("metric", "A"), ("metric", "B")):
        assert key in pivoted.columns

    flat = flatten(pivoted)
    flat_names = ("metric, A", "metric, B")
    for name in flat_names:
        assert name in flat.columns
    for name in flat_names:
        assert flat[name].isna().all()
|
||||
|
||||
|
||||
def test_pivot_preserves_all_nan_metric_multi_column():
    """
    An all-NaN metric pivoted over several groupby columns keeps full tuple keys.

    Covers columns=["country", "category"]: every restored key must be the
    complete (metric, col_val_1, col_val_2) tuple, never a truncated or
    placeholder version.
    """
    pairs = [("US", "A"), ("US", "B"), ("EU", "A"), ("EU", "B")]
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 4),
            "country": [country for country, _ in pairs],
            "category": [category for _, category in pairs],
            "metric": [np.nan] * 4,
        }
    )

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["country", "category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert isinstance(pivoted.columns, pd.MultiIndex)
    assert "metric" in pivoted.columns.get_level_values(0)
    # Each of the four combinations has to come back under its full key.
    for country, category in pairs:
        assert ("metric", country, category) in pivoted.columns

    flat = flatten(pivoted)
    assert "metric, US, A" in flat.columns
    assert "metric, EU, B" in flat.columns
    assert flat["metric, US, A"].isna().all()
|
||||
|
||||
|
||||
def test_pivot_restored_nan_metric_column_order_is_deterministic():
    """
    Restored all-NaN metric columns follow the order the data was inserted in.

    The order must not depend on hash-set iteration, which would let column
    ordering vary across Python processes (hash seeds are randomized by
    default).
    """
    insertion_order = ["C", "A", "B"]
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 3),
            "category": list(insertion_order),
            "metric": [np.nan] * 3,
        }
    )

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    # Expect C, A, B as inserted - neither alphabetical nor random.
    restored_order = list(pivoted.columns.get_level_values(1))
    assert restored_order == insertion_order
|
||||
|
||||
|
||||
def test_pivot_preserves_all_nan_metric_combine_value_with_metric():
    """
    A restored all-NaN metric must survive combine_value_with_metric=True.

    That option applies a stack()/unstack() after column restoration. stack()
    drops all-NaN rows by default, which would discard the restored metric
    before downstream post-processing can reference it; using dropna=False on
    stack() keeps it.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 2),
            "category": ["A", "B"],
            "metric": [np.nan] * 2,
            "metric2": [1.0, 2.0],
        }
    )
    mean_aggregates = {
        name: {"operator": "mean"} for name in ("metric", "metric2")
    }

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_aggregates,
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # Once stacked/unstacked the columns are (category_val, metric_name)
    # tuples, so the all-NaN metric should sit in level 1 next to metric2.
    assert isinstance(pivoted.columns, pd.MultiIndex)
    level_one_names = pivoted.columns.get_level_values(1).tolist()
    assert "metric" in level_one_names
    assert "metric2" in level_one_names
|
||||
|
||||
|
||||
def test_pivot_combine_sparse_metrics_no_spurious_extra_columns():
    """
    Sparse (but not all-NaN) metrics keep the same output shape when combined.

    With drop_missing_columns=True and combine_value_with_metric=True,
    stack(dropna=False) is used to preserve restored all-NaN metrics. It only
    changes behaviour for rows that are entirely NaN (a restored metric); sparse
    rows holding at least one non-NaN value are unaffected, giving the same
    result as dropna=True.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 2),
            "category": ["A", "B"],
            # metric1 only has data for category A ...
            "metric1": [1.0, np.nan],
            # ... and metric2 only for category B.
            "metric2": [np.nan, 2.0],
        }
    )
    mean_aggregates = {
        name: {"operator": "mean"} for name in ("metric1", "metric2")
    }

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_aggregates,
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # Combined columns are (category_val, metric_name) tuples. Neither metric
    # is entirely absent after pivoting, so nothing is restored, and
    # stack(dropna=False) leaves sparse rows with mixed NaN/data as they were.
    assert isinstance(pivoted.columns, pd.MultiIndex)
    category_values = pivoted.columns.get_level_values(0).unique()
    metric_values = pivoted.columns.get_level_values(1).unique()
    assert sorted(category_values) == ["A", "B"]
    assert sorted(metric_values) == ["metric1", "metric2"]
    # NaN cells may exist, but the populated cells keep their values.
    assert pivoted[("A", "metric1")].iloc[0] == 1.0
    assert pivoted[("B", "metric2")].iloc[0] == 2.0
|
||||
|
||||
|
||||
def test_pivot_only_entirely_absent_metrics_are_restored():
    """
    Restoration applies solely to metrics with zero surviving columns.

    A partially-NaN metric - data for some categories but not all - must be left
    alone: the columns it still has are unchanged and its absent sparse
    combinations stay dropped. This spells out the restoration invariant.
    """
    source = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 2),
            "category": ["A", "B"],
            # Entirely absent after pivoting, so it gets restored.
            "metric_all_nan": [np.nan] * 2,
            # Partially present, so it is not restored.
            "metric_partial": [1.0, np.nan],
        }
    )
    mean_aggregates = {
        name: {"operator": "mean"}
        for name in ("metric_all_nan", "metric_partial")
    }

    pivoted = pivot(
        df=source,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_aggregates,
        drop_missing_columns=True,
    )

    # The entirely-absent metric regains both category columns, filled with NaN.
    restored_keys = (("metric_all_nan", "A"), ("metric_all_nan", "B"))
    for key in restored_keys:
        assert key in pivoted.columns
    for key in restored_keys:
        assert pivoted[key].isna().all()

    # The partial metric keeps its A column untouched; sparse B remains dropped.
    assert ("metric_partial", "A") in pivoted.columns
    assert ("metric_partial", "B") not in pivoted.columns
    assert pivoted[("metric_partial", "A")].iloc[0] == 1.0
|
||||
|
||||
Reference in New Issue
Block a user