mirror of
https://github.com/apache/superset.git
synced 2026-05-22 00:05:15 +00:00
Co-authored-by: Matt Fitzgerald <matt.fitzgerald@preset.io> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
449 lines
14 KiB
Python
449 lines
14 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
from pandas import DataFrame, to_datetime
|
|
|
|
from superset.exceptions import InvalidPostProcessingError
|
|
from superset.utils.pandas_postprocessing import flatten, pivot
|
|
from tests.unit_tests.fixtures.dataframes import categories_df
|
|
from tests.unit_tests.pandas_postprocessing.utils import AGGREGATES_SINGLE
|
|
|
|
|
|
def test_pivot_without_columns():
    """
    Pivoting on an index only (no ``columns``) must yield the single metric column
    """
    pivot_kwargs = {
        "index": ["name"],
        "aggregates": AGGREGATES_SINGLE,
    }
    pivoted = pivot(df=categories_df, **pivot_kwargs)

    column_names = pivoted.columns.tolist()
    assert column_names == ["idx_nulls"]
    assert len(pivoted) == 101

    metric_total = pivoted["idx_nulls"].sum()
    assert metric_total == 1050
|
|
|
|
|
|
def test_pivot_with_single_column():
    """
    Pivoting on one column must produce one (metric, category) column per
    category, regardless of which column is used as the index
    """
    # Both pivots below are expected to expose exactly these column keys.
    expected_columns = [
        ("idx_nulls", "cat0"),
        ("idx_nulls", "cat1"),
        ("idx_nulls", "cat2"),
    ]

    # Indexed by name.
    by_name = pivot(
        df=categories_df,
        index=["name"],
        columns=["category"],
        aggregates=AGGREGATES_SINGLE,
    )
    assert by_name.columns.tolist() == expected_columns
    assert len(by_name) == 101
    cat0_total = by_name["idx_nulls"]["cat0"].sum()
    assert cat0_total == 315

    # Indexed by dept.
    by_dept = pivot(
        df=categories_df,
        index=["dept"],
        columns=["category"],
        aggregates=AGGREGATES_SINGLE,
    )
    assert by_dept.columns.tolist() == expected_columns
    assert len(by_dept) == 5
|
|
|
|
|
|
def test_pivot_with_multiple_columns():
    """
    Pivoting on two columns and flattening must yield the index column plus one
    column per category/dept permutation
    """
    pivoted = pivot(
        df=categories_df,
        index=["name"],
        columns=["category", "dept"],
        aggregates=AGGREGATES_SINGLE,
    )
    flattened = flatten(pivoted)

    # index + possible permutations
    expected_width = 1 + 3 * 5
    assert len(flattened.columns) == expected_width
|
|
|
|
|
|
def test_pivot_fill_values():
    """
    Pivoting with ``metric_fill_value`` set must be reflected in the summed metric
    """
    sum_of_idx_nulls = {"idx_nulls": {"operator": "sum"}}
    pivoted = pivot(
        df=categories_df,
        index=["name"],
        columns=["category"],
        metric_fill_value=1,
        aggregates=sum_of_idx_nulls,
    )

    cat0_total = pivoted["idx_nulls"]["cat0"].sum()
    assert cat0_total == 382
|
|
|
|
|
|
def test_pivot_fill_column_values():
    """
    Pivoting on a column whose values are all null must label it "<NULL>"
    """
    # Work on a copy so the shared fixture is not modified.
    null_category_df = categories_df.copy()
    null_category_df["category"] = None

    sum_of_idx_nulls = {"idx_nulls": {"operator": "sum"}}
    pivoted = pivot(
        df=null_category_df,
        index=["name"],
        columns=["category"],
        aggregates=sum_of_idx_nulls,
    )

    assert len(pivoted) == 101
    column_keys = pivoted.columns.tolist()
    assert column_keys == [("idx_nulls", "<NULL>")]
|
|
|
|
|
|
def test_pivot_exceptions():
    """
    Make sure pivot raises the expected exception for each kind of bad input
    """
    # Missing index
    with pytest.raises(TypeError):
        pivot(df=categories_df, columns=["dept"], aggregates=AGGREGATES_SINGLE)

    # Each entry below must be rejected with InvalidPostProcessingError.
    invalid_calls = [
        # invalid index reference
        {
            "index": ["abc"],
            "columns": ["dept"],
            "aggregates": AGGREGATES_SINGLE,
        },
        # invalid column reference
        {
            "index": ["dept"],
            "columns": ["abc"],
            "aggregates": AGGREGATES_SINGLE,
        },
        # invalid aggregate options
        {
            "index": ["name"],
            "columns": ["category"],
            "aggregates": {"idx_nulls": {}},
        },
    ]
    for bad_kwargs in invalid_calls:
        with pytest.raises(InvalidPostProcessingError):
            pivot(df=categories_df, **bad_kwargs)
|
|
|
|
|
|
def test_pivot_eliminate_cartesian_product_columns():
    """
    Pivot with drop_missing_columns=False must emit only the (a, b) combinations
    that occur in the input rather than the full a x b cartesian product, and must
    keep a combination whose metric value is NaN.
    """
    # single metric
    # NOTE: use np.nan; the np.NAN alias was removed in NumPy 2.0.
    mock_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "a": [0, 1],
            "b": [0, 1],
            "metric": [9, np.nan],
        }
    )

    df = pivot(
        df=mock_df,
        index=["dttm"],
        columns=["a", "b"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=False,
    )
    df = flatten(df)
    # Only the (0, 0) and (1, 1) pairs exist in the input; no (0, 1) / (1, 0).
    assert list(df.columns) == ["dttm", "metric, 0, 0", "metric, 1, 1"]
    # Positional access, matching the other tests in this module.
    assert np.isnan(df["metric, 1, 1"].iloc[0])

    # multiple metrics
    mock_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "a": [0, 1],
            "b": [0, 1],
            "metric": [9, np.nan],
            "metric2": [10, 11],
        }
    )

    df = pivot(
        df=mock_df,
        index=["dttm"],
        columns=["a", "b"],
        aggregates={
            "metric": {"operator": "mean"},
            "metric2": {"operator": "mean"},
        },
        drop_missing_columns=False,
    )
    df = flatten(df)
    assert list(df.columns) == [
        "dttm",
        "metric, 0, 0",
        "metric, 1, 1",
        "metric2, 0, 0",
        "metric2, 1, 1",
    ]
    assert np.isnan(df["metric, 1, 1"].iloc[0])
|
|
|
|
|
|
def test_pivot_preserves_all_nan_metric_flat():
    """
    A metric column whose entries are all NaN must survive a pivot run with
    drop_missing_columns=True. Otherwise downstream post-processing (e.g. rename)
    fails with "Referenced columns not available in DataFrame" when a Jinja metric
    expression evaluates to NULL for every row (SC-100398).
    """
    timestamps = to_datetime(["2019-01-01", "2019-01-02", "2019-01-03"])
    all_nan_df = DataFrame(
        {
            "dttm": timestamps,
            "metric": [np.nan] * 3,
        }
    )

    pivoted = pivot(
        df=all_nan_df,
        index=["dttm"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert "metric" in pivoted.columns
    metric_values = pivoted["metric"]
    assert metric_values.isna().all()
|
|
|
|
|
|
def test_pivot_preserves_all_nan_metric_with_columns():
    """
    With groupby columns and drop_missing_columns=True, an all-NaN metric must be
    restored under its exact (metric, category_val) MultiIndex keys. Those keys
    must carry the real category values from the input so that downstream
    rename/rolling validation and flatten produce the correct column names.
    """
    all_nan_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric": [np.nan] * 2,
        }
    )

    pivoted = pivot(
        df=all_nan_df,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert isinstance(pivoted.columns, pd.MultiIndex)
    assert "metric" in pivoted.columns.get_level_values(0)
    # Exact keys must reflect the real category values, not placeholders.
    for column_key in (("metric", "A"), ("metric", "B")):
        assert column_key in pivoted.columns

    flattened = flatten(pivoted)
    flat_names = ("metric, A", "metric, B")
    for flat_name in flat_names:
        assert flat_name in flattened.columns
    for flat_name in flat_names:
        assert flattened[flat_name].isna().all()
|
|
|
|
|
|
def test_pivot_preserves_all_nan_metric_multi_column():
    """
    With several groupby columns, an all-NaN metric must be restored under the
    full multi-level (metric, col_val_1, col_val_2) key rather than a truncated
    or placeholder one. Covers columns=["country", "category"].
    """
    all_nan_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 4),
            "country": ["US", "US", "EU", "EU"],
            "category": ["A", "B", "A", "B"],
            "metric": [np.nan] * 4,
        }
    )

    pivoted = pivot(
        df=all_nan_df,
        index=["dttm"],
        columns=["country", "category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    assert isinstance(pivoted.columns, pd.MultiIndex)
    assert "metric" in pivoted.columns.get_level_values(0)
    # All four combinations must be restored with correct full tuple keys.
    expected_keys = [
        ("metric", "US", "A"),
        ("metric", "US", "B"),
        ("metric", "EU", "A"),
        ("metric", "EU", "B"),
    ]
    for column_key in expected_keys:
        assert column_key in pivoted.columns

    flattened = flatten(pivoted)
    for flat_name in ("metric, US, A", "metric, EU, B"):
        assert flat_name in flattened.columns
    assert flattened["metric, US, A"].isna().all()
|
|
|
|
|
|
def test_pivot_restored_nan_metric_column_order_is_deterministic():
    """
    Restored all-NaN metric columns must follow data-insertion order instead of
    nondeterministic hash-set iteration order, so that column ordering does not
    vary across Python processes (which randomize hash seeds by default).
    """
    insertion_order = ["C", "A", "B"]
    all_nan_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01"] * 3),
            "category": ["C", "A", "B"],
            "metric": [np.nan] * 3,
        }
    )

    pivoted = pivot(
        df=all_nan_df,
        index=["dttm"],
        columns=["category"],
        aggregates={"metric": {"operator": "mean"}},
        drop_missing_columns=True,
    )

    # Columns restored in data-insertion order: C, A, B (not alphabetical or random).
    restored_categories = list(pivoted.columns.get_level_values(1))
    assert restored_categories == insertion_order
|
|
|
|
|
|
def test_pivot_preserves_all_nan_metric_combine_value_with_metric():
    """
    With combine_value_with_metric=True a stack()/unstack() round trip runs after
    column restoration. stack() drops all-NaN rows by default, which would discard
    the restored metric before downstream post-processing can reference it;
    stack(dropna=False) lets restored all-NaN metrics survive.
    """
    mixed_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric": [np.nan, np.nan],
            "metric2": [1.0, 2.0],
        }
    )
    mean_of_both = {
        "metric": {"operator": "mean"},
        "metric2": {"operator": "mean"},
    }

    pivoted = pivot(
        df=mixed_df,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_of_both,
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # After stack()/unstack(), columns are (category_val, metric_name) tuples.
    # The all-NaN metric must appear in level 1 alongside metric2.
    assert isinstance(pivoted.columns, pd.MultiIndex)
    level_one_names = pivoted.columns.get_level_values(1).tolist()
    for expected_metric in ("metric", "metric2"):
        assert expected_metric in level_one_names
|
|
|
|
|
|
def test_pivot_combine_sparse_metrics_no_spurious_extra_columns():
    """
    With drop_missing_columns=True and combine_value_with_metric=True, the
    stack(dropna=False) used to preserve restored all-NaN metrics must not change
    the output shape for sparse-but-not-all-NaN metric/category pairs.
    stack(dropna=False) only behaves differently for rows that are entirely NaN
    (a restored metric); sparse rows holding at least one non-NaN value come out
    the same as with dropna=True.
    """
    sparse_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric1": [1.0, np.nan],  # data only for category A
            "metric2": [np.nan, 2.0],  # data only for category B
        }
    )
    mean_of_both = {
        "metric1": {"operator": "mean"},
        "metric2": {"operator": "mean"},
    }

    pivoted = pivot(
        df=sparse_df,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_of_both,
        drop_missing_columns=True,
        combine_value_with_metric=True,
    )

    # After combine, columns are (category_val, metric_name) tuples.
    # Neither metric is entirely absent after pivoting, so _restore adds nothing.
    # stack(dropna=False) does not change results for sparse rows with mixed NaN/data.
    assert isinstance(pivoted.columns, pd.MultiIndex)
    category_level = pivoted.columns.get_level_values(0).unique()
    assert sorted(category_level) == ["A", "B"]
    metric_level = pivoted.columns.get_level_values(1).unique()
    assert sorted(metric_level) == ["metric1", "metric2"]

    # Sparse NaN cells are present but the data cells must retain their values.
    assert pivoted[("A", "metric1")].iloc[0] == 1.0
    assert pivoted[("B", "metric2")].iloc[0] == 2.0
|
|
|
|
|
|
def test_pivot_only_entirely_absent_metrics_are_restored():
    """
    Restoration applies only to metrics left with zero columns after pivoting.
    A partially-NaN metric (data for some categories, none for others) must be
    left alone: the columns it has stay unchanged and its missing sparse
    combinations stay dropped. This pins the restoration invariant explicitly.
    """
    mixed_df = DataFrame(
        {
            "dttm": to_datetime(["2019-01-01", "2019-01-01"]),
            "category": ["A", "B"],
            "metric_all_nan": [np.nan, np.nan],  # entirely absent → restored
            "metric_partial": [1.0, np.nan],  # partially present → not restored
        }
    )
    mean_of_both = {
        "metric_all_nan": {"operator": "mean"},
        "metric_partial": {"operator": "mean"},
    }

    pivoted = pivot(
        df=mixed_df,
        index=["dttm"],
        columns=["category"],
        aggregates=mean_of_both,
        drop_missing_columns=True,
    )

    # metric_all_nan was entirely absent: both category columns are restored as NaN.
    restored_keys = (("metric_all_nan", "A"), ("metric_all_nan", "B"))
    for column_key in restored_keys:
        assert column_key in pivoted.columns
    for column_key in restored_keys:
        assert pivoted[column_key].isna().all()

    # metric_partial has data for A: present column is unchanged, sparse B dropped.
    present_key = ("metric_partial", "A")
    dropped_key = ("metric_partial", "B")
    assert present_key in pivoted.columns
    assert dropped_key not in pivoted.columns
    assert pivoted[present_key].iloc[0] == 1.0
|