feat(advanced analysis): support MultiIndex column in post processing stage (#19116)

2026-04-18 23:55:00 +00:00 · 2022-03-23 13:46:28 +08:00
parent 6083545e86
commit 375c03e084
55 changed files with 1267 additions and 772 deletions
--- a/tests/unit_tests/pandas_postprocessing/test_compare.py
+++ b/tests/unit_tests/pandas_postprocessing/test_compare.py
@@ -14,49 +14,220 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import pandas as pd

-from superset.utils.pandas_postprocessing import compare
-from tests.unit_tests.fixtures.dataframes import timeseries_df2
-from tests.unit_tests.pandas_postprocessing.utils import series_to_list
+from superset.constants import PandasPostprocessingCompare as PPC
+from superset.utils import pandas_postprocessing as pp
+from superset.utils.pandas_postprocessing.utils import FLAT_COLUMN_SEPARATOR
+from tests.unit_tests.fixtures.dataframes import multiple_metrics_df, timeseries_df2


-def test_compare():
+def test_compare_should_not_side_effect():
+    _timeseries_df2 = timeseries_df2.copy()
+    pp.compare(
+        df=_timeseries_df2,
+        source_columns=["y"],
+        compare_columns=["z"],
+        compare_type=PPC.DIFF,
+    )
+    assert _timeseries_df2.equals(timeseries_df2)
+
+
+def test_compare_diff():
    # `difference` comparison
-    post_df = compare(
+    post_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
-        compare_type="difference",
+        compare_type=PPC.DIFF,
+    )
+    """
+               label    y     z  difference__y__z
+    2019-01-01     x  2.0   2.0               0.0
+    2019-01-02     y  2.0   4.0               2.0
+    2019-01-05     z  2.0  10.0               8.0
+    2019-01-07     q  2.0   8.0               6.0
+    """
+    assert post_df.equals(
+        pd.DataFrame(
+            index=timeseries_df2.index,
+            data={
+                "label": ["x", "y", "z", "q"],
+                "y": [2.0, 2.0, 2.0, 2.0],
+                "z": [2.0, 4.0, 10.0, 8.0],
+                "difference__y__z": [0.0, 2.0, 8.0, 6.0],
+            },
+        )
    )
-    assert post_df.columns.tolist() == ["label", "y", "z", "difference__y__z"]
-    assert series_to_list(post_df["difference__y__z"]) == [0.0, -2.0, -8.0, -6.0]

    # drop original columns
-    post_df = compare(
+    post_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
-        compare_type="difference",
+        compare_type=PPC.DIFF,
        drop_original_columns=True,
    )
-    assert post_df.columns.tolist() == ["label", "difference__y__z"]
+    assert post_df.equals(
+        pd.DataFrame(
+            index=timeseries_df2.index,
+            data={
+                "label": ["x", "y", "z", "q"],
+                "difference__y__z": [0.0, 2.0, 8.0, 6.0],
+            },
+        )
+    )

+
+def test_compare_percentage():
    # `percentage` comparison
-    post_df = compare(
+    post_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
-        compare_type="percentage",
+        compare_type=PPC.PCT,
+    )
+    """
+               label    y     z  percentage__y__z
+    2019-01-01     x  2.0   2.0               0.0
+    2019-01-02     y  2.0   4.0               1.0
+    2019-01-05     z  2.0  10.0               4.0
+    2019-01-07     q  2.0   8.0               3.0
+    """
+    assert post_df.equals(
+        pd.DataFrame(
+            index=timeseries_df2.index,
+            data={
+                "label": ["x", "y", "z", "q"],
+                "y": [2.0, 2.0, 2.0, 2.0],
+                "z": [2.0, 4.0, 10.0, 8.0],
+                "percentage__y__z": [0.0, 1.0, 4.0, 3.0],
+            },
+        )
    )
-    assert post_df.columns.tolist() == ["label", "y", "z", "percentage__y__z"]
-    assert series_to_list(post_df["percentage__y__z"]) == [0.0, -0.5, -0.8, -0.75]

+
+def test_compare_ratio():
    # `ratio` comparison
-    post_df = compare(
+    post_df = pp.compare(
        df=timeseries_df2,
        source_columns=["y"],
        compare_columns=["z"],
-        compare_type="ratio",
+        compare_type=PPC.RAT,
+    )
+    """
+               label    y     z  ratio__y__z
+    2019-01-01     x  2.0   2.0          1.0
+    2019-01-02     y  2.0   4.0          2.0
+    2019-01-05     z  2.0  10.0          5.0
+    2019-01-07     q  2.0   8.0          4.0
+    """
+    assert post_df.equals(
+        pd.DataFrame(
+            index=timeseries_df2.index,
+            data={
+                "label": ["x", "y", "z", "q"],
+                "y": [2.0, 2.0, 2.0, 2.0],
+                "z": [2.0, 4.0, 10.0, 8.0],
+                "ratio__y__z": [1.0, 2.0, 5.0, 4.0],
+            },
+        )
+    )
+
+
+def test_compare_multi_index_column():
+    index = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"])
+    index.name = "__timestamp"
+    iterables = [["m1", "m2"], ["a", "b"], ["x", "y"]]
+    columns = pd.MultiIndex.from_product(iterables, names=[None, "level1", "level2"])
+    df = pd.DataFrame(index=index, columns=columns, data=1)
+    """
+                m1          m2
+    level1       a     b     a     b
+    level2       x  y  x  y  x  y  x  y
+    __timestamp
+    2021-01-01   1  1  1  1  1  1  1  1
+    2021-01-02   1  1  1  1  1  1  1  1
+    2021-01-03   1  1  1  1  1  1  1  1
+    """
+    post_df = pp.compare(
+        df,
+        source_columns=["m1"],
+        compare_columns=["m2"],
+        compare_type=PPC.DIFF,
+        drop_original_columns=True,
+    )
+    flat_df = pp.flatten(post_df)
+    """
+      __timestamp  difference__m1__m2, a, x  difference__m1__m2, a, y  difference__m1__m2, b, x  difference__m1__m2, b, y
+    0  2021-01-01                         0                         0                         0                         0
+    1  2021-01-02                         0                         0                         0                         0
+    2  2021-01-03                         0                         0                         0                         0
+    """
+    assert flat_df.equals(
+        pd.DataFrame(
+            data={
+                "__timestamp": pd.to_datetime(
+                    ["2021-01-01", "2021-01-02", "2021-01-03"]
+                ),
+                "difference__m1__m2, a, x": [0, 0, 0],
+                "difference__m1__m2, a, y": [0, 0, 0],
+                "difference__m1__m2, b, x": [0, 0, 0],
+                "difference__m1__m2, b, y": [0, 0, 0],
+            }
+        )
+    )
+
+
+def test_compare_after_pivot():
+    pivot_df = pp.pivot(
+        df=multiple_metrics_df,
+        index=["dttm"],
+        columns=["country"],
+        aggregates={
+            "sum_metric": {"operator": "sum"},
+            "count_metric": {"operator": "sum"},
+        },
+        flatten_columns=False,
+        reset_index=False,
+    )
+    """
+                   count_metric    sum_metric
+    country              UK US         UK US
+    dttm
+    2019-01-01            1  2          5  6
+    2019-01-02            3  4          7  8
+    """
+    compared_df = pp.compare(
+        pivot_df,
+        source_columns=["count_metric"],
+        compare_columns=["sum_metric"],
+        compare_type=PPC.DIFF,
+        drop_original_columns=True,
+    )
+    """
+               difference__count_metric__sum_metric
+    country                                      UK US
+    dttm
+    2019-01-01                                    4  4
+    2019-01-02                                    4  4
+    """
+    flat_df = pp.flatten(compared_df)
+    """
+            dttm  difference__count_metric__sum_metric, UK  difference__count_metric__sum_metric, US
+    0 2019-01-01                                         4                                         4
+    1 2019-01-02                                         4                                         4
+    """
+    assert flat_df.equals(
+        pd.DataFrame(
+            data={
+                "dttm": pd.to_datetime(["2019-01-01", "2019-01-02"]),
+                FLAT_COLUMN_SEPARATOR.join(
+                    ["difference__count_metric__sum_metric", "UK"]
+                ): [4, 4],
+                FLAT_COLUMN_SEPARATOR.join(
+                    ["difference__count_metric__sum_metric", "US"]
+                ): [4, 4],
+            }
+        )
    )
-    assert post_df.columns.tolist() == ["label", "y", "z", "ratio__y__z"]
-    assert series_to_list(post_df["ratio__y__z"]) == [1.0, 0.5, 0.2, 0.25]