fix(dataset): apply HOURS OFFSET to all temporal columns

The dataset-level HOURS OFFSET (and time shift) was only applied to the selected time column / base-axis / granularity column, plus any column declaring a python_date_format. A second temporal column returned by the database as a native datetime kept its raw, un-offset value.

normalize_df now also applies the offset to any already-datetime temporal column the query returns. Columns arriving as plain integers/strings with no declared format are left untouched, since they cannot be safely coerced to datetimes.

Fixes #23167

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 12:55:35 +00:00 · 2026-06-28 14:04:33 -07:00
2 changed files with 170 additions and 2 deletions
--- a/superset/models/helpers.py
+++ b/superset/models/helpers.py
@@ -1625,6 +1625,56 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                seen.add(label)
        return tuple(labels)

+    def _offset_only_dttm_cols(
+        self,
+        df: pd.DataFrame,
+        query_object: QueryObject,
+        already_collected: set[str],
+    ) -> list[DateColumn]:
+        """Build offset-only ``DateColumn`` entries for native-datetime columns.
+
+        ``_collect_dttm_labels`` only gathers columns that need parsing (a
+        declared ``python_date_format``) plus the base-axis / granularity column.
+        Any other temporal column that the database already returns as a native
+        datetime would keep its raw value, so this method schedules the dataset
+        HOURS OFFSET (and any time shift) for those columns as well.
+
+        A column of ``df`` is picked up only when its label is neither in
+        ``already_collected`` nor ``DTTM_ALIAS``, it passes
+        ``is_datetime64_any_dtype``, and ``self.get_column`` resolves it to a
+        column flagged ``is_dttm``. Integer and string columns fail the dtype
+        check. Each picked label is also added to ``already_collected`` in place.
+        See https://github.com/apache/superset/issues/23167.
+        """
+        nothing_to_apply = not (self.offset or query_object.time_shift)
+        if nothing_to_apply or not hasattr(self, "get_column"):
+            return []
+
+        offset_cols: list[DateColumn] = []
+        for col_name in df.columns:
+            if col_name in already_collected or col_name == DTTM_ALIAS:
+                continue
+            if not pd.api.types.is_datetime64_any_dtype(df[col_name]):
+                continue
+            dataset_col = self.get_column(col_name)
+            if not dataset_col:
+                continue
+            if isinstance(dataset_col, dict):
+                temporal = dataset_col.get("is_dttm")
+            else:
+                temporal = getattr(dataset_col, "is_dttm", False)
+            if not temporal:
+                continue
+            entry = DateColumn(
+                timestamp_format=None,
+                offset=self.offset,
+                time_shift=query_object.time_shift,
+                col_label=col_name,
+            )
+            offset_cols.append(entry)
+            already_collected.add(col_name)
+        return offset_cols
+
    def normalize_df(self, df: pd.DataFrame, query_object: QueryObject) -> pd.DataFrame:
        """
        Normalize the dataframe by converting datetime columns and ensuring
@@ -1655,6 +1705,12 @@ class ExploreMixin:  # pylint: disable=too-many-public-methods
                )
            )

+        dttm_cols.extend(
+            self._offset_only_dttm_cols(
+                df, query_object, {col.col_label for col in dttm_cols}
+            )
+        )
+
        # Build format map from detected datetime formats stored in dataset columns
        format_map: dict[str, str] = {}
        if hasattr(self, "columns"):
--- a/tests/unit_tests/models/helpers_test.py
+++ b/tests/unit_tests/models/helpers_test.py
@@ -2626,7 +2626,12 @@ def _normalize_df_datasource(column: object) -> MagicMock:
    datasource.enforce_numerical_metrics = False
    datasource.columns = [column]
    datasource.get_column = lambda name: {"ts": column}.get(name)
-    for method in ("_python_date_format", "_collect_dttm_labels", "normalize_df"):
+    for method in (
+        "_python_date_format",
+        "_collect_dttm_labels",
+        "_offset_only_dttm_cols",
+        "normalize_df",
+    ):
        setattr(datasource, method, getattr(ExploreMixin, method).__get__(datasource))
    return datasource

@@ -2750,7 +2755,12 @@ def test_normalize_df_without_get_column_is_a_noop() -> None:
        columns: list[object] = []

    datasource = _NoGetColumnDatasource()
-    for method in ("_python_date_format", "_collect_dttm_labels", "normalize_df"):
+    for method in (
+        "_python_date_format",
+        "_collect_dttm_labels",
+        "_offset_only_dttm_cols",
+        "normalize_df",
+    ):
        setattr(datasource, method, getattr(ExploreMixin, method).__get__(datasource))

    df = pd.DataFrame({"ts": [1577836800, 1609459200]})
@@ -2849,6 +2859,108 @@ def test_normalize_df_normalizes_legacy_time_column() -> None:
    assert result[DTTM_ALIAS][0].strftime("%Y-%m-%d") == "2020-01-01"


+def test_normalize_df_applies_offset_to_all_temporal_columns() -> None:
+    """Regression test for issue #23167.
+
+    The dataset HOURS OFFSET must reach every temporal column a query returns,
+    not only the selected time column: two native-datetime temporal columns
+    (neither declaring a ``python_date_format``) must both be shifted by it.
+    """
+    import pandas as pd
+
+    from superset.models.helpers import ExploreMixin
+
+    # Both dataset columns are temporal and declare no parsing format.
+    dataset_columns = {
+        name: MagicMock(
+            column_name=name,
+            is_dttm=True,
+            python_date_format=None,
+            datetime_format=None,
+        )
+        for name in ("created", "expired")
+    }
+
+    datasource = MagicMock()
+    datasource.offset = 4
+    datasource.enforce_numerical_metrics = False
+    datasource.columns = list(dataset_columns.values())
+    datasource.get_column = dataset_columns.get
+    # Bind the real ExploreMixin implementations onto the mock datasource.
+    real_methods = (
+        "_python_date_format",
+        "_collect_dttm_labels",
+        "_offset_only_dttm_cols",
+        "normalize_df",
+    )
+    for method_name in real_methods:
+        unbound = getattr(ExploreMixin, method_name)
+        setattr(datasource, method_name, unbound.__get__(datasource))
+
+    query_object = MagicMock()
+    query_object.columns = ["created", "expired"]
+    query_object.granularity = None
+    query_object.time_shift = None
+
+    raw_values = {
+        "created": ["2020-01-01 00:00:00", "2020-01-02 00:00:00"],
+        "expired": ["2020-06-01 12:00:00", "2020-06-02 12:00:00"],
+    }
+    # Expected values are the raw ones moved forward by the 4-hour offset.
+    shifted_values = {
+        "created": ["2020-01-01 04:00:00", "2020-01-02 04:00:00"],
+        "expired": ["2020-06-01 16:00:00", "2020-06-02 16:00:00"],
+    }
+    df = pd.DataFrame({col: pd.to_datetime(v) for col, v in raw_values.items()})
+
+    result = datasource.normalize_df(df, query_object)
+
+    for col, expected in shifted_values.items():
+        assert result[col].tolist() == pd.to_datetime(expected).tolist()
+
+
+def test_normalize_df_offset_skips_unconfigured_integer_temporal_columns() -> None:
+    """Integer temporal columns without a declared format stay untouched.
+
+    The offset extension for native-datetime temporal columns must not pick up
+    a temporal column whose values arrive as plain integers: it cannot be safely
+    read as a datetime, so it is left as-is rather than reinterpreted as
+    nanoseconds (see issue #23167).
+    """
+    import pandas as pd
+    from pandas.api.types import is_datetime64_any_dtype
+
+    from superset.models.helpers import ExploreMixin
+
+    raw_ints = [1577836800, 1609459200, 1640995200]
+    int_col = MagicMock(
+        column_name="ts",
+        is_dttm=True,
+        python_date_format=None,
+        datetime_format=None,
+    )
+
+    datasource = MagicMock()
+    datasource.offset = 4
+    datasource.enforce_numerical_metrics = False
+    datasource.columns = [int_col]
+    datasource.get_column = {"ts": int_col}.get
+    for method_name in (
+        "_python_date_format",
+        "_collect_dttm_labels",
+        "_offset_only_dttm_cols",
+        "normalize_df",
+    ):
+        unbound = getattr(ExploreMixin, method_name)
+        setattr(datasource, method_name, unbound.__get__(datasource))
+
+    query_object = MagicMock(columns=["ts"], granularity=None, time_shift=None)
+    df = pd.DataFrame({"ts": raw_ints})
+    result = datasource.normalize_df(df, query_object)
+    assert not is_datetime64_any_dtype(result["ts"])
+    assert result["ts"].tolist() == raw_ints
+
+
 def test_adhoc_column_to_sqla_returns_type_from_column_metadata(
    database: Database,
 ) -> None: