Compare commits

...

1 Commits

Author SHA1 Message Date
Evan
72aa495908 fix(dataset): apply HOURS OFFSET to all temporal columns
The dataset-level HOURS OFFSET (and time shift) was only applied to the
selected time column / base-axis / granularity column, plus any column
declaring a python_date_format. A second temporal column returned by the
database as a native datetime kept its raw, un-offset value.

normalize_df now also applies the offset to any already-datetime temporal
column the query returns. Columns arriving as plain integers/strings with
no declared format are left untouched, since they cannot be safely coerced
to datetimes.

Fixes #23167

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-28 14:04:33 -07:00
2 changed files with 170 additions and 2 deletions

View File

@@ -1625,6 +1625,56 @@ class ExploreMixin: # pylint: disable=too-many-public-methods
seen.add(label)
return tuple(labels)
def _offset_only_dttm_cols(
    self,
    df: pd.DataFrame,
    query_object: QueryObject,
    already_collected: set[str],
) -> list[DateColumn]:
    """Collect offset-only ``DateColumn`` entries for native-datetime columns.

    The dataset HOURS OFFSET (and any query time shift) has to reach *every*
    temporal column in the result, not only the ones ``_collect_dttm_labels``
    picks up (columns with a declared ``python_date_format`` or the
    base-axis / granularity column). This helper finds the remaining columns
    that already carry a datetime64 dtype and are flagged ``is_dttm`` in the
    dataset metadata, and describes them with ``timestamp_format=None`` so
    only the offset / time shift is applied.

    Columns whose dtype is not datetime64 (e.g. plain integers or strings
    with no declared format) are skipped, since they cannot be safely
    interpreted as datetimes.

    See https://github.com/apache/superset/issues/23167.

    :param df: Query result whose columns are inspected.
    :param query_object: Query whose ``time_shift`` is propagated.
    :param already_collected: Labels that already have a ``DateColumn``.
        Mutated in place: every label selected here is added to it, so a
        label repeated in ``df.columns`` is only emitted once.
    :returns: The additional ``DateColumn`` entries, in ``df.columns`` order;
        empty when there is no offset / time shift to apply or when the
        datasource exposes no ``get_column``.
    """
    has_adjustment = self.offset or query_object.time_shift
    if not has_adjustment or not hasattr(self, "get_column"):
        return []

    offset_only: list[DateColumn] = []
    for col_label in df.columns:
        # Skip labels that are handled elsewhere, and anything the database
        # did not hand back as a real datetime.
        if (
            col_label in already_collected
            or col_label == DTTM_ALIAS
            or not pd.api.types.is_datetime64_any_dtype(df[col_label])
        ):
            continue

        column_meta = self.get_column(col_label)
        if not column_meta:
            continue

        # Column metadata may be a plain dict or an object with attributes.
        if isinstance(column_meta, dict):
            flagged_temporal = column_meta.get("is_dttm")
        else:
            flagged_temporal = getattr(column_meta, "is_dttm", False)
        if not flagged_temporal:
            continue

        offset_only.append(
            DateColumn(
                timestamp_format=None,
                offset=self.offset,
                time_shift=query_object.time_shift,
                col_label=col_label,
            )
        )
        already_collected.add(col_label)

    return offset_only
def normalize_df(self, df: pd.DataFrame, query_object: QueryObject) -> pd.DataFrame:
"""
Normalize the dataframe by converting datetime columns and ensuring
@@ -1655,6 +1705,12 @@ class ExploreMixin: # pylint: disable=too-many-public-methods
)
)
dttm_cols.extend(
self._offset_only_dttm_cols(
df, query_object, {col.col_label for col in dttm_cols}
)
)
# Build format map from detected datetime formats stored in dataset columns
format_map: dict[str, str] = {}
if hasattr(self, "columns"):

View File

@@ -2626,7 +2626,12 @@ def _normalize_df_datasource(column: object) -> MagicMock:
datasource.enforce_numerical_metrics = False
datasource.columns = [column]
datasource.get_column = lambda name: {"ts": column}.get(name)
for method in ("_python_date_format", "_collect_dttm_labels", "normalize_df"):
for method in (
"_python_date_format",
"_collect_dttm_labels",
"_offset_only_dttm_cols",
"normalize_df",
):
setattr(datasource, method, getattr(ExploreMixin, method).__get__(datasource))
return datasource
@@ -2750,7 +2755,12 @@ def test_normalize_df_without_get_column_is_a_noop() -> None:
columns: list[object] = []
datasource = _NoGetColumnDatasource()
for method in ("_python_date_format", "_collect_dttm_labels", "normalize_df"):
for method in (
"_python_date_format",
"_collect_dttm_labels",
"_offset_only_dttm_cols",
"normalize_df",
):
setattr(datasource, method, getattr(ExploreMixin, method).__get__(datasource))
df = pd.DataFrame({"ts": [1577836800, 1609459200]})
@@ -2849,6 +2859,108 @@ def test_normalize_df_normalizes_legacy_time_column() -> None:
assert result[DTTM_ALIAS][0].strftime("%Y-%m-%d") == "2020-01-01"
def test_normalize_df_applies_offset_to_all_temporal_columns() -> None:
    """Regression test for issue #23167.

    The dataset HOURS OFFSET must shift every temporal column a query
    returns, not only the selected time column. Here two native-datetime
    temporal columns, neither declaring a ``python_date_format``, are both
    expected to move forward by the 4-hour dataset offset.
    """
    import pandas as pd

    from superset.models.helpers import ExploreMixin

    # Two temporal columns with no declared parsing format.
    column_by_name = {
        column_name: MagicMock(
            column_name=column_name,
            is_dttm=True,
            python_date_format=None,
            datetime_format=None,
        )
        for column_name in ("created", "expired")
    }

    datasource = MagicMock(
        offset=4,
        enforce_numerical_metrics=False,
        columns=list(column_by_name.values()),
    )
    datasource.get_column = lambda name: column_by_name.get(name)
    # Bind the real ExploreMixin implementations onto the mock datasource.
    bound_methods = (
        "_python_date_format",
        "_collect_dttm_labels",
        "_offset_only_dttm_cols",
        "normalize_df",
    )
    for method_name in bound_methods:
        implementation = getattr(ExploreMixin, method_name)
        setattr(datasource, method_name, implementation.__get__(datasource))

    query_object = MagicMock(
        columns=["created", "expired"],
        granularity=None,
        time_shift=None,
    )

    raw_frame = pd.DataFrame(
        {
            "created": pd.to_datetime(["2020-01-01 00:00:00", "2020-01-02 00:00:00"]),
            "expired": pd.to_datetime(["2020-06-01 12:00:00", "2020-06-02 12:00:00"]),
        }
    )

    result = datasource.normalize_df(raw_frame, query_object)

    # Every column is expected to be exactly four hours later than its input.
    expected_shifted = {
        "created": ["2020-01-01 04:00:00", "2020-01-02 04:00:00"],
        "expired": ["2020-06-01 16:00:00", "2020-06-02 16:00:00"],
    }
    for column_name, timestamps in expected_shifted.items():
        assert result[column_name].tolist() == pd.to_datetime(timestamps).tolist()
def test_normalize_df_offset_skips_unconfigured_integer_temporal_columns() -> None:
    """Integer-valued temporal columns without a format stay untouched.

    The offset extension for native-datetime temporal columns must ignore a
    temporal column whose values arrive as plain integers with no declared
    format: such a column cannot be safely interpreted as a datetime, so it
    is left as-is rather than reinterpreted as nanoseconds (see issue #23167).
    """
    import pandas as pd
    from pandas.api.types import is_datetime64_any_dtype

    from superset.models.helpers import ExploreMixin

    raw_epochs = [1577836800, 1609459200, 1640995200]

    # Flagged temporal, but with neither python_date_format nor datetime_format.
    int_col = MagicMock(
        column_name="ts",
        is_dttm=True,
        python_date_format=None,
        datetime_format=None,
    )

    datasource = MagicMock(
        offset=4,
        enforce_numerical_metrics=False,
        columns=[int_col],
    )
    datasource.get_column = lambda name: {"ts": int_col}.get(name)
    # Bind the real ExploreMixin implementations onto the mock datasource.
    bound_methods = (
        "_python_date_format",
        "_collect_dttm_labels",
        "_offset_only_dttm_cols",
        "normalize_df",
    )
    for method_name in bound_methods:
        implementation = getattr(ExploreMixin, method_name)
        setattr(datasource, method_name, implementation.__get__(datasource))

    query_object = MagicMock(columns=["ts"], granularity=None, time_shift=None)

    result = datasource.normalize_df(pd.DataFrame({"ts": raw_epochs}), query_object)

    normalized = result["ts"]
    assert not is_datetime64_any_dtype(normalized)
    assert normalized.tolist() == [1577836800, 1609459200, 1640995200]
def test_adhoc_column_to_sqla_returns_type_from_column_metadata(
database: Database,
) -> None: