fix: coerce datetime conversion errors (#32683)

This commit is contained in:
Beto Dealmeida
2025-03-18 13:09:23 -04:00
committed by GitHub
parent a2c164a77d
commit 99e69c32ee
3 changed files with 204 additions and 8 deletions

View File

@@ -1682,18 +1682,26 @@ def normalize_dttm_col(
utc=False,
unit=unit,
origin="unix",
errors="raise",
errors="coerce",
exact=False,
)
else:
# Column has already been formatted as a timestamp.
df[_col.col_label] = dttm_series.apply(pd.Timestamp)
try:
df[_col.col_label] = dttm_series.apply(
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
)
except ValueError:
logger.warning(
"Unable to convert column %s to datetime, ignoring",
_col.col_label,
)
else:
df[_col.col_label] = pd.to_datetime(
df[_col.col_label],
utc=False,
format=_col.timestamp_format,
errors="raise",
errors="coerce",
exact=False,
)
if _col.offset:

View File

@@ -483,8 +483,3 @@ class TestUtils(SupersetTestCase):
# test numeric epoch_ms format
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
# test that we raise an error when we can't convert
df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
with pytest.raises(pd.errors.OutOfBoundsDatetime):
normalize_col(df, None, 0, None)

View File

@@ -19,9 +19,11 @@ from dataclasses import dataclass
from typing import Any, Optional
from unittest.mock import MagicMock, patch
import numpy as np
import pandas as pd
import pytest
from flask import current_app
from pandas.api.types import is_datetime64_dtype
from pytest_mock import MockerFixture
from superset.exceptions import SupersetException
@@ -225,6 +227,197 @@ def test_normalize_dttm_col() -> None:
assert df["__time"].astype(str).tolist() == ["2017-07-01"]
def test_normalize_dttm_col_epoch_seconds() -> None:
"""Test conversion of epoch seconds."""
df = pd.DataFrame(
{
"epoch_col": [
1577836800,
1609459200,
1640995200,
] # 2020-01-01, 2021-01-01, 2022-01-01
}
)
dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["epoch_col"])
assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert df["epoch_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_normalize_dttm_col_epoch_milliseconds() -> None:
"""Test conversion of epoch milliseconds."""
df = pd.DataFrame(
{
"epoch_ms_col": [
1577836800000,
1609459200000,
1640995200000,
] # 2020-01-01, 2021-01-01, 2022-01-01
}
)
dttm_cols = (DateColumn(col_label="epoch_ms_col", timestamp_format="epoch_ms"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["epoch_ms_col"])
assert df["epoch_ms_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert df["epoch_ms_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
assert df["epoch_ms_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_normalize_dttm_col_formatted_date() -> None:
"""Test conversion of formatted date strings."""
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col"])
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_normalize_dttm_col_with_offset() -> None:
"""Test with hour offset."""
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
dttm_cols = (
DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d", offset=3),
)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col"])
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00"
assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 03:00:00"
assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 03:00:00"
def test_normalize_dttm_col_with_time_shift() -> None:
"""Test with time shift."""
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
dttm_cols = (
DateColumn(
col_label="date_col", timestamp_format="%Y-%m-%d", time_shift="1 day"
),
)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col"])
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-02"
assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-02"
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-02"
def test_normalize_dttm_col_with_offset_and_time_shift() -> None:
"""Test with both offset and time shift."""
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
dttm_cols = (
DateColumn(
col_label="date_col",
timestamp_format="%Y-%m-%d",
offset=3,
time_shift="1 hour",
),
)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col"])
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 04:00:00"
assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 04:00:00"
assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00"
def test_normalize_dttm_col_invalid_date_coerced() -> None:
"""Test that invalid dates are coerced to NaT."""
df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]})
dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col"])
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert pd.isna(df["date_col"][1])
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_normalize_dttm_col_invalid_epoch_coerced() -> None:
"""Test that invalid epoch values are coerced to NaT."""
df = pd.DataFrame(
{"epoch_col": [1577836800, np.nan, 1640995200]} # 2020-01-01, NaN, 2022-01-01
)
dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["epoch_col"])
assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert pd.isna(df["epoch_col"][1])
assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_normalize_dttm_col_non_existing_column() -> None:
"""Test handling of non-existing columns."""
df = pd.DataFrame({"existing_col": [1, 2, 3]})
dttm_cols = (DateColumn(col_label="non_existing_col", timestamp_format="%Y-%m-%d"),)
# Should not raise any exception
normalize_dttm_col(df, dttm_cols)
# DataFrame should remain unchanged
assert list(df.columns) == ["existing_col"]
assert df["existing_col"].tolist() == [1, 2, 3]
def test_normalize_dttm_col_multiple_columns() -> None:
"""Test normalizing multiple datetime columns."""
df = pd.DataFrame(
{
"date_col1": ["2020-01-01", "2021-01-01", "2022-01-01"],
"date_col2": ["01/01/2020", "01/01/2021", "01/01/2022"],
}
)
dttm_cols = (
DateColumn(col_label="date_col1", timestamp_format="%Y-%m-%d"),
DateColumn(col_label="date_col2", timestamp_format="%m/%d/%Y"),
)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["date_col1"])
assert is_datetime64_dtype(df["date_col2"])
assert df["date_col1"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert df["date_col2"][0].strftime("%Y-%m-%d") == "2020-01-01"
def test_normalize_dttm_col_already_datetime_series() -> None:
"""Test handling of already datetime series with epoch format."""
# Create a DataFrame with timestamp strings
df = pd.DataFrame(
{
"ts_col": [
"2020-01-01 00:00:00",
"2021-01-01 00:00:00",
"2022-01-01 00:00:00",
]
}
)
dttm_cols = (DateColumn(col_label="ts_col", timestamp_format="epoch_s"),)
normalize_dttm_col(df, dttm_cols)
assert is_datetime64_dtype(df["ts_col"])
assert df["ts_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
assert df["ts_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
assert df["ts_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
def test_check_if_safe_zip_success(app_context: None) -> None:
"""
Test if ZIP files are safe