mirror of
https://github.com/apache/superset.git
synced 2026-04-16 14:45:21 +00:00
fix: coerce datetime conversion errors (#32683)
This commit is contained in:
@@ -1682,18 +1682,26 @@ def normalize_dttm_col(
|
||||
utc=False,
|
||||
unit=unit,
|
||||
origin="unix",
|
||||
errors="raise",
|
||||
errors="coerce",
|
||||
exact=False,
|
||||
)
|
||||
else:
|
||||
# Column has already been formatted as a timestamp.
|
||||
df[_col.col_label] = dttm_series.apply(pd.Timestamp)
|
||||
try:
|
||||
df[_col.col_label] = dttm_series.apply(
|
||||
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
|
||||
)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"Unable to convert column %s to datetime, ignoring",
|
||||
_col.col_label,
|
||||
)
|
||||
else:
|
||||
df[_col.col_label] = pd.to_datetime(
|
||||
df[_col.col_label],
|
||||
utc=False,
|
||||
format=_col.timestamp_format,
|
||||
errors="raise",
|
||||
errors="coerce",
|
||||
exact=False,
|
||||
)
|
||||
if _col.offset:
|
||||
|
||||
@@ -483,8 +483,3 @@ class TestUtils(SupersetTestCase):
|
||||
# test numeric epoch_ms format
|
||||
df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}])
|
||||
assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts
|
||||
|
||||
# test that we raise an error when we can't convert
|
||||
df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}])
|
||||
with pytest.raises(pd.errors.OutOfBoundsDatetime):
|
||||
normalize_col(df, None, 0, None)
|
||||
|
||||
@@ -19,9 +19,11 @@ from dataclasses import dataclass
|
||||
from typing import Any, Optional
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from flask import current_app
|
||||
from pandas.api.types import is_datetime64_dtype
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from superset.exceptions import SupersetException
|
||||
@@ -225,6 +227,197 @@ def test_normalize_dttm_col() -> None:
|
||||
assert df["__time"].astype(str).tolist() == ["2017-07-01"]
|
||||
|
||||
|
||||
def test_normalize_dttm_col_epoch_seconds() -> None:
|
||||
"""Test conversion of epoch seconds."""
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"epoch_col": [
|
||||
1577836800,
|
||||
1609459200,
|
||||
1640995200,
|
||||
] # 2020-01-01, 2021-01-01, 2022-01-01
|
||||
}
|
||||
)
|
||||
dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["epoch_col"])
|
||||
assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert df["epoch_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
|
||||
assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_epoch_milliseconds() -> None:
|
||||
"""Test conversion of epoch milliseconds."""
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"epoch_ms_col": [
|
||||
1577836800000,
|
||||
1609459200000,
|
||||
1640995200000,
|
||||
] # 2020-01-01, 2021-01-01, 2022-01-01
|
||||
}
|
||||
)
|
||||
dttm_cols = (DateColumn(col_label="epoch_ms_col", timestamp_format="epoch_ms"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["epoch_ms_col"])
|
||||
assert df["epoch_ms_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert df["epoch_ms_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
|
||||
assert df["epoch_ms_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_formatted_date() -> None:
|
||||
"""Test conversion of formatted date strings."""
|
||||
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
|
||||
dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col"])
|
||||
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
|
||||
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_with_offset() -> None:
|
||||
"""Test with hour offset."""
|
||||
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
|
||||
dttm_cols = (
|
||||
DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d", offset=3),
|
||||
)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col"])
|
||||
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00"
|
||||
assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 03:00:00"
|
||||
assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 03:00:00"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_with_time_shift() -> None:
|
||||
"""Test with time shift."""
|
||||
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
|
||||
dttm_cols = (
|
||||
DateColumn(
|
||||
col_label="date_col", timestamp_format="%Y-%m-%d", time_shift="1 day"
|
||||
),
|
||||
)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col"])
|
||||
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-02"
|
||||
assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-02"
|
||||
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-02"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_with_offset_and_time_shift() -> None:
|
||||
"""Test with both offset and time shift."""
|
||||
df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]})
|
||||
dttm_cols = (
|
||||
DateColumn(
|
||||
col_label="date_col",
|
||||
timestamp_format="%Y-%m-%d",
|
||||
offset=3,
|
||||
time_shift="1 hour",
|
||||
),
|
||||
)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col"])
|
||||
assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 04:00:00"
|
||||
assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 04:00:00"
|
||||
assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_invalid_date_coerced() -> None:
|
||||
"""Test that invalid dates are coerced to NaT."""
|
||||
df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]})
|
||||
dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col"])
|
||||
assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert pd.isna(df["date_col"][1])
|
||||
assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_invalid_epoch_coerced() -> None:
|
||||
"""Test that invalid epoch values are coerced to NaT."""
|
||||
df = pd.DataFrame(
|
||||
{"epoch_col": [1577836800, np.nan, 1640995200]} # 2020-01-01, NaN, 2022-01-01
|
||||
)
|
||||
dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["epoch_col"])
|
||||
assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert pd.isna(df["epoch_col"][1])
|
||||
assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_non_existing_column() -> None:
|
||||
"""Test handling of non-existing columns."""
|
||||
df = pd.DataFrame({"existing_col": [1, 2, 3]})
|
||||
dttm_cols = (DateColumn(col_label="non_existing_col", timestamp_format="%Y-%m-%d"),)
|
||||
|
||||
# Should not raise any exception
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
# DataFrame should remain unchanged
|
||||
assert list(df.columns) == ["existing_col"]
|
||||
assert df["existing_col"].tolist() == [1, 2, 3]
|
||||
|
||||
|
||||
def test_normalize_dttm_col_multiple_columns() -> None:
|
||||
"""Test normalizing multiple datetime columns."""
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"date_col1": ["2020-01-01", "2021-01-01", "2022-01-01"],
|
||||
"date_col2": ["01/01/2020", "01/01/2021", "01/01/2022"],
|
||||
}
|
||||
)
|
||||
dttm_cols = (
|
||||
DateColumn(col_label="date_col1", timestamp_format="%Y-%m-%d"),
|
||||
DateColumn(col_label="date_col2", timestamp_format="%m/%d/%Y"),
|
||||
)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["date_col1"])
|
||||
assert is_datetime64_dtype(df["date_col2"])
|
||||
assert df["date_col1"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert df["date_col2"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
|
||||
|
||||
def test_normalize_dttm_col_already_datetime_series() -> None:
|
||||
"""Test handling of already datetime series with epoch format."""
|
||||
# Create a DataFrame with timestamp strings
|
||||
df = pd.DataFrame(
|
||||
{
|
||||
"ts_col": [
|
||||
"2020-01-01 00:00:00",
|
||||
"2021-01-01 00:00:00",
|
||||
"2022-01-01 00:00:00",
|
||||
]
|
||||
}
|
||||
)
|
||||
dttm_cols = (DateColumn(col_label="ts_col", timestamp_format="epoch_s"),)
|
||||
|
||||
normalize_dttm_col(df, dttm_cols)
|
||||
|
||||
assert is_datetime64_dtype(df["ts_col"])
|
||||
assert df["ts_col"][0].strftime("%Y-%m-%d") == "2020-01-01"
|
||||
assert df["ts_col"][1].strftime("%Y-%m-%d") == "2021-01-01"
|
||||
assert df["ts_col"][2].strftime("%Y-%m-%d") == "2022-01-01"
|
||||
|
||||
|
||||
def test_check_if_safe_zip_success(app_context: None) -> None:
|
||||
"""
|
||||
Test if ZIP files are safe
|
||||
|
||||
Reference in New Issue
Block a user