diff --git a/superset/utils/core.py b/superset/utils/core.py index 69de1707ede..2b80c89f612 100644 --- a/superset/utils/core.py +++ b/superset/utils/core.py @@ -1682,18 +1682,26 @@ def normalize_dttm_col( utc=False, unit=unit, origin="unix", - errors="raise", + errors="coerce", exact=False, ) else: # Column has already been formatted as a timestamp. - df[_col.col_label] = dttm_series.apply(pd.Timestamp) + try: + df[_col.col_label] = dttm_series.apply( + lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT + ) + except ValueError: + logger.warning( + "Unable to convert column %s to datetime, ignoring", + _col.col_label, + ) else: df[_col.col_label] = pd.to_datetime( df[_col.col_label], utc=False, format=_col.timestamp_format, - errors="raise", + errors="coerce", exact=False, ) if _col.offset: diff --git a/tests/integration_tests/utils_tests.py b/tests/integration_tests/utils_tests.py index aa399231522..2bab7cdee41 100644 --- a/tests/integration_tests/utils_tests.py +++ b/tests/integration_tests/utils_tests.py @@ -483,8 +483,3 @@ class TestUtils(SupersetTestCase): # test numeric epoch_ms format df = pd.DataFrame([{"__timestamp": ts.timestamp() * 1000, "a": 1}]) assert normalize_col(df, "epoch_ms", 0, None)[DTTM_ALIAS][0] == ts - - # test that we raise an error when we can't convert - df = pd.DataFrame([{"__timestamp": "1677-09-21 00:00:00", "a": 1}]) - with pytest.raises(pd.errors.OutOfBoundsDatetime): - normalize_col(df, None, 0, None) diff --git a/tests/unit_tests/utils/test_core.py b/tests/unit_tests/utils/test_core.py index aa51e52f6cb..e629f002906 100644 --- a/tests/unit_tests/utils/test_core.py +++ b/tests/unit_tests/utils/test_core.py @@ -19,9 +19,11 @@ from dataclasses import dataclass from typing import Any, Optional from unittest.mock import MagicMock, patch +import numpy as np import pandas as pd import pytest from flask import current_app +from pandas.api.types import is_datetime64_dtype from pytest_mock import MockerFixture from superset.exceptions import SupersetException @@ -225,6 +227,197 @@ def test_normalize_dttm_col() -> None: assert df["__time"].astype(str).tolist() == ["2017-07-01"] +def test_normalize_dttm_col_epoch_seconds() -> None: + """Test conversion of epoch seconds.""" + df = pd.DataFrame( + { + "epoch_col": [ + 1577836800, + 1609459200, + 1640995200, + ] # 2020-01-01, 2021-01-01, 2022-01-01 + } + ) + dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["epoch_col"]) + assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert df["epoch_col"][1].strftime("%Y-%m-%d") == "2021-01-01" + assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + +def test_normalize_dttm_col_epoch_milliseconds() -> None: + """Test conversion of epoch milliseconds.""" + df = pd.DataFrame( + { + "epoch_ms_col": [ + 1577836800000, + 1609459200000, + 1640995200000, + ] # 2020-01-01, 2021-01-01, 2022-01-01 + } + ) + dttm_cols = (DateColumn(col_label="epoch_ms_col", timestamp_format="epoch_ms"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["epoch_ms_col"]) + assert df["epoch_ms_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert df["epoch_ms_col"][1].strftime("%Y-%m-%d") == "2021-01-01" + assert df["epoch_ms_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + +def test_normalize_dttm_col_formatted_date() -> None: + """Test conversion of formatted date strings.""" + df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]}) + dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col"]) + assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-01" + assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + +def test_normalize_dttm_col_with_offset() -> None: + """Test with hour offset.""" + df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]}) + dttm_cols = ( + DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d", offset=3), + ) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col"]) + assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 03:00:00" + assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 03:00:00" + assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 03:00:00" + + +def test_normalize_dttm_col_with_time_shift() -> None: + """Test with time shift.""" + df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]}) + dttm_cols = ( + DateColumn( + col_label="date_col", timestamp_format="%Y-%m-%d", time_shift="1 day" + ), + ) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col"]) + assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-02" + assert df["date_col"][1].strftime("%Y-%m-%d") == "2021-01-02" + assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-02" + + +def test_normalize_dttm_col_with_offset_and_time_shift() -> None: + """Test with both offset and time shift.""" + df = pd.DataFrame({"date_col": ["2020-01-01", "2021-01-01", "2022-01-01"]}) + dttm_cols = ( + DateColumn( + col_label="date_col", + timestamp_format="%Y-%m-%d", + offset=3, + time_shift="1 hour", + ), + ) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col"]) + assert df["date_col"][0].strftime("%Y-%m-%d %H:%M:%S") == "2020-01-01 04:00:00" + assert df["date_col"][1].strftime("%Y-%m-%d %H:%M:%S") == "2021-01-01 04:00:00" + assert df["date_col"][2].strftime("%Y-%m-%d %H:%M:%S") == "2022-01-01 04:00:00" + + +def test_normalize_dttm_col_invalid_date_coerced() -> None: + """Test that invalid dates are coerced to NaT.""" + df = pd.DataFrame({"date_col": ["2020-01-01", "invalid_date", "2022-01-01"]}) + dttm_cols = (DateColumn(col_label="date_col", timestamp_format="%Y-%m-%d"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col"]) + assert df["date_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert pd.isna(df["date_col"][1]) + assert df["date_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + +def test_normalize_dttm_col_invalid_epoch_coerced() -> None: + """Test that invalid epoch values are coerced to NaT.""" + df = pd.DataFrame( + {"epoch_col": [1577836800, np.nan, 1640995200]} # 2020-01-01, NaN, 2022-01-01 + ) + dttm_cols = (DateColumn(col_label="epoch_col", timestamp_format="epoch_s"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["epoch_col"]) + assert df["epoch_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert pd.isna(df["epoch_col"][1]) + assert df["epoch_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + +def test_normalize_dttm_col_non_existing_column() -> None: + """Test handling of non-existing columns.""" + df = pd.DataFrame({"existing_col": [1, 2, 3]}) + dttm_cols = (DateColumn(col_label="non_existing_col", timestamp_format="%Y-%m-%d"),) + + # Should not raise any exception + normalize_dttm_col(df, dttm_cols) + + # DataFrame should remain unchanged + assert list(df.columns) == ["existing_col"] + assert df["existing_col"].tolist() == [1, 2, 3] + + +def test_normalize_dttm_col_multiple_columns() -> None: + """Test normalizing multiple datetime columns.""" + df = pd.DataFrame( + { + "date_col1": ["2020-01-01", "2021-01-01", "2022-01-01"], + "date_col2": ["01/01/2020", "01/01/2021", "01/01/2022"], + } + ) + dttm_cols = ( + DateColumn(col_label="date_col1", timestamp_format="%Y-%m-%d"), + DateColumn(col_label="date_col2", timestamp_format="%m/%d/%Y"), + ) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["date_col1"]) + assert is_datetime64_dtype(df["date_col2"]) + assert df["date_col1"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert df["date_col2"][0].strftime("%Y-%m-%d") == "2020-01-01" + + +def test_normalize_dttm_col_already_datetime_series() -> None: + """Test handling of already datetime series with epoch format.""" + # Create a DataFrame with timestamp strings + df = pd.DataFrame( + { + "ts_col": [ + "2020-01-01 00:00:00", + "2021-01-01 00:00:00", + "2022-01-01 00:00:00", + ] + } + ) + dttm_cols = (DateColumn(col_label="ts_col", timestamp_format="epoch_s"),) + + normalize_dttm_col(df, dttm_cols) + + assert is_datetime64_dtype(df["ts_col"]) + assert df["ts_col"][0].strftime("%Y-%m-%d") == "2020-01-01" + assert df["ts_col"][1].strftime("%Y-%m-%d") == "2021-01-01" + assert df["ts_col"][2].strftime("%Y-%m-%d") == "2022-01-01" + + def test_check_if_safe_zip_success(app_context: None) -> None: """ Test if ZIP files are safe