fix(utils): Suppress pandas date parsing warnings in normalize_dttm_col (#35042)

2026-04-18 23:55:00 +00:00 · 2025-09-08 21:17:33 -04:00
parent c5f220a9ff
commit 15e4e8df94
3 changed files with 385 additions and 32 deletions
--- a/superset/utils/core.py
+++ b/superset/utils/core.py
@@ -34,6 +34,7 @@ import tempfile
 import threading
 import traceback
 import uuid
+import warnings
 import zlib
 from collections.abc import Iterable, Iterator, Sequence
 from contextlib import closing, contextmanager
@@ -110,6 +111,7 @@ from superset.utils.backports import StrEnum
 from superset.utils.database import get_example_database
 from superset.utils.date_parser import parse_human_timedelta
 from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str
+from superset.utils.pandas import detect_datetime_format

 if TYPE_CHECKING:
    from superset.connectors.sqla.models import BaseDatasource, TableColumn
@@ -1858,6 +1860,62 @@ class DateColumn:
        )


+def _process_datetime_column(df: pd.DataFrame, col: DateColumn) -> None:
+    """
+    Convert ``df[col.col_label]`` to datetimes in place.
+
+    Epoch columns (``epoch_s``/``epoch_ms``) are converted from Unix time when
+    numeric; otherwise every non-null value is passed to ``pd.Timestamp`` and
+    the column is left untouched (with a logged warning) if that fails. Other
+    columns are parsed with ``col.timestamp_format`` when set, else with a
+    format detected from the data, else with pandas' per-element inference.
+
+    :param df: DataFrame holding the column; mutated in place
+    :param col: Column descriptor providing the label and timestamp format
+    """
+    label = col.col_label
+    series = df[label]
+
+    if col.timestamp_format in ("epoch_s", "epoch_ms"):
+        if is_numeric_dtype(series):
+            # Column is formatted as a numeric value
+            unit = col.timestamp_format.replace("epoch_", "")
+            df[label] = pd.to_datetime(
+                series,
+                utc=False,
+                unit=unit,
+                origin="unix",
+                errors="coerce",
+                exact=False,
+            )
+            return
+
+        # Column has already been formatted as a timestamp.
+        try:
+            df[label] = series.apply(
+                lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
+            )
+        except (ValueError, TypeError):
+            # pd.Timestamp raises TypeError (not ValueError) for unsupported
+            # objects; both mean "not convertible", so keep the column as is.
+            logger.warning("Unable to convert column %s to datetime, ignoring", label)
+        return
+
+    # Use the explicit format, falling back to detection from the data
+    format_to_use = col.timestamp_format or detect_datetime_format(series)
+    if format_to_use:
+        df[label] = pd.to_datetime(
+            series, utc=False, format=format_to_use, errors="coerce", exact=False
+        )
+        return
+
+    # No usable format: pandas parses element by element and warns about it;
+    # that warning is expected here, so silence only that message.
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message=".*Could not infer format.*")
+        df[label] = pd.to_datetime(series, utc=False, errors="coerce", exact=False)
+
+
 def normalize_dttm_col(
    df: pd.DataFrame,
    dttm_cols: tuple[DateColumn, ...] = tuple(),  # noqa: C408
@@ -1866,38 +1924,8 @@ def normalize_dttm_col(
        if _col.col_label not in df.columns:
            continue

-        if _col.timestamp_format in ("epoch_s", "epoch_ms"):
-            dttm_series = df[_col.col_label]
-            if is_numeric_dtype(dttm_series):
-                # Column is formatted as a numeric value
-                unit = _col.timestamp_format.replace("epoch_", "")
-                df[_col.col_label] = pd.to_datetime(
-                    dttm_series,
-                    utc=False,
-                    unit=unit,
-                    origin="unix",
-                    errors="coerce",
-                    exact=False,
-                )
-            else:
-                # Column has already been formatted as a timestamp.
-                try:
-                    df[_col.col_label] = dttm_series.apply(
-                        lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
-                    )
-                except ValueError:
-                    logger.warning(
-                        "Unable to convert column %s to datetime, ignoring",
-                        _col.col_label,
-                    )
-        else:
-            df[_col.col_label] = pd.to_datetime(
-                df[_col.col_label],
-                utc=False,
-                format=_col.timestamp_format,
-                errors="coerce",
-                exact=False,
-            )
+        _process_datetime_column(df, _col)
+
        if _col.offset:
            df[_col.col_label] += timedelta(hours=_col.offset)
        if _col.time_shift is not None:
--- a/superset/utils/pandas.py
+++ b/superset/utils/pandas.py
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Pandas utilities for data processing."""
+
+import pandas as pd
+
+
+def detect_datetime_format(series: pd.Series, sample_size: int = 100) -> str | None:
+    """
+    Detect the datetime format from a sample of the series.
+
+    Only the first ``sample_size`` non-null values are inspected.
+
+    :param series: The pandas Series to analyze
+    :param sample_size: Number of rows to sample for format detection
+    :return: First candidate format matching every sampled value, else None
+    """
+    # Most common formats first for performance
+    common_formats = (
+        "%Y-%m-%d %H:%M:%S",
+        "%Y-%m-%d",
+        "%Y-%m-%dT%H:%M:%S",
+        "%Y-%m-%dT%H:%M:%SZ",
+        "%Y-%m-%dT%H:%M:%S.%f",
+        "%Y-%m-%dT%H:%M:%S.%fZ",
+        "%m/%d/%Y",
+        "%d/%m/%Y",
+        "%Y/%m/%d",
+        "%m/%d/%Y %H:%M:%S",
+        "%d/%m/%Y %H:%M:%S",
+        "%m-%d-%Y",
+        "%d-%m-%Y",
+        "%Y%m%d",
+    )
+
+    # Sample the leading non-null values and compare them as strings
+    sample = series.dropna().head(sample_size)
+    if sample.empty:
+        return None
+    if not pd.api.types.is_string_dtype(sample):
+        sample = sample.astype(str)
+
+    # Cheap pre-check on a few rows; built once instead of once per format
+    probe = sample.head(10)
+    probe_is_whole_sample = len(probe) == len(sample)
+
+    for fmt in common_formats:
+        try:
+            pd.to_datetime(probe, format=fmt, errors="raise")
+            if not probe_is_whole_sample:
+                pd.to_datetime(sample, format=fmt, errors="raise")
+        except (ValueError, TypeError):
+            continue
+        return fmt
+    return None
--- a/tests/unit_tests/utils/test_date_parsing.py
+++ b/tests/unit_tests/utils/test_date_parsing.py
@@ -0,0 +1,256 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Tests for datetime format detection and warning suppression."""
+
+import warnings
+
+import pandas as pd
+import pytest
+
+from superset.utils.core import DateColumn, normalize_dttm_col
+from superset.utils.pandas import detect_datetime_format
+
+
+def capture_warnings(func, *args, **kwargs):
+    """Call ``func`` and return ``(result, format inference warning texts)``."""
+    needle = "Could not infer format"
+    with warnings.catch_warnings(record=True) as caught:
+        # Record every warning, even ones already shown once in this process
+        warnings.simplefilter("always")
+        outcome = func(*args, **kwargs)
+
+    # Keep only the messages that mention a failed format inference
+    messages = [str(entry.message) for entry in caught]
+    return outcome, [text for text in messages if needle in text]
+
+
+def test_detect_datetime_format():
+    """Check detected formats for typical, mixed, empty and all-null input."""
+    cases = [
+        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
+        (["2023-01-01 12:00:00", "2023-01-02 13:00:00"], "%Y-%m-%d %H:%M:%S"),
+        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
+        (["2023-01-01", "01/02/2023"], None),  # Mixed formats
+        ([], None),  # Empty
+        ([None, None], None),  # All nulls
+    ]
+
+    detected = [detect_datetime_format(pd.Series(values)) for values, _ in cases]
+    assert detected == [expected for _, expected in cases]
+
+
+def test_no_warnings_with_consistent_formats():
+    """Consistently formatted columns must parse without inference warnings."""
+    labels = ("date", "datetime")
+    frame = pd.DataFrame(
+        {
+            "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
+            "datetime": [
+                "2023-01-01 12:00:00",
+                "2023-01-02 13:00:00",
+                "2023-01-03 14:00:00",
+            ],
+        }
+    )
+    columns = tuple(DateColumn(col_label=label) for label in labels)
+
+    _, format_warnings = capture_warnings(normalize_dttm_col, frame, columns)
+
+    # Nothing about format inference may have been emitted
+    assert not format_warnings
+
+    # Both columns were converted in place to datetimes
+    for label in labels:
+        assert pd.api.types.is_datetime64_any_dtype(frame[label])
+
+    assert frame["date"].iloc[0] == pd.Timestamp("2023-01-01")
+
+
+def test_explicit_format_respected():
+    """Columns with an explicit timestamp format parse to the expected dates."""
+    us_style = DateColumn(col_label="date", timestamp_format="%m/%d/%Y")
+    frame = pd.DataFrame({"date": ["01/15/2023", "02/20/2023"]})
+
+    normalize_dttm_col(frame, (us_style,))
+    parsed = frame["date"]
+    assert pd.api.types.is_datetime64_any_dtype(parsed)
+    assert parsed.iloc[0] == pd.Timestamp("2023-01-15")
+
+
+def test_mixed_formats_suppressed():
+    """Mixed-format data parses without surfacing an inference warning."""
+    # Three values written in three different layouts
+    values = ["2023-01-01", "01/02/2023", "2023-03-01 12:00:00"]
+    frame = pd.DataFrame({"mixed": values})
+
+    _, format_warnings = capture_warnings(
+        normalize_dttm_col, frame, (DateColumn(col_label="mixed"),)
+    )
+
+    # The warning is silenced and the column is still converted
+    assert not format_warnings
+    assert pd.api.types.is_datetime64_any_dtype(frame["mixed"])
+
+
+def test_epoch_format():
+    """Numeric ``epoch_s`` values are converted to the matching datetimes."""
+    frame = pd.DataFrame({"epoch": [1672531200, 1672617600]})  # 2023-01-01, 2023-01-02
+    epoch_col = DateColumn(col_label="epoch", timestamp_format="epoch_s")
+
+    normalize_dttm_col(frame, (epoch_col,))
+    converted = frame["epoch"]
+    assert pd.api.types.is_datetime64_any_dtype(converted)
+    assert converted.iloc[0] == pd.Timestamp("2023-01-01")
+
+
+def test_epoch_format_invalid_values(caplog):
+    """Unparseable strings in an epoch column log a warning and stay as is."""
+    expected_log = "Unable to convert column epoch to datetime, ignoring"
+    raw_values = ["not_a_number", "invalid", "abc"]
+
+    # Strings that are neither numeric epochs nor parseable timestamps
+    frame = pd.DataFrame({"epoch": raw_values})
+    epoch_col = DateColumn(col_label="epoch", timestamp_format="epoch_s")
+
+    # Start from a clean log so only this call's records are inspected
+    caplog.clear()
+    with caplog.at_level("WARNING"):
+        normalize_dttm_col(frame, (epoch_col,))
+
+    assert expected_log in caplog.text
+
+    # A failed conversion must leave the original column untouched
+    assert frame["epoch"].dtype == object
+    assert frame["epoch"].iloc[0] == raw_values[0]
+
+
+@pytest.mark.parametrize(
+    ("data", "expected_format"),
+    [
+        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
+        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
+        (["2023-01-01T12:00:00Z", "2023-01-02T13:00:00Z"], "%Y-%m-%dT%H:%M:%SZ"),
+        (
+            ["2023-01-01T12:00:00.123Z", "2023-01-02T13:00:00.456Z"],
+            "%Y-%m-%dT%H:%M:%S.%fZ",
+        ),
+    ],
+)
+def test_format_detection_patterns(data: list[str], expected_format: str):
+    """Each sample list is recognised as its expected strptime format."""
+    assert expected_format == detect_datetime_format(pd.Series(data))
+
+
+def test_edge_cases():
+    """Degenerate ``date`` columns are normalized without raising."""
+    column_values = [
+        [],  # Empty
+        [None, None],  # All nulls
+        ["2023-01-01"],  # Single value
+        pd.to_datetime(["2023-01-01"]),  # Already datetime
+    ]
+
+    for values in column_values:
+        frame = pd.DataFrame({"date": values})
+        date_cols = (DateColumn(col_label="date"),)
+        # Completing the call is the assertion: it should not raise
+        normalize_dttm_col(frame, date_cols)
+
+
+def test_detect_datetime_format_empty_series():
+    """Test detect_datetime_format returns None for empty series after dropping NaN."""
+    # All None values: nothing is left to sample once nulls are dropped
+    series_all_none = pd.Series([None, None, None])
+    assert detect_datetime_format(series_all_none) is None
+
+    # All NaT values are likewise dropped as nulls
+    series_all_nan = pd.Series([pd.NaT, pd.NaT, pd.NaT])
+    assert detect_datetime_format(series_all_nan) is None
+
+    # Series that is empty to begin with
+    series_empty = pd.Series([], dtype=object)
+    assert detect_datetime_format(series_empty) is None
+
+
+def test_datetime_conversion_value_error(caplog, monkeypatch):
+    """Test ValueError during datetime conversion logs a warning.
+
+    Covers the failed-conversion branch of the non-numeric epoch path.
+    """
+    # With an epoch_s format and a non-numeric column, normalize_dttm_col
+    # converts each value through pd.Timestamp. pd.Timestamp is patched
+    # below so that this conversion raises ValueError for the test values,
+    # which should be logged as a warning instead of propagating.
+
+    # Marker type that the patched pd.Timestamp below refuses to convert
+    class BadTimestampValue:
+        def __init__(self, value):
+            self.value = value
+
+        def __repr__(self):
+            return f"BadTimestamp({self.value})"
+
+        def __bool__(self):
+            return True
+
+    # Create DataFrame with values that will fail pd.Timestamp conversion
+    df = pd.DataFrame(
+        {
+            "date": [
+                BadTimestampValue("2023-01-01"),
+                BadTimestampValue("2023-01-02"),
+                BadTimestampValue("2023-01-03"),
+            ]
+        }
+    )
+
+    # Keep the real Timestamp so other values are still converted normally
+    original_timestamp = pd.Timestamp
+
+    def failing_timestamp(value):
+        if isinstance(value, BadTimestampValue):
+            raise ValueError(f"Cannot convert {value} to Timestamp")
+        return original_timestamp(value)
+
+    # Set to epoch format with non-numeric data to trigger the per-value
+    # pd.Timestamp conversion instead of the numeric epoch path
+    date_cols = (DateColumn(col_label="date", timestamp_format="epoch_s"),)
+
+    # Clear any existing log records
+    caplog.clear()
+
+    # Run the function with our patched Timestamp - should log a warning
+    with caplog.at_level("WARNING"):
+        # monkeypatch restores pd.Timestamp automatically after the test
+        monkeypatch.setattr(pd, "Timestamp", failing_timestamp)
+        normalize_dttm_col(df, date_cols)
+
+    # Verify the conversion failure was logged rather than raised
+    assert "Unable to convert column date to datetime, ignoring" in caplog.text
+
+
+def test_warning_suppression():
+    """Mixed numeric and textual date styles surface no inference warning."""
+    values = ["2023-01-01", "01/02/2023", "March 3, 2023"]
+    frame = pd.DataFrame({"date": values})
+    date_cols = (DateColumn(col_label="date"),)
+
+    _, format_warnings = capture_warnings(normalize_dttm_col, frame, date_cols)
+
+    # All format inference warnings are silenced, yet the column is converted
+    assert not format_warnings
+    assert pd.api.types.is_datetime64_any_dtype(frame["date"])