mirror of
https://github.com/apache/superset.git
synced 2026-04-18 23:55:00 +00:00
fix(utils): Suppress pandas date parsing warnings in normalize_dttm_col (#35042)
This commit is contained in:
@@ -34,6 +34,7 @@ import tempfile
|
||||
import threading
|
||||
import traceback
|
||||
import uuid
|
||||
import warnings
|
||||
import zlib
|
||||
from collections.abc import Iterable, Iterator, Sequence
|
||||
from contextlib import closing, contextmanager
|
||||
@@ -110,6 +111,7 @@ from superset.utils.backports import StrEnum
|
||||
from superset.utils.database import get_example_database
|
||||
from superset.utils.date_parser import parse_human_timedelta
|
||||
from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str
|
||||
from superset.utils.pandas import detect_datetime_format
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from superset.connectors.sqla.models import BaseDatasource, TableColumn
|
||||
@@ -1858,6 +1860,62 @@ class DateColumn:
|
||||
)
|
||||
|
||||
|
||||
def _process_datetime_column(
    df: pd.DataFrame,
    col: DateColumn,
) -> None:
    """
    Convert one column of ``df`` to datetimes, in place.

    Epoch columns (``timestamp_format`` of ``"epoch_s"`` or ``"epoch_ms"``)
    holding numeric values are converted with the matching unit. Epoch columns
    holding non-numeric values are converted value by value with
    ``pd.Timestamp``; if any value cannot be converted, a warning is logged and
    the column is left as it was.

    Every other column is parsed with ``pd.to_datetime`` using, in order of
    preference: the explicit ``timestamp_format``, a format detected from the
    data, or pandas' own inference with the "Could not infer format" warning
    suppressed. Values that fail to parse are coerced to ``NaT``.

    :param df: DataFrame holding the column; modified in place
    :param col: Column descriptor; ``col_label`` and ``timestamp_format`` are read
    """
    label = col.col_label

    if col.timestamp_format in ("epoch_s", "epoch_ms"):
        dttm_series = df[label]

        if is_numeric_dtype(dttm_series):
            # Column is formatted as a numeric value:
            # "epoch_s" -> unit "s", "epoch_ms" -> unit "ms"
            unit = col.timestamp_format.replace("epoch_", "")
            df[label] = pd.to_datetime(
                dttm_series,
                utc=False,
                unit=unit,
                origin="unix",
                errors="coerce",
                exact=False,
            )
            return

        # Column has already been formatted as a timestamp.
        try:
            df[label] = dttm_series.apply(
                lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
            )
        except (ValueError, TypeError):
            # pd.Timestamp reports unconvertible input with ValueError
            # (e.g. unparseable strings) or TypeError (unsupported object
            # types); both mean the same thing here, so neither may escape.
            logger.warning(
                "Unable to convert column %s to datetime, ignoring",
                label,
            )
        return

    # An explicit format wins; otherwise try to detect one from the data.
    format_to_use = col.timestamp_format or detect_datetime_format(df[label])

    if format_to_use:
        df[label] = pd.to_datetime(
            df[label],
            utc=False,
            format=format_to_use,
            errors="coerce",
            exact=False,
        )
        return

    # No usable format: let pandas infer per value, but silence the warning it
    # emits when it cannot settle on a single format.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*Could not infer format.*")
        df[label] = pd.to_datetime(
            df[label],
            utc=False,
            format=None,
            errors="coerce",
            exact=False,
        )
|
||||
|
||||
|
||||
def normalize_dttm_col(
|
||||
df: pd.DataFrame,
|
||||
dttm_cols: tuple[DateColumn, ...] = tuple(), # noqa: C408
|
||||
@@ -1866,38 +1924,8 @@ def normalize_dttm_col(
|
||||
if _col.col_label not in df.columns:
|
||||
continue
|
||||
|
||||
if _col.timestamp_format in ("epoch_s", "epoch_ms"):
|
||||
dttm_series = df[_col.col_label]
|
||||
if is_numeric_dtype(dttm_series):
|
||||
# Column is formatted as a numeric value
|
||||
unit = _col.timestamp_format.replace("epoch_", "")
|
||||
df[_col.col_label] = pd.to_datetime(
|
||||
dttm_series,
|
||||
utc=False,
|
||||
unit=unit,
|
||||
origin="unix",
|
||||
errors="coerce",
|
||||
exact=False,
|
||||
)
|
||||
else:
|
||||
# Column has already been formatted as a timestamp.
|
||||
try:
|
||||
df[_col.col_label] = dttm_series.apply(
|
||||
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
|
||||
)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"Unable to convert column %s to datetime, ignoring",
|
||||
_col.col_label,
|
||||
)
|
||||
else:
|
||||
df[_col.col_label] = pd.to_datetime(
|
||||
df[_col.col_label],
|
||||
utc=False,
|
||||
format=_col.timestamp_format,
|
||||
errors="coerce",
|
||||
exact=False,
|
||||
)
|
||||
_process_datetime_column(df, _col)
|
||||
|
||||
if _col.offset:
|
||||
df[_col.col_label] += timedelta(hours=_col.offset)
|
||||
if _col.time_shift is not None:
|
||||
|
||||
69
superset/utils/pandas.py
Normal file
69
superset/utils/pandas.py
Normal file
@@ -0,0 +1,69 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Pandas utilities for data processing."""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def detect_datetime_format(series: pd.Series, sample_size: int = 100) -> str | None:
|
||||
"""
|
||||
Detect the datetime format from a sample of the series.
|
||||
|
||||
:param series: The pandas Series to analyze
|
||||
:param sample_size: Number of rows to sample for format detection
|
||||
:return: Detected format string or None if no consistent format found
|
||||
"""
|
||||
# Most common formats first for performance
|
||||
common_formats = [
|
||||
"%Y-%m-%d %H:%M:%S",
|
||||
"%Y-%m-%d",
|
||||
"%Y-%m-%dT%H:%M:%S",
|
||||
"%Y-%m-%dT%H:%M:%SZ",
|
||||
"%Y-%m-%dT%H:%M:%S.%f",
|
||||
"%Y-%m-%dT%H:%M:%S.%fZ",
|
||||
"%m/%d/%Y",
|
||||
"%d/%m/%Y",
|
||||
"%Y/%m/%d",
|
||||
"%m/%d/%Y %H:%M:%S",
|
||||
"%d/%m/%Y %H:%M:%S",
|
||||
"%m-%d-%Y",
|
||||
"%d-%m-%Y",
|
||||
"%Y%m%d",
|
||||
]
|
||||
|
||||
# Get non-null sample
|
||||
sample = series.dropna().head(sample_size)
|
||||
if sample.empty:
|
||||
return None
|
||||
|
||||
# Convert to string if not already
|
||||
if not pd.api.types.is_string_dtype(sample):
|
||||
sample = sample.astype(str)
|
||||
|
||||
# Try each format
|
||||
for fmt in common_formats:
|
||||
try:
|
||||
# Test on small sample first
|
||||
test_sample = sample.head(10)
|
||||
pd.to_datetime(test_sample, format=fmt, errors="raise")
|
||||
# If successful, verify on larger sample
|
||||
pd.to_datetime(sample, format=fmt, errors="raise")
|
||||
return fmt
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
|
||||
return None
|
||||
256
tests/unit_tests/utils/test_date_parsing.py
Normal file
256
tests/unit_tests/utils/test_date_parsing.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""Tests for datetime format detection and warning suppression."""
|
||||
|
||||
import warnings
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from superset.utils.core import DateColumn, normalize_dttm_col
|
||||
from superset.utils.pandas import detect_datetime_format
|
||||
|
||||
|
||||
def capture_warnings(func, *args, **kwargs):
    """
    Call ``func(*args, **kwargs)`` while recording the warnings it emits.

    :return: ``(result, messages)`` where ``messages`` holds the text of each
        recorded warning that contains "Could not infer format"
    """
    with warnings.catch_warnings(record=True) as recorded:
        # Record every warning, regardless of filters already in effect
        warnings.simplefilter("always")
        outcome = func(*args, **kwargs)

    texts = (str(entry.message) for entry in recorded)
    return outcome, [text for text in texts if "Could not infer format" in text]
|
||||
|
||||
|
||||
def test_detect_datetime_format():
    """Check the detected format for common, mixed, empty and all-null inputs."""
    expectations = [
        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
        (["2023-01-01 12:00:00", "2023-01-02 13:00:00"], "%Y-%m-%d %H:%M:%S"),
        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
        (["2023-01-01", "01/02/2023"], None),  # Mixed formats
        ([], None),  # Empty
        ([None, None], None),  # All nulls
    ]

    for values, expected_format in expectations:
        detected = detect_datetime_format(pd.Series(values))
        assert detected == expected_format
|
||||
|
||||
|
||||
def test_no_warnings_with_consistent_formats():
    """Consistently formatted columns parse without a format-inference warning."""
    frame = pd.DataFrame(
        {
            "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "datetime": [
                "2023-01-01 12:00:00",
                "2023-01-02 13:00:00",
                "2023-01-03 14:00:00",
            ],
        }
    )
    columns = (
        DateColumn(col_label="date"),
        DateColumn(col_label="datetime"),
    )

    _, inference_warnings = capture_warnings(normalize_dttm_col, frame, columns)
    assert not inference_warnings

    # Both columns must now hold datetimes with the expected values
    for name in ("date", "datetime"):
        assert pd.api.types.is_datetime64_any_dtype(frame[name])
    assert frame["date"].iloc[0] == pd.Timestamp("2023-01-01")
|
||||
|
||||
|
||||
def test_explicit_format_respected():
    """A format given on the DateColumn is the one used for parsing."""
    frame = pd.DataFrame({"date": ["01/15/2023", "02/20/2023"]})
    month_first = DateColumn(col_label="date", timestamp_format="%m/%d/%Y")

    normalize_dttm_col(frame, (month_first,))

    parsed = frame["date"]
    assert pd.api.types.is_datetime64_any_dtype(parsed)
    assert parsed.iloc[0] == pd.Timestamp("2023-01-15")
|
||||
|
||||
|
||||
def test_mixed_formats_suppressed():
    """Mixed-format data still parses and emits no format-inference warning."""
    values = ["2023-01-01", "01/02/2023", "2023-03-01 12:00:00"]
    frame = pd.DataFrame({"mixed": values})
    columns = (DateColumn(col_label="mixed"),)

    _, inference_warnings = capture_warnings(normalize_dttm_col, frame, columns)

    assert not inference_warnings
    assert pd.api.types.is_datetime64_any_dtype(frame["mixed"])
|
||||
|
||||
|
||||
def test_epoch_format():
    """Numeric epoch-second values are converted to datetimes."""
    # 1672531200 is 2023-01-01, 1672617600 is 2023-01-02
    frame = pd.DataFrame({"epoch": [1672531200, 1672617600]})
    epoch_seconds = DateColumn(col_label="epoch", timestamp_format="epoch_s")

    normalize_dttm_col(frame, (epoch_seconds,))

    converted = frame["epoch"]
    assert pd.api.types.is_datetime64_any_dtype(converted)
    assert converted.iloc[0] == pd.Timestamp("2023-01-01")
|
||||
|
||||
|
||||
def test_epoch_format_invalid_values(caplog):
    """Non-numeric values in an epoch column log a warning and stay untouched."""
    frame = pd.DataFrame({"epoch": ["not_a_number", "invalid", "abc"]})
    columns = (DateColumn(col_label="epoch", timestamp_format="epoch_s"),)

    # Start from a clean slate so only this call's records are inspected
    caplog.clear()
    with caplog.at_level("WARNING"):
        normalize_dttm_col(frame, columns)

    assert "Unable to convert column epoch to datetime, ignoring" in caplog.text

    # A failed conversion must leave the original strings in place
    untouched = frame["epoch"]
    assert untouched.dtype == object
    assert untouched.iloc[0] == "not_a_number"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("data", "expected_format"),
    [
        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
        (["2023-01-01T12:00:00Z", "2023-01-02T13:00:00Z"], "%Y-%m-%dT%H:%M:%SZ"),
        (
            ["2023-01-01T12:00:00.123Z", "2023-01-02T13:00:00.456Z"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ),
    ],
)
def test_format_detection_patterns(data: list[str], expected_format: str):
    """Each sample list is detected as its expected strftime pattern."""
    detected = detect_datetime_format(pd.Series(data))
    assert detected == expected_format
|
||||
|
||||
|
||||
def test_edge_cases():
    """Empty, all-null, single-value and already-datetime columns do not raise."""
    frames = (
        pd.DataFrame({"date": []}),  # Empty
        pd.DataFrame({"date": [None, None]}),  # All nulls
        pd.DataFrame({"date": ["2023-01-01"]}),  # Single value
        pd.DataFrame({"date": pd.to_datetime(["2023-01-01"])}),  # Already datetime
    )

    for frame in frames:
        working_copy = frame.copy()
        columns = (DateColumn(col_label="date"),)
        # The test passes as long as this call completes without an exception
        normalize_dttm_col(working_copy, columns)
|
||||
|
||||
|
||||
def test_detect_datetime_format_empty_series():
    """detect_datetime_format returns None when no non-null values remain."""
    nothing_to_sample = (
        pd.Series([None, None, None]),  # all None
        pd.Series([pd.NaT, pd.NaT, pd.NaT]),  # all NaT
        pd.Series([], dtype=object),  # no rows at all
    )

    for series in nothing_to_sample:
        assert detect_datetime_format(series) is None
|
||||
|
||||
|
||||
def test_datetime_conversion_value_error(caplog, monkeypatch):
    """A ValueError from pd.Timestamp on an epoch column is logged, not raised.

    The column is declared as ``epoch_s`` but holds non-numeric objects, so the
    per-value ``pd.Timestamp`` conversion is attempted; ``pd.Timestamp`` is
    patched to raise ``ValueError`` for those objects.
    """

    class BadTimestampValue:
        """Opaque value that the patched pd.Timestamp refuses to convert."""

        def __init__(self, value):
            self.value = value

        def __repr__(self):
            return f"BadTimestamp({self.value})"

        def __bool__(self):
            return True

    frame = pd.DataFrame(
        {
            "date": [
                BadTimestampValue(day)
                for day in ("2023-01-01", "2023-01-02", "2023-01-03")
            ]
        }
    )

    # Keep a handle on the real constructor so other inputs still convert
    real_timestamp = pd.Timestamp

    def reject_bad_values(value):
        if isinstance(value, BadTimestampValue):
            raise ValueError(f"Cannot convert {value} to Timestamp")
        return real_timestamp(value)

    # Epoch format plus non-numeric data selects the per-value conversion path
    columns = (DateColumn(col_label="date", timestamp_format="epoch_s"),)

    caplog.clear()
    with caplog.at_level("WARNING"):
        monkeypatch.setattr(pd, "Timestamp", reject_bad_values)
        normalize_dttm_col(frame, columns)

    assert "Unable to convert column date to datetime, ignoring" in caplog.text
|
||||
|
||||
|
||||
def test_warning_suppression():
    """Mixed formats, including a spelled-out month, parse without warnings."""
    frame = pd.DataFrame({"date": ["2023-01-01", "01/02/2023", "March 3, 2023"]})
    columns = (DateColumn(col_label="date"),)

    _, inference_warnings = capture_warnings(normalize_dttm_col, frame, columns)

    # No format-inference warning may leak out ...
    assert not inference_warnings
    # ... and the column must still end up as datetimes
    assert pd.api.types.is_datetime64_any_dtype(frame["date"])
|
||||
Reference in New Issue
Block a user