fix(utils): Suppress pandas date parsing warnings in normalize_dttm_col (#35042)

This commit is contained in:
Amin Ghadersohi
2025-09-08 21:17:33 -04:00
committed by GitHub
parent c5f220a9ff
commit 15e4e8df94
3 changed files with 385 additions and 32 deletions

View File

@@ -34,6 +34,7 @@ import tempfile
import threading
import traceback
import uuid
import warnings
import zlib
from collections.abc import Iterable, Iterator, Sequence
from contextlib import closing, contextmanager
@@ -110,6 +111,7 @@ from superset.utils.backports import StrEnum
from superset.utils.database import get_example_database
from superset.utils.date_parser import parse_human_timedelta
from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str
from superset.utils.pandas import detect_datetime_format
if TYPE_CHECKING:
from superset.connectors.sqla.models import BaseDatasource, TableColumn
@@ -1858,6 +1860,62 @@ class DateColumn:
)
def _process_datetime_column(
df: pd.DataFrame,
col: DateColumn,
) -> None:
"""Process a single datetime column with format detection."""
if col.timestamp_format in ("epoch_s", "epoch_ms"):
dttm_series = df[col.col_label]
if is_numeric_dtype(dttm_series):
# Column is formatted as a numeric value
unit = col.timestamp_format.replace("epoch_", "")
df[col.col_label] = pd.to_datetime(
dttm_series,
utc=False,
unit=unit,
origin="unix",
errors="coerce",
exact=False,
)
else:
# Column has already been formatted as a timestamp.
try:
df[col.col_label] = dttm_series.apply(
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
)
except ValueError:
logger.warning(
"Unable to convert column %s to datetime, ignoring",
col.col_label,
)
else:
# Try to detect format if not specified
format_to_use = col.timestamp_format or detect_datetime_format(
df[col.col_label]
)
# Parse with or without format (suppress warning if no format)
if format_to_use:
df[col.col_label] = pd.to_datetime(
df[col.col_label],
utc=False,
format=format_to_use,
errors="coerce",
exact=False,
)
else:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message=".*Could not infer format.*")
df[col.col_label] = pd.to_datetime(
df[col.col_label],
utc=False,
format=None,
errors="coerce",
exact=False,
)
def normalize_dttm_col(
df: pd.DataFrame,
dttm_cols: tuple[DateColumn, ...] = tuple(), # noqa: C408
@@ -1866,38 +1924,8 @@ def normalize_dttm_col(
if _col.col_label not in df.columns:
continue
if _col.timestamp_format in ("epoch_s", "epoch_ms"):
dttm_series = df[_col.col_label]
if is_numeric_dtype(dttm_series):
# Column is formatted as a numeric value
unit = _col.timestamp_format.replace("epoch_", "")
df[_col.col_label] = pd.to_datetime(
dttm_series,
utc=False,
unit=unit,
origin="unix",
errors="coerce",
exact=False,
)
else:
# Column has already been formatted as a timestamp.
try:
df[_col.col_label] = dttm_series.apply(
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
)
except ValueError:
logger.warning(
"Unable to convert column %s to datetime, ignoring",
_col.col_label,
)
else:
df[_col.col_label] = pd.to_datetime(
df[_col.col_label],
utc=False,
format=_col.timestamp_format,
errors="coerce",
exact=False,
)
_process_datetime_column(df, _col)
if _col.offset:
df[_col.col_label] += timedelta(hours=_col.offset)
if _col.time_shift is not None: