# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for datetime format detection and warning suppression."""

import warnings
from collections.abc import Callable
from typing import Any

import pandas as pd
import pytest

from superset.utils.core import DateColumn, normalize_dttm_col
from superset.utils.pandas import detect_datetime_format


def capture_warnings(
    func: Callable[..., Any], *args: Any, **kwargs: Any
) -> tuple[Any, list[str]]:
    """Call ``func`` and collect the format-inference warnings it emits.

    All warnings raised during the call are recorded (the filter is set to
    ``"always"`` so none are deduplicated away), then narrowed down to those
    whose message contains ``"Could not infer format"``.

    :param func: callable to execute
    :param args: positional arguments forwarded to ``func``
    :param kwargs: keyword arguments forwarded to ``func``
    :returns: ``(result, messages)`` where ``result`` is whatever ``func``
        returned and ``messages`` are the matching warning messages as strings
    """
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        result = func(*args, **kwargs)

    format_warnings = [
        str(warning.message)
        for warning in caught
        if "Could not infer format" in str(warning.message)
    ]
    return result, format_warnings


@pytest.mark.parametrize(
    "data,expected",
    [
        pytest.param(["2023-01-01", "2023-01-02"], "%Y-%m-%d", id="iso-date"),
        pytest.param(
            ["2023-01-01 12:00:00", "2023-01-02 13:00:00"],
            "%Y-%m-%d %H:%M:%S",
            id="iso-datetime",
        ),
        pytest.param(["01/15/2023", "02/20/2023"], "%m/%d/%Y", id="us-date"),
        pytest.param(["2023-01-01", "01/02/2023"], None, id="mixed-formats"),
        pytest.param([], None, id="empty"),
        pytest.param([None, None], None, id="all-nulls"),
    ],
)
def test_detect_datetime_format(data: list[Any], expected: str | None) -> None:
    """Test format detection for common datetime patterns.

    Parametrized so that each input is reported as its own test case instead
    of the first failing input hiding the remaining ones.
    """
    assert detect_datetime_format(pd.Series(data)) == expected


def test_no_warnings_with_consistent_formats() -> None:
    """Verify no warnings are produced for consistent date formats."""
    df = pd.DataFrame(
        {
            "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "datetime": [
                "2023-01-01 12:00:00",
                "2023-01-02 13:00:00",
                "2023-01-03 14:00:00",
            ],
        }
    )
    date_cols = (
        DateColumn(col_label="date"),
        DateColumn(col_label="datetime"),
    )

    _, warnings_list = capture_warnings(normalize_dttm_col, df, date_cols)
    assert len(warnings_list) == 0

    # Verify parsing worked (normalize_dttm_col converts the columns in place)
    assert pd.api.types.is_datetime64_any_dtype(df["date"])
    assert pd.api.types.is_datetime64_any_dtype(df["datetime"])
    assert df["date"].iloc[0] == pd.Timestamp("2023-01-01")


def test_explicit_format_respected() -> None:
    """Verify explicit formats are still used when provided."""
    df = pd.DataFrame({"date": ["01/15/2023", "02/20/2023"]})
    date_cols = (DateColumn(col_label="date", timestamp_format="%m/%d/%Y"),)

    normalize_dttm_col(df, date_cols)

    assert pd.api.types.is_datetime64_any_dtype(df["date"])
    assert df["date"].iloc[0] == pd.Timestamp("2023-01-15")


def test_mixed_formats_suppressed() -> None:
    """Verify warnings are suppressed for mixed format data."""
    df = pd.DataFrame(
        {
            "mixed": ["2023-01-01", "01/02/2023", "2023-03-01 12:00:00"],
        }
    )
    date_cols = (DateColumn(col_label="mixed"),)

    _, warnings_list = capture_warnings(normalize_dttm_col, df, date_cols)

    assert len(warnings_list) == 0
    assert pd.api.types.is_datetime64_any_dtype(df["mixed"])


def test_epoch_format() -> None:
    """Verify epoch timestamp handling works correctly."""
    df = pd.DataFrame({"epoch": [1672531200, 1672617600]})  # 2023-01-01, 2023-01-02
    date_cols = (DateColumn(col_label="epoch", timestamp_format="epoch_s"),)

    normalize_dttm_col(df, date_cols)

    assert pd.api.types.is_datetime64_any_dtype(df["epoch"])
    assert df["epoch"].iloc[0] == pd.Timestamp("2023-01-01")


def test_epoch_format_invalid_values(caplog: pytest.LogCaptureFixture) -> None:
    """Test epoch format with invalid values triggers warning."""
    # Non-numeric values that can't be converted to epoch
    df = pd.DataFrame({"epoch": ["not_a_number", "invalid", "abc"]})
    date_cols = (DateColumn(col_label="epoch", timestamp_format="epoch_s"),)

    # Clear any existing log records
    caplog.clear()

    # Run the function - should log a warning
    with caplog.at_level("WARNING"):
        normalize_dttm_col(df, date_cols)

    # Verify warning was logged
    assert "Unable to convert column epoch to datetime, ignoring" in caplog.text

    # The column should remain unchanged when conversion fails
    assert df["epoch"].dtype == object
    assert df["epoch"].iloc[0] == "not_a_number"


@pytest.mark.parametrize(
    "data,expected_format",
    [
        (["2023-01-01", "2023-01-02"], "%Y-%m-%d"),
        (["01/15/2023", "02/20/2023"], "%m/%d/%Y"),
        (["2023-01-01T12:00:00Z", "2023-01-02T13:00:00Z"], "%Y-%m-%dT%H:%M:%SZ"),
        (
            ["2023-01-01T12:00:00.123Z", "2023-01-02T13:00:00.456Z"],
            "%Y-%m-%dT%H:%M:%S.%fZ",
        ),
    ],
)
def test_format_detection_patterns(data: list[str], expected_format: str) -> None:
    """Test detection of various datetime formats."""
    assert detect_datetime_format(pd.Series(data)) == expected_format


@pytest.mark.parametrize(
    "df",
    [
        pytest.param(pd.DataFrame({"date": []}), id="empty"),
        pytest.param(pd.DataFrame({"date": [None, None]}), id="all-nulls"),
        pytest.param(pd.DataFrame({"date": ["2023-01-01"]}), id="single-value"),
        pytest.param(
            pd.DataFrame({"date": pd.to_datetime(["2023-01-01"])}),
            id="already-datetime",
        ),
    ],
)
def test_edge_cases(df: pd.DataFrame) -> None:
    """Test handling of edge cases.

    Each frame is normalized on a copy, because the parametrized frames are
    built once at collection time and ``normalize_dttm_col`` is applied to
    the frame it is given.
    """
    df_copy = df.copy()
    date_cols = (DateColumn(col_label="date"),)

    # Should not raise
    normalize_dttm_col(df_copy, date_cols)

    # Normalizing a column must not add or drop rows
    assert len(df_copy) == len(df)


def test_detect_datetime_format_empty_series() -> None:
    """Test detect_datetime_format returns None when there are no usable values."""
    # All None values
    series_all_none = pd.Series([None, None, None])
    assert detect_datetime_format(series_all_none) is None

    # All NaT values
    series_all_nat = pd.Series([pd.NaT, pd.NaT, pd.NaT])
    assert detect_datetime_format(series_all_nat) is None

    # Empty series
    series_empty = pd.Series([], dtype=object)
    assert detect_datetime_format(series_empty) is None


def test_datetime_conversion_value_error(
    caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Test ValueError during datetime conversion logs a warning.

    The column uses the ``epoch_s`` format but holds non-numeric objects, and
    ``pd.Timestamp`` is patched to raise ``ValueError`` for those objects, so
    the conversion is expected to fail and be reported through the
    "Unable to convert column ..." warning.
    """

    class BadTimestampValue:
        """Opaque value that the patched ``pd.Timestamp`` refuses to convert."""

        def __init__(self, value: str) -> None:
            self.value = value

        def __repr__(self) -> str:
            return f"BadTimestamp({self.value})"

    # DataFrame whose values will fail pd.Timestamp conversion
    df = pd.DataFrame(
        {
            "date": [
                BadTimestampValue("2023-01-01"),
                BadTimestampValue("2023-01-02"),
                BadTimestampValue("2023-01-03"),
            ]
        }
    )

    # Keep a handle on the real constructor so other values still convert
    original_timestamp = pd.Timestamp

    def failing_timestamp(value: Any) -> Any:
        if isinstance(value, BadTimestampValue):
            raise ValueError(f"Cannot convert {value} to Timestamp")
        return original_timestamp(value)

    # Epoch format combined with non-numeric data
    date_cols = (DateColumn(col_label="date", timestamp_format="epoch_s"),)

    # Clear any existing log records
    caplog.clear()

    # Run the function with the patched Timestamp - should log a warning.
    # monkeypatch restores pd.Timestamp automatically when the test ends.
    with caplog.at_level("WARNING"):
        monkeypatch.setattr(pd, "Timestamp", failing_timestamp)
        normalize_dttm_col(df, date_cols)

    # Verify warning was logged
    assert "Unable to convert column date to datetime, ignoring" in caplog.text


def test_warning_suppression() -> None:
    """Verify our implementation suppresses warnings for mixed formats."""
    df = pd.DataFrame({"date": ["2023-01-01", "01/02/2023", "March 3, 2023"]})

    _, warnings_list = capture_warnings(
        normalize_dttm_col, df, (DateColumn(col_label="date"),)
    )

    assert len(warnings_list) == 0  # Should suppress all format inference warnings
    assert pd.api.types.is_datetime64_any_dtype(df["date"])  # Should still parse dates