Files
superset2/superset/datasets/datetime_format_detector.py
2025-11-25 14:25:24 -08:00

212 lines
7.5 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Service for detecting datetime formats in dataset columns."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from flask import current_app
from superset.connectors.sqla.models import SqlaTable, TableColumn
from superset.utils.decorators import transaction
from superset.utils.pandas import detect_datetime_format
if TYPE_CHECKING:
from superset.models.core import Database
logger = logging.getLogger(__name__)
class DatetimeFormatDetector:
"""
Service for detecting and storing datetime formats in dataset columns.
This service samples data from datetime columns to detect their format,
reducing the need for runtime format detection on every query.
"""
def __init__(self, sample_size: int | None = None) -> None:
"""
Initialize the datetime format detector.
:param sample_size: Number of rows to sample for format detection
"""
self.sample_size = sample_size or current_app.config.get(
"DATETIME_FORMAT_DETECTION_SAMPLE_SIZE", 1000
)
def detect_column_format(
self,
dataset: SqlaTable,
column: TableColumn,
) -> str | None:
"""
Detect datetime format for a specific column.
:param dataset: The dataset containing the column
:param column: The column to detect format for
:return: Detected format string or None if detection fails
"""
if not column.is_temporal:
logger.debug(
"Column %s is not temporal, skipping format detection",
column.column_name,
)
return None
# Skip expression columns - they don't have stored data to sample
if column.expression:
logger.debug(
"Column %s is an expression column, skipping format detection",
column.column_name,
)
return None
# Skip virtual datasets - they use SQL queries, not physical tables
if dataset.is_virtual:
logger.debug(
"Dataset %s is virtual, skipping format detection for column %s",
dataset.table_name,
column.column_name,
)
return None
try:
# Build SQL query using database's identifier quoting
# Note: Column and table names come from internal metadata, not user input
database: Database = dataset.database
# Get the database engine's dialect for proper identifier quoting
with database.get_sqla_engine() as engine:
dialect = engine.dialect
# Quote identifiers using the dialect's identifier preparer
column_name_quoted = dialect.identifier_preparer.quote(
column.column_name
)
table_name_quoted = dialect.identifier_preparer.quote(
dataset.table_name
)
if dataset.schema:
schema_quoted = dialect.identifier_preparer.quote(dataset.schema)
full_table = f"{schema_quoted}.{table_name_quoted}"
else:
full_table = table_name_quoted
# Build SQL query string with quoted identifiers
# S608: false positive - using dialect's identifier preparer
sql = ( # noqa: S608
f"SELECT {column_name_quoted} FROM {full_table} " # noqa: S608
f"WHERE {column_name_quoted} IS NOT NULL" # noqa: S608
)
# Apply database-specific LIMIT using apply_limit_to_sql
# This handles different SQL dialects (LIMIT, TOP, FETCH FIRST, etc.)
sql = database.apply_limit_to_sql(sql, limit=self.sample_size, force=True)
# Execute query and get results
df = database.get_df(sql, dataset.schema)
if df.empty or column.column_name not in df.columns:
logger.warning(
"No data returned for column %s in dataset %s",
column.column_name,
dataset.table_name,
)
return None
# Detect format using existing utility
series = df[column.column_name]
detected_format = detect_datetime_format(series, self.sample_size)
if detected_format:
logger.info(
"Detected format '%s' for column %s.%s",
detected_format,
dataset.table_name,
column.column_name,
)
else:
logger.warning(
"Could not detect format for column %s.%s",
dataset.table_name,
column.column_name,
)
return detected_format
except Exception as ex:
logger.exception(
"Error detecting format for column %s.%s: %s",
dataset.table_name,
column.column_name,
str(ex),
)
return None
@transaction()
def detect_all_formats(
self,
dataset: SqlaTable,
force: bool = False,
) -> dict[str, str | None]:
"""
Detect datetime formats for all temporal columns in a dataset.
:param dataset: The dataset to process
:param force: If True, re-detect even if format already exists
:return: Dictionary mapping column names to detected formats
"""
results: dict[str, str | None] = {}
for column in dataset.columns:
# Skip if not temporal
if not column.is_temporal:
continue
# Skip if format already exists and not forcing re-detection
if column.datetime_format and not force:
logger.debug(
"Column %s.%s already has format '%s', skipping",
dataset.table_name,
column.column_name,
column.datetime_format,
)
results[column.column_name] = column.datetime_format
continue
# Detect and store format
detected_format = self.detect_column_format(dataset, column)
if detected_format:
column.datetime_format = detected_format
results[column.column_name] = detected_format
else:
results[column.column_name] = None
# Log results
if results:
logger.info(
"Detected formats for %d columns in dataset %s",
len(results),
dataset.table_name,
)
return results