superset2/superset/datasets/datetime_format_detector.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Service for detecting datetime formats in dataset columns."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING

from flask import current_app

from superset.connectors.sqla.models import SqlaTable, TableColumn
from superset.utils.decorators import transaction
from superset.utils.pandas import detect_datetime_format

if TYPE_CHECKING:
    from superset.models.core import Database

logger = logging.getLogger(__name__)


class DatetimeFormatDetector:
    """
    Service for detecting and storing datetime formats in dataset columns.

    This service samples data from datetime columns to detect their format,
    reducing the need for runtime format detection on every query.
    """

    def __init__(self, sample_size: int | None = None) -> None:
        """
        Initialize the datetime format detector.

        :param sample_size: Number of rows to sample for format detection
        """
        self.sample_size = sample_size or current_app.config.get(
            "DATETIME_FORMAT_DETECTION_SAMPLE_SIZE", 1000
        )

    def detect_column_format(
        self,
        dataset: SqlaTable,
        column: TableColumn,
    ) -> str | None:
        """
        Detect datetime format for a specific column.

        :param dataset: The dataset containing the column
        :param column: The column to detect format for
        :return: Detected format string or None if detection fails
        """
        if not column.is_temporal:
            logger.debug(
                "Column %s is not temporal, skipping format detection",
                column.column_name,
            )
            return None

        # Skip expression columns - they don't have stored data to sample
        if column.expression:
            logger.debug(
                "Column %s is an expression column, skipping format detection",
                column.column_name,
            )
            return None

        # Skip virtual datasets - they use SQL queries, not physical tables
        if dataset.is_virtual:
            logger.debug(
                "Dataset %s is virtual, skipping format detection for column %s",
                dataset.table_name,
                column.column_name,
            )
            return None

        try:
            # Build SQL query using database's identifier quoting
            # Note: Column and table names come from internal metadata, not user input
            database: Database = dataset.database

            # Get the database engine's dialect for proper identifier quoting
            with database.get_sqla_engine() as engine:
                dialect = engine.dialect

                # Quote identifiers using the dialect's identifier preparer
                column_name_quoted = dialect.identifier_preparer.quote(
                    column.column_name
                )
                table_name_quoted = dialect.identifier_preparer.quote(
                    dataset.table_name
                )

                if dataset.schema:
                    schema_quoted = dialect.identifier_preparer.quote(dataset.schema)
                    full_table = f"{schema_quoted}.{table_name_quoted}"
                else:
                    full_table = table_name_quoted

                # Build SQL query string with quoted identifiers
                # S608: false positive - using dialect's identifier preparer
                sql = (  # noqa: S608
                    f"SELECT {column_name_quoted} FROM {full_table} "  # noqa: S608
                    f"WHERE {column_name_quoted} IS NOT NULL"  # noqa: S608
                )

            # Apply database-specific LIMIT using apply_limit_to_sql
            # This handles different SQL dialects (LIMIT, TOP, FETCH FIRST, etc.)
            sql = database.apply_limit_to_sql(sql, limit=self.sample_size, force=True)

            # Execute query and get results
            df = database.get_df(sql, dataset.schema)

            if df.empty or column.column_name not in df.columns:
                logger.warning(
                    "No data returned for column %s in dataset %s",
                    column.column_name,
                    dataset.table_name,
                )
                return None

            # Detect format using existing utility
            series = df[column.column_name]
            detected_format = detect_datetime_format(series, self.sample_size)

            if detected_format:
                logger.info(
                    "Detected format '%s' for column %s.%s",
                    detected_format,
                    dataset.table_name,
                    column.column_name,
                )
            else:
                logger.warning(
                    "Could not detect format for column %s.%s",
                    dataset.table_name,
                    column.column_name,
                )

            return detected_format

        except Exception as ex:
            logger.exception(
                "Error detecting format for column %s.%s: %s",
                dataset.table_name,
                column.column_name,
                str(ex),
            )
            return None

    @transaction()
    def detect_all_formats(
        self,
        dataset: SqlaTable,
        force: bool = False,
    ) -> dict[str, str | None]:
        """
        Detect datetime formats for all temporal columns in a dataset.

        :param dataset: The dataset to process
        :param force: If True, re-detect even if format already exists
        :return: Dictionary mapping column names to detected formats
        """
        results: dict[str, str | None] = {}

        for column in dataset.columns:
            # Skip if not temporal
            if not column.is_temporal:
                continue

            # Skip if format already exists and not forcing re-detection
            if column.datetime_format and not force:
                logger.debug(
                    "Column %s.%s already has format '%s', skipping",
                    dataset.table_name,
                    column.column_name,
                    column.datetime_format,
                )
                results[column.column_name] = column.datetime_format
                continue

            # Detect and store format
            detected_format = self.detect_column_format(dataset, column)
            if detected_format:
                column.datetime_format = detected_format
                results[column.column_name] = detected_format
            else:
                results[column.column_name] = None

        # Log results
        if results:
            logger.info(
                "Detected formats for %d columns in dataset %s",
                len(results),
                dataset.table_name,
            )

        return results