mirror of
https://github.com/apache/superset.git
synced 2026-04-07 18:35:15 +00:00
212 lines
7.5 KiB
Python
212 lines
7.5 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
|
|
"""Service for detecting datetime formats in dataset columns."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import TYPE_CHECKING
|
|
|
|
from flask import current_app
|
|
|
|
from superset.connectors.sqla.models import SqlaTable, TableColumn
|
|
from superset.utils.decorators import transaction
|
|
from superset.utils.pandas import detect_datetime_format
|
|
|
|
if TYPE_CHECKING:
|
|
from superset.models.core import Database
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DatetimeFormatDetector:
    """
    Service for detecting and storing datetime formats in dataset columns.

    This service samples data from datetime columns to detect their format,
    reducing the need for runtime format detection on every query.
    """

    def __init__(self, sample_size: int | None = None) -> None:
        """
        Initialize the datetime format detector.

        :param sample_size: Number of rows to sample for format detection.
            A falsy value (``None`` or ``0``) falls back to the
            ``DATETIME_FORMAT_DETECTION_SAMPLE_SIZE`` config entry, which
            itself defaults to 1000 when it is not configured.
        """
        self.sample_size = sample_size or current_app.config.get(
            "DATETIME_FORMAT_DETECTION_SAMPLE_SIZE", 1000
        )

    def detect_column_format(
        self,
        dataset: SqlaTable,
        column: TableColumn,
    ) -> str | None:
        """
        Detect datetime format for a specific column.

        The column is skipped (``None`` is returned) when it is not temporal,
        when it is an expression column, or when the dataset is virtual.
        Otherwise up to ``self.sample_size`` non-NULL values are read from the
        physical table and handed to ``detect_datetime_format``.

        Detection is best-effort: any exception raised while building or
        running the sampling query, or while detecting the format, is logged
        and swallowed.

        :param dataset: The dataset containing the column
        :param column: The column to detect format for
        :return: Detected format string or None if detection fails
        """
        if not column.is_temporal:
            logger.debug(
                "Column %s is not temporal, skipping format detection",
                column.column_name,
            )
            return None

        # Skip expression columns - they don't have stored data to sample
        if column.expression:
            logger.debug(
                "Column %s is an expression column, skipping format detection",
                column.column_name,
            )
            return None

        # Skip virtual datasets - they use SQL queries, not physical tables
        if dataset.is_virtual:
            logger.debug(
                "Dataset %s is virtual, skipping format detection for column %s",
                dataset.table_name,
                column.column_name,
            )
            return None

        try:
            # Build SQL query using database's identifier quoting
            # Note: Column and table names come from internal metadata, not user input
            database: Database = dataset.database

            # Get the database engine's dialect for proper identifier quoting
            with database.get_sqla_engine() as engine:
                preparer = engine.dialect.identifier_preparer

                # Quote identifiers using the dialect's identifier preparer
                column_name_quoted = preparer.quote(column.column_name)
                table_name_quoted = preparer.quote(dataset.table_name)

                if dataset.schema:
                    schema_quoted = preparer.quote(dataset.schema)
                    full_table = f"{schema_quoted}.{table_name_quoted}"
                else:
                    full_table = table_name_quoted

            # Build SQL query string with quoted identifiers
            # S608: false positive - using dialect's identifier preparer
            sql = (  # noqa: S608
                f"SELECT {column_name_quoted} FROM {full_table} "  # noqa: S608
                f"WHERE {column_name_quoted} IS NOT NULL"  # noqa: S608
            )

            # Apply database-specific LIMIT using apply_limit_to_sql
            # This handles different SQL dialects (LIMIT, TOP, FETCH FIRST, etc.)
            sql = database.apply_limit_to_sql(sql, limit=self.sample_size, force=True)

            # Execute query and get results.
            # Catalog and schema are passed by keyword: ``Database.get_df``
            # takes ``catalog`` ahead of ``schema``, so a lone positional
            # schema would be interpreted as the catalog.
            df = database.get_df(
                sql,
                catalog=dataset.catalog,
                schema=dataset.schema,
            )

            if df.empty or column.column_name not in df.columns:
                logger.warning(
                    "No data returned for column %s in dataset %s",
                    column.column_name,
                    dataset.table_name,
                )
                return None

            # Detect format using existing utility
            series = df[column.column_name]
            detected_format = detect_datetime_format(series, self.sample_size)

            if detected_format:
                logger.info(
                    "Detected format '%s' for column %s.%s",
                    detected_format,
                    dataset.table_name,
                    column.column_name,
                )
            else:
                logger.warning(
                    "Could not detect format for column %s.%s",
                    dataset.table_name,
                    column.column_name,
                )

            return detected_format

        except Exception as ex:
            # Deliberate best-effort boundary: a failed detection must not
            # propagate to the caller, so log with traceback and give up.
            logger.exception(
                "Error detecting format for column %s.%s: %s",
                dataset.table_name,
                column.column_name,
                str(ex),
            )
            return None

    @transaction()
    def detect_all_formats(
        self,
        dataset: SqlaTable,
        force: bool = False,
    ) -> dict[str, str | None]:
        """
        Detect datetime formats for all temporal columns in a dataset.

        For every temporal column a successfully detected format is assigned
        to ``column.datetime_format``. Non-temporal columns are ignored and do
        not appear in the result. Persisting the assignments is left to the
        ``@transaction()`` decorator (presumably it commits on success --
        confirm in ``superset.utils.decorators``).

        :param dataset: The dataset to process
        :param force: If True, re-detect even if format already exists
        :return: Dictionary mapping column names to detected formats; columns
            that were skipped because they already had a format map to that
            existing format, and columns whose detection failed map to None
        """
        results: dict[str, str | None] = {}

        for column in dataset.columns:
            # Skip if not temporal
            if not column.is_temporal:
                continue

            # Skip if format already exists and not forcing re-detection
            if column.datetime_format and not force:
                logger.debug(
                    "Column %s.%s already has format '%s', skipping",
                    dataset.table_name,
                    column.column_name,
                    column.datetime_format,
                )
                results[column.column_name] = column.datetime_format
                continue

            # Detect and store format; an existing format is left untouched
            # when detection fails.
            detected_format = self.detect_column_format(dataset, column)
            if detected_format:
                column.datetime_format = detected_format
            results[column.column_name] = detected_format or None

        # Log results
        if results:
            logger.info(
                "Detected formats for %d columns in dataset %s",
                len(results),
                dataset.table_name,
            )

        return results
|