mirror of
https://github.com/apache/superset.git
synced 2026-04-18 23:55:00 +00:00
feat(timeshift): Add support for date range timeshifts (#34375)
This commit is contained in:
committed by
GitHub
parent
407fb67f1e
commit
761daec53d
@@ -46,7 +46,7 @@ from superset.exceptions import (
|
||||
QueryObjectValidationError,
|
||||
SupersetException,
|
||||
)
|
||||
from superset.extensions import cache_manager, security_manager
|
||||
from superset.extensions import cache_manager, feature_flag_manager, security_manager
|
||||
from superset.models.helpers import QueryResult
|
||||
from superset.models.sql_lab import Query
|
||||
from superset.superset_typing import AdhocColumn, AdhocMetric
|
||||
@@ -67,6 +67,7 @@ from superset.utils.core import (
|
||||
is_adhoc_column,
|
||||
is_adhoc_metric,
|
||||
normalize_dttm_col,
|
||||
QueryObjectFilterClause,
|
||||
TIME_COMPARISON,
|
||||
)
|
||||
from superset.utils.date_parser import get_past_or_future, normalize_time_delta
|
||||
@@ -138,6 +139,10 @@ class QueryContextProcessor:
|
||||
force_cached=force_cached,
|
||||
)
|
||||
|
||||
if query_obj:
|
||||
# Always validate the query object before processing
|
||||
query_obj.validate()
|
||||
|
||||
if query_obj and cache_key and not cache.is_loaded:
|
||||
try:
|
||||
if invalid_columns := [
|
||||
@@ -473,26 +478,19 @@ class QueryContextProcessor:
|
||||
)
|
||||
|
||||
time_grain = self.get_time_grain(query_object)
|
||||
|
||||
metric_names = get_metric_names(query_object.metrics)
|
||||
|
||||
# use columns that are not metrics as join keys
|
||||
join_keys = [col for col in df.columns if col not in metric_names]
|
||||
|
||||
for offset in query_object.time_offsets:
|
||||
try:
|
||||
# pylint: disable=line-too-long
|
||||
# Since the x-axis is also a column name for the time filter, x_axis_label will be set as granularity # noqa: E501
|
||||
# these query object are equivalent:
|
||||
# 1) { granularity: 'dttm_col', time_range: '2020 : 2021', time_offsets: ['1 year ago']} # noqa: E501
|
||||
# 2) { columns: [
|
||||
# {label: 'dttm_col', sqlExpression: 'dttm_col', "columnType": "BASE_AXIS" } # noqa: E501
|
||||
# ],
|
||||
# time_offsets: ['1 year ago'],
|
||||
# filters: [{col: 'dttm_col', op: 'TEMPORAL_RANGE', val: '2020 : 2021'}], # noqa: E501
|
||||
# }
|
||||
original_offset = offset
|
||||
if self.is_valid_date_range(offset):
|
||||
is_date_range_offset = self.is_valid_date_range(offset)
|
||||
|
||||
if is_date_range_offset and feature_flag_manager.is_feature_enabled(
|
||||
"DATE_RANGE_TIMESHIFTS_ENABLED"
|
||||
):
|
||||
# DATE RANGE OFFSET LOGIC (like "2015-01-03 : 2015-01-04")
|
||||
try:
|
||||
# Parse the specified range
|
||||
offset_from_dttm, offset_to_dttm = (
|
||||
@@ -504,7 +502,23 @@ class QueryContextProcessor:
|
||||
# Use the specified range directly
|
||||
query_object_clone.from_dttm = offset_from_dttm
|
||||
query_object_clone.to_dttm = offset_to_dttm
|
||||
|
||||
# For date range offsets, we must NOT set inner bounds
|
||||
# These create additional WHERE clauses that conflict with our
|
||||
# date range
|
||||
query_object_clone.inner_from_dttm = None
|
||||
query_object_clone.inner_to_dttm = None
|
||||
|
||||
elif is_date_range_offset:
|
||||
# Date range timeshift feature is disabled
|
||||
raise QueryObjectValidationError(
|
||||
"Date range timeshifts are not enabled. "
|
||||
"Please contact your administrator to enable the "
|
||||
"DATE_RANGE_TIMESHIFTS_ENABLED feature flag."
|
||||
)
|
||||
|
||||
else:
|
||||
# RELATIVE OFFSET LOGIC (like "1 day ago")
|
||||
if self.is_valid_date(offset) or offset == "inherit":
|
||||
offset = self.get_offset_custom_or_inherit(
|
||||
offset,
|
||||
@@ -519,34 +533,64 @@ class QueryContextProcessor:
|
||||
offset, outer_to_dttm
|
||||
)
|
||||
|
||||
query_object_clone.inner_from_dttm = query_object_clone.from_dttm
|
||||
query_object_clone.inner_to_dttm = query_object_clone.to_dttm
|
||||
|
||||
x_axis_label = get_x_axis_label(query_object.columns)
|
||||
query_object_clone.granularity = (
|
||||
query_object_clone.granularity or x_axis_label
|
||||
)
|
||||
|
||||
except ValueError as ex:
|
||||
raise QueryObjectValidationError(str(ex)) from ex
|
||||
# make sure subquery use main query where clause
|
||||
query_object_clone.inner_from_dttm = outer_from_dttm
|
||||
query_object_clone.inner_to_dttm = outer_to_dttm
|
||||
|
||||
query_object_clone.time_offsets = []
|
||||
query_object_clone.post_processing = []
|
||||
|
||||
# Get time offset index
|
||||
index = (get_base_axis_labels(query_object.columns) or [DTTM_ALIAS])[0]
|
||||
# The comparison is not using a temporal column so we need to modify
|
||||
# the temporal filter so we run the query with the correct time range
|
||||
if not dataframe_utils.is_datetime_series(df.get(index)):
|
||||
# Lets find the first temporal filter in the filters array and change
|
||||
# its val to be the result of get_since_until with the offset
|
||||
for flt in query_object_clone.filter:
|
||||
if flt.get("op") == FilterOperator.TEMPORAL_RANGE and isinstance(
|
||||
flt.get("val"), str
|
||||
):
|
||||
time_range = cast(str, flt.get("val"))
|
||||
if self.is_valid_date_range(offset):
|
||||
flt["val"] = (
|
||||
f"{query_object_clone.from_dttm} : {query_object_clone.to_dttm}" # noqa: E501
|
||||
)
|
||||
else:
|
||||
|
||||
# Handle temporal filters
|
||||
if is_date_range_offset and feature_flag_manager.is_feature_enabled(
|
||||
"DATE_RANGE_TIMESHIFTS_ENABLED"
|
||||
):
|
||||
# Create a completely new filter list to avoid conflicts
|
||||
query_object_clone.filter = copy.deepcopy(query_object_clone.filter)
|
||||
|
||||
# Remove any existing temporal filters that might conflict
|
||||
query_object_clone.filter = [
|
||||
flt
|
||||
for flt in query_object_clone.filter
|
||||
if not (flt.get("op") == FilterOperator.TEMPORAL_RANGE)
|
||||
]
|
||||
|
||||
# Add our specific temporal filter
|
||||
temporal_col = query_object_clone.granularity or x_axis_label
|
||||
if temporal_col:
|
||||
new_temporal_filter: QueryObjectFilterClause = {
|
||||
"col": temporal_col,
|
||||
"op": FilterOperator.TEMPORAL_RANGE,
|
||||
"val": (
|
||||
f"{query_object_clone.from_dttm} : "
|
||||
f"{query_object_clone.to_dttm}"
|
||||
),
|
||||
}
|
||||
query_object_clone.filter.append(new_temporal_filter)
|
||||
|
||||
else:
|
||||
# The comparison is not using a temporal column so we need to modify
|
||||
# the temporal filter so we run the query with the correct time range
|
||||
if not dataframe_utils.is_datetime_series(df.get(index)):
|
||||
query_object_clone.filter = copy.deepcopy(query_object_clone.filter)
|
||||
|
||||
# Find and update temporal filters
|
||||
for flt in query_object_clone.filter:
|
||||
if flt.get(
|
||||
"op"
|
||||
) == FilterOperator.TEMPORAL_RANGE and isinstance(
|
||||
flt.get("val"), str
|
||||
):
|
||||
time_range = cast(str, flt.get("val"))
|
||||
(
|
||||
new_outer_from_dttm,
|
||||
new_outer_to_dttm,
|
||||
@@ -555,21 +599,41 @@ class QueryContextProcessor:
|
||||
time_shift=offset,
|
||||
)
|
||||
flt["val"] = f"{new_outer_from_dttm} : {new_outer_to_dttm}"
|
||||
else:
|
||||
# If it IS a datetime series, we still need to clear conflicting
|
||||
# filters
|
||||
query_object_clone.filter = copy.deepcopy(query_object_clone.filter)
|
||||
|
||||
# For relative offsets with datetime series, ensure the temporal
|
||||
# filter matches our range
|
||||
temporal_col = query_object_clone.granularity or x_axis_label
|
||||
|
||||
# Update any existing temporal filters to match our shifted range
|
||||
for flt in query_object_clone.filter:
|
||||
if (
|
||||
flt.get("op") == FilterOperator.TEMPORAL_RANGE
|
||||
and flt.get("col") == temporal_col
|
||||
):
|
||||
flt["val"] = (
|
||||
f"{query_object_clone.from_dttm} : "
|
||||
f"{query_object_clone.to_dttm}"
|
||||
)
|
||||
|
||||
# Remove non-temporal x-axis filters (but keep temporal ones)
|
||||
query_object_clone.filter = [
|
||||
flt
|
||||
for flt in query_object_clone.filter
|
||||
if flt.get("col") != x_axis_label
|
||||
if not (
|
||||
flt.get("col") == x_axis_label
|
||||
and flt.get("op") != FilterOperator.TEMPORAL_RANGE
|
||||
)
|
||||
]
|
||||
|
||||
# Inherit or custom start dates might compute the same offset but the response cannot be given # noqa: E501
|
||||
# using cached data unless you are using the same date of inherited range, that's why we # noqa: E501
|
||||
# set the cache cache using a custom key that includes the original offset and the computed offset # noqa: E501
|
||||
# for those two scenarios, the rest of the scenarios will use the original offset as cache key # noqa: E501
|
||||
# Continue with the rest of the method...
|
||||
cached_time_offset_key = (
|
||||
offset if offset == original_offset else f"{offset}_{original_offset}"
|
||||
)
|
||||
|
||||
# `offset` is added to the hash function
|
||||
cache_key = self.query_cache_key(
|
||||
query_object_clone,
|
||||
time_offset=cached_time_offset_key,
|
||||
@@ -578,7 +642,7 @@ class QueryContextProcessor:
|
||||
cache = QueryCacheManager.get(
|
||||
cache_key, CacheRegion.DATA, query_context.force
|
||||
)
|
||||
# whether hit on the cache
|
||||
|
||||
if cache.is_loaded:
|
||||
offset_dfs[offset] = cache.df
|
||||
queries.append(cache.query)
|
||||
@@ -586,6 +650,7 @@ class QueryContextProcessor:
|
||||
continue
|
||||
|
||||
query_object_clone_dct = query_object_clone.to_dict()
|
||||
|
||||
# rename metrics: SUM(value) => SUM(value) 1 year ago
|
||||
metrics_mapping = {
|
||||
metric: TIME_COMPARISON.join([metric, original_offset])
|
||||
@@ -648,6 +713,125 @@ class QueryContextProcessor:
|
||||
|
||||
return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys)
|
||||
|
||||
def _process_date_range_offset(
|
||||
self, offset_df: pd.DataFrame, join_keys: list[str]
|
||||
) -> tuple[pd.DataFrame, list[str]]:
|
||||
"""Process date range offset data and return modified DataFrame and keys."""
|
||||
temporal_cols = ["ds", "__timestamp", "dttm"]
|
||||
non_temporal_join_keys = [key for key in join_keys if key not in temporal_cols]
|
||||
|
||||
if non_temporal_join_keys:
|
||||
return offset_df, non_temporal_join_keys
|
||||
|
||||
metric_columns = [col for col in offset_df.columns if col not in temporal_cols]
|
||||
|
||||
if metric_columns:
|
||||
aggregated_values = {}
|
||||
for col in metric_columns:
|
||||
if pd.api.types.is_numeric_dtype(offset_df[col]):
|
||||
aggregated_values[col] = offset_df[col].sum()
|
||||
else:
|
||||
aggregated_values[col] = (
|
||||
offset_df[col].iloc[0] if not offset_df.empty else None
|
||||
)
|
||||
|
||||
offset_df = pd.DataFrame([aggregated_values])
|
||||
|
||||
return offset_df, []
|
||||
|
||||
def _apply_cleanup_logic(
    self,
    df: pd.DataFrame,
    offset: str,
    time_grain: str | None,
    join_keys: list[str],
    is_date_range_offset: bool,
) -> pd.DataFrame:
    """Drop join-helper columns after an offset DataFrame has been joined.

    For relative offsets with a time grain, the temporal join key is moved
    back to the front of the frame and both the generated offset-join
    columns and the right-suffix duplicates are removed.  In every other
    case only the right-suffix duplicate columns need to be dropped.

    NOTE(review): ``offset`` is unused; it is kept to preserve the call
    signature used elsewhere in this class.
    """
    if time_grain and not is_date_range_offset:
        if join_keys:
            # Restore the temporal column as the first column of the frame.
            col = df.pop(join_keys[0])
            df.insert(0, col.name, col)

        df.drop(
            list(df.filter(regex=f"{OFFSET_JOIN_COLUMN_SUFFIX}|{R_SUFFIX}")),
            axis=1,
            inplace=True,
        )
    else:
        # The original `elif is_date_range_offset` and `else` branches were
        # byte-identical; they are collapsed into this single branch.
        df.drop(
            list(df.filter(regex=f"{R_SUFFIX}")),
            axis=1,
            inplace=True,
        )

    return df
|
||||
|
||||
def _determine_join_keys(
    self,
    df: pd.DataFrame,
    offset_df: pd.DataFrame,
    offset: str,
    time_grain: str | None,
    join_keys: list[str],
    is_date_range_offset: bool,
    join_column_producer: Any,
) -> tuple[pd.DataFrame, list[str]]:
    """Pick the join keys for an offset join, mutating frames when needed.

    Date-range offsets are delegated to ``_process_date_range_offset``.
    Relative offsets with a time grain get a synthetic join column added to
    both frames.  Anything else joins on the keys as given.
    """
    if is_date_range_offset:
        return self._process_date_range_offset(offset_df, join_keys)

    if not time_grain:
        return offset_df, join_keys

    offset_key = OFFSET_JOIN_COLUMN_SUFFIX + offset

    # The main frame gets the shifted join column and the offset frame the
    # unshifted one, so matching rows line up on the synthetic key.
    self.add_offset_join_column(
        df, offset_key, time_grain, offset, join_column_producer
    )
    self.add_offset_join_column(
        offset_df, offset_key, time_grain, None, join_column_producer
    )

    # The temporal column is the first join key; replace it with the
    # synthetic join column.
    return offset_df, [offset_key, *join_keys[1:]]
|
||||
|
||||
def _perform_join(
    self, df: pd.DataFrame, offset_df: pd.DataFrame, actual_join_keys: list[str]
) -> pd.DataFrame:
    """Left-join ``offset_df`` onto ``df``.

    When no join keys are available, a constant helper column is added to
    both frames so the left join degenerates into a cross join; the helper
    column (and its suffixed duplicate) is removed again afterwards.
    """
    if actual_join_keys:
        return dataframe_utils.left_join_df(
            left_df=df,
            right_df=offset_df,
            join_keys=actual_join_keys,
            rsuffix=R_SUFFIX,
        )

    temp_key = "__temp_join_key__"
    df[temp_key] = 1
    offset_df[temp_key] = 1

    joined = dataframe_utils.left_join_df(
        left_df=df,
        right_df=offset_df,
        join_keys=[temp_key],
        rsuffix=R_SUFFIX,
    )

    # Strip the helper key and any suffixed copy the join may have produced.
    joined.drop(
        columns=[temp_key, f"{temp_key}{R_SUFFIX}"],
        inplace=True,
        errors="ignore",
    )
    return joined
|
||||
|
||||
def join_offset_dfs(
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
@@ -672,54 +856,28 @@ class QueryContextProcessor:
|
||||
_("Time Grain must be specified when using Time Shift.")
|
||||
)
|
||||
|
||||
# iterate on offset_dfs, left join each with df
|
||||
for offset, offset_df in offset_dfs.items():
|
||||
actual_join_keys = join_keys
|
||||
is_date_range_offset = self.is_valid_date_range(
|
||||
offset
|
||||
) and feature_flag_manager.is_feature_enabled(
|
||||
"DATE_RANGE_TIMESHIFTS_ENABLED"
|
||||
)
|
||||
|
||||
if time_grain:
|
||||
# defines a column name for the offset join column
|
||||
column_name = OFFSET_JOIN_COLUMN_SUFFIX + offset
|
||||
offset_df, actual_join_keys = self._determine_join_keys(
|
||||
df,
|
||||
offset_df,
|
||||
offset,
|
||||
time_grain,
|
||||
join_keys,
|
||||
is_date_range_offset,
|
||||
join_column_producer,
|
||||
)
|
||||
|
||||
# add offset join column to df
|
||||
self.add_offset_join_column(
|
||||
df, column_name, time_grain, offset, join_column_producer
|
||||
)
|
||||
df = self._perform_join(df, offset_df, actual_join_keys)
|
||||
df = self._apply_cleanup_logic(
|
||||
df, offset, time_grain, join_keys, is_date_range_offset
|
||||
)
|
||||
|
||||
# add offset join column to offset_df
|
||||
self.add_offset_join_column(
|
||||
offset_df, column_name, time_grain, None, join_column_producer
|
||||
)
|
||||
|
||||
# the temporal column is the first column in the join keys
|
||||
# so we use the join column instead of the temporal column
|
||||
actual_join_keys = [column_name, *join_keys[1:]]
|
||||
|
||||
if join_keys:
|
||||
df = dataframe_utils.left_join_df(
|
||||
left_df=df,
|
||||
right_df=offset_df,
|
||||
join_keys=actual_join_keys,
|
||||
rsuffix=R_SUFFIX,
|
||||
)
|
||||
else:
|
||||
df = dataframe_utils.full_outer_join_df(
|
||||
left_df=df,
|
||||
right_df=offset_df,
|
||||
rsuffix=R_SUFFIX,
|
||||
)
|
||||
|
||||
if time_grain:
|
||||
# move the temporal column to the first column in df
|
||||
if join_keys:
|
||||
col = df.pop(join_keys[0])
|
||||
df.insert(0, col.name, col)
|
||||
|
||||
# removes columns created only for join purposes
|
||||
df.drop(
|
||||
list(df.filter(regex=f"{OFFSET_JOIN_COLUMN_SUFFIX}|{R_SUFFIX}")),
|
||||
axis=1,
|
||||
inplace=True,
|
||||
)
|
||||
return df
|
||||
|
||||
@staticmethod
|
||||
@@ -732,7 +890,9 @@ class QueryContextProcessor:
|
||||
value = row[column_index]
|
||||
|
||||
if hasattr(value, "strftime"):
|
||||
if time_offset:
|
||||
if time_offset and not QueryContextProcessor.is_valid_date_range_static(
|
||||
time_offset
|
||||
):
|
||||
value = value + DateOffset(**normalize_time_delta(time_offset))
|
||||
|
||||
if time_grain in (
|
||||
@@ -759,6 +919,21 @@ class QueryContextProcessor:
|
||||
|
||||
return str(value)
|
||||
|
||||
@staticmethod
|
||||
def is_valid_date_range_static(date_range: str) -> bool:
|
||||
"""Static version of is_valid_date_range for use in static methods"""
|
||||
try:
|
||||
# Attempt to parse the string as a date range in the format
|
||||
# YYYY-MM-DD:YYYY-MM-DD
|
||||
start_date, end_date = date_range.split(":")
|
||||
datetime.strptime(start_date.strip(), "%Y-%m-%d")
|
||||
datetime.strptime(end_date.strip(), "%Y-%m-%d")
|
||||
return True
|
||||
except ValueError:
|
||||
# If parsing fails, it's not a valid date range in the format
|
||||
# YYYY-MM-DD:YYYY-MM-DD
|
||||
return False
|
||||
|
||||
def get_data(
|
||||
self, df: pd.DataFrame, coltypes: list[GenericDataType]
|
||||
) -> str | list[dict[str, Any]]:
|
||||
|
||||
Reference in New Issue
Block a user