fix(reports): improve error handling for report schedule execution (#35800)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Elizabeth Thompson
2025-11-07 17:57:43 -08:00
committed by GitHub
parent 3167a0dbc0
commit c42e3c6837
2 changed files with 117 additions and 23 deletions

View File

@@ -177,19 +177,35 @@ class BaseReportState:
"""
Creates a Report execution log, uses the current computed last_value for Alerts
"""
log = ReportExecutionLog(
scheduled_dttm=self._scheduled_dttm,
start_dttm=self._start_dttm,
end_dttm=datetime.utcnow(),
value=self._report_schedule.last_value,
value_row_json=self._report_schedule.last_value_row_json,
state=self._report_schedule.last_state,
error_message=error_message,
report_schedule=self._report_schedule,
uuid=self._execution_id,
)
db.session.add(log)
db.session.commit() # pylint: disable=consider-using-transaction
from sqlalchemy.orm.exc import StaleDataError
try:
log = ReportExecutionLog(
scheduled_dttm=self._scheduled_dttm,
start_dttm=self._start_dttm,
end_dttm=datetime.utcnow(),
value=self._report_schedule.last_value,
value_row_json=self._report_schedule.last_value_row_json,
state=self._report_schedule.last_state,
error_message=error_message,
report_schedule=self._report_schedule,
uuid=self._execution_id,
)
db.session.add(log)
db.session.commit() # pylint: disable=consider-using-transaction
except StaleDataError as ex:
# Report schedule was modified or deleted by another process
db.session.rollback()
logger.warning(
"Report schedule (execution %s) was modified or deleted "
"during execution. This can occur when a report is deleted "
"while running.",
self._execution_id,
)
raise ReportScheduleUnexpectedError(
"Report schedule was modified or deleted by another process "
"during execution"
) from ex
def _get_url(
self,
@@ -743,7 +759,7 @@ class ReportNotTriggeredErrorState(BaseReportState):
current_states = [ReportState.NOOP, ReportState.ERROR]
initial = True
def next(self) -> None:
def next(self) -> None: # noqa: C901
self.update_report_schedule_and_log(ReportState.WORKING)
try:
# If it's an alert check if the alert is triggered
@@ -758,9 +774,21 @@ class ReportNotTriggeredErrorState(BaseReportState):
if isinstance(first_ex, SupersetErrorsException):
error_message = ";".join([error.message for error in first_ex.errors])
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=error_message
)
try:
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=error_message
)
except ReportScheduleUnexpectedError as logging_ex:
# Logging failed (likely StaleDataError), but we still want to
# raise the original error so the root cause remains visible
logger.warning(
"Failed to log error for report schedule (execution %s) "
"due to database issue",
self._execution_id,
exc_info=True,
)
# Re-raise the original exception, not the logging failure
raise first_ex from logging_ex
# TODO (dpgaspar) convert this logic to a new state eg: ERROR_ON_GRACE
if not self.is_in_error_grace_period():
@@ -776,12 +804,26 @@ class ReportNotTriggeredErrorState(BaseReportState):
second_error_message = ";".join(
[error.message for error in second_ex.errors]
)
except ReportScheduleUnexpectedError:
# send_error failed due to logging issue, log and continue
# to raise the original error
logger.warning(
"Failed to send error notification due to database issue",
exc_info=True,
)
except Exception as second_ex: # pylint: disable=broad-except
second_error_message = str(second_ex)
finally:
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=second_error_message
)
try:
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=second_error_message
)
except ReportScheduleUnexpectedError:
# Logging failed again, log it but don't let it hide first_ex
logger.warning(
"Failed to log final error state due to database issue",
exc_info=True,
)
raise
@@ -851,9 +893,22 @@ class ReportSuccessState(BaseReportState):
self.send()
self.update_report_schedule_and_log(ReportState.SUCCESS)
except Exception as ex: # pylint: disable=broad-except
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=str(ex)
)
try:
self.update_report_schedule_and_log(
ReportState.ERROR, error_message=str(ex)
)
except ReportScheduleUnexpectedError as logging_ex:
# Logging failed (likely StaleDataError), but we still want to
# raise the original error so the root cause remains visible
logger.warning(
"Failed to log error for report schedule (execution %s) "
"due to database issue",
self._execution_id,
exc_info=True,
)
# Re-raise the original exception, not the logging failure
raise ex from logging_ex
raise
class ReportScheduleStateMachine: # pylint: disable=too-few-public-methods

View File

@@ -20,8 +20,10 @@ from uuid import uuid4
import pytest
from flask import current_app
from sqlalchemy.orm.exc import StaleDataError
from superset.commands.dashboard.permalink.create import CreateDashboardPermalinkCommand
from superset.commands.report.exceptions import ReportScheduleUnexpectedError
from superset.commands.report.execute import AsyncExecuteReportScheduleCommand
from superset.models.dashboard import Dashboard
from superset.reports.models import ReportSourceFormat
@@ -118,3 +120,40 @@ def test_report_with_header_data(
assert header_data.get("notification_source") == ReportSourceFormat.DASHBOARD
assert header_data.get("notification_type") == report_schedule.type
assert len(send_email_smtp_mock.call_args.kwargs["header_data"]) == 8
@patch("superset.reports.notifications.email.send_email_smtp")
@patch("superset.commands.report.execute.DashboardScreenshot")
@pytest.mark.usefixtures("login_as_admin")
def test_report_schedule_stale_data_error_preserves_cause(
dashboard_screenshot_mock: MagicMock,
send_email_smtp_mock: MagicMock,
tabbed_dashboard: Dashboard, # noqa: F811
) -> None:
"""
Test that when db.session.commit raises StaleDataError during logging,
we surface ReportScheduleUnexpectedError while preserving the original
StaleDataError as the cause.
"""
dashboard_screenshot_mock.get_screenshot.return_value = b"test-image"
current_app.config["ALERT_REPORTS_NOTIFICATION_DRY_RUN"] = False
with create_dashboard_report(
dashboard=tabbed_dashboard,
extra={},
name="test stale data error",
) as report_schedule:
# Mock db.session.commit to raise StaleDataError during log creation
with patch("superset.db.session.commit") as mock_commit:
mock_commit.side_effect = StaleDataError("test stale data")
# Execute the report and expect ReportScheduleUnexpectedError
with pytest.raises(ReportScheduleUnexpectedError) as exc_info:
AsyncExecuteReportScheduleCommand(
str(uuid4()), report_schedule.id, datetime.utcnow()
).run()
# Verify the original StaleDataError is preserved as the cause
assert exc_info.value.__cause__ is not None
assert isinstance(exc_info.value.__cause__, StaleDataError)
assert str(exc_info.value.__cause__) == "test stale data"