fix(alerts): improve Slack API rate limiting for large workspaces (#35622)

This commit is contained in:
Marcos Amorim
2025-10-24 13:02:41 -04:00
committed by GitHub
parent 93cb60b24e
commit c3b8c96db6
4 changed files with 90 additions and 17 deletions

View File

@@ -65,6 +65,22 @@ To send alerts and reports to Slack channels, you need to create a new Slack App
Note: when you configure an alert or a report, the Slack channel list takes channel names without the leading '#', e.g. use `alerts` instead of `#alerts`.
#### Large Slack Workspaces (10k+ channels)
For workspaces with many channels, fetching the complete channel list can take several minutes and may hit Slack API rate limits. To cache the channel list for longer and to retry more times when Slack returns a rate limit error, add the following to your `superset_config.py`:
```python
from datetime import timedelta
# Increase cache timeout to reduce API calls
# Default: 1 day (86400 seconds)
SLACK_CACHE_TIMEOUT = int(timedelta(days=2).total_seconds())
# Increase retry count for rate limit errors
# Default: 2
SLACK_API_RATE_LIMIT_RETRY_COUNT = 5
```
### Kubernetes-specific
- You must have a `celery beat` pod running. If you're using the chart included in the GitHub repository under [helm/superset](https://github.com/apache/superset/tree/master/helm/superset), you need to put `supersetCeleryBeat.enabled = true` in your values override.

View File

@@ -1744,6 +1744,11 @@ SLACK_API_TOKEN: Callable[[], str] | str | None = None
SLACK_PROXY = None SLACK_PROXY = None
SLACK_CACHE_TIMEOUT = int(timedelta(days=1).total_seconds()) SLACK_CACHE_TIMEOUT = int(timedelta(days=1).total_seconds())
# Maximum number of retries when Slack API returns rate limit errors
# Default: 2
# For workspaces with 10k+ channels, consider increasing to 10
SLACK_API_RATE_LIMIT_RETRY_COUNT = 2
# The webdriver to use for generating reports. Use one of the following # The webdriver to use for generating reports. Use one of the following
# firefox # firefox
# Requires: geckodriver and firefox installations # Requires: geckodriver and firefox installations

View File

@@ -26,10 +26,23 @@ logger = logging.getLogger(__name__)
@celery_app.task(name="slack.cache_channels") @celery_app.task(name="slack.cache_channels")
def cache_channels() -> None: def cache_channels() -> None:
cache_timeout = current_app.config["SLACK_CACHE_TIMEOUT"]
retry_count = current_app.config.get("SLACK_API_RATE_LIMIT_RETRY_COUNT", 2)
logger.info(
"Starting Slack channels cache warm-up task "
"(cache_timeout=%ds, retry_count=%d)",
cache_timeout,
retry_count,
)
try: try:
get_channels( get_channels(force=True, cache_timeout=cache_timeout)
force=True, cache_timeout=current_app.config["SLACK_CACHE_TIMEOUT"]
)
except Exception as ex: except Exception as ex:
logger.exception("An error occurred while caching Slack channels: %s", ex) logger.exception(
"Failed to cache Slack channels: %s. "
"If this is due to rate limiting, consider increasing "
"SLACK_API_RATE_LIMIT_RETRY_COUNT.",
str(ex),
)
raise raise

View File

@@ -50,9 +50,12 @@ def get_slack_client() -> WebClient:
token = token() token = token()
client = WebClient(token=token, proxy=app.config["SLACK_PROXY"]) client = WebClient(token=token, proxy=app.config["SLACK_PROXY"])
rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=2) max_retry_count = app.config.get("SLACK_API_RATE_LIMIT_RETRY_COUNT", 2)
rate_limit_handler = RateLimitErrorRetryHandler(max_retry_count=max_retry_count)
client.retry_handlers.append(rate_limit_handler) client.retry_handlers.append(rate_limit_handler)
logger.debug("Slack client configured with %d rate limit retries", max_retry_count)
return client return client
@@ -73,19 +76,45 @@ def get_channels() -> list[SlackChannelSchema]:
channels: list[SlackChannelSchema] = [] channels: list[SlackChannelSchema] = []
extra_params = {"types": ",".join(SlackChannelTypes)} extra_params = {"types": ",".join(SlackChannelTypes)}
cursor = None cursor = None
page_count = 0
while True: logger.info("Starting Slack channels fetch")
response = client.conversations_list(
limit=999, cursor=cursor, exclude_archived=True, **extra_params
)
channels.extend(
channel_schema.load(channel) for channel in response.data["channels"]
)
cursor = response.data.get("response_metadata", {}).get("next_cursor")
if not cursor:
break
return channels try:
while True:
page_count += 1
response = client.conversations_list(
limit=999, cursor=cursor, exclude_archived=True, **extra_params
)
page_channels = response.data["channels"]
channels.extend(channel_schema.load(channel) for channel in page_channels)
logger.debug(
"Fetched page %d: %d channels (total: %d)",
page_count,
len(page_channels),
len(channels),
)
cursor = response.data.get("response_metadata", {}).get("next_cursor")
if not cursor:
break
logger.info(
"Successfully fetched %d Slack channels in %d pages",
len(channels),
page_count,
)
return channels
except SlackApiError as ex:
logger.error(
"Failed to fetch Slack channels after %d pages: %s",
page_count,
str(ex),
exc_info=True,
)
raise
def get_channels_with_search( def get_channels_with_search(
@@ -104,7 +133,17 @@ def get_channels_with_search(
force=force, force=force,
cache_timeout=app.config["SLACK_CACHE_TIMEOUT"], cache_timeout=app.config["SLACK_CACHE_TIMEOUT"],
) )
except (SlackClientError, SlackApiError) as ex: except SlackApiError as ex:
# Check if it's a rate limit error
status_code = getattr(ex.response, "status_code", None)
if status_code == 429:
raise SupersetException(
f"Slack API rate limit exceeded: {ex}. "
"For large workspaces, consider increasing "
"SLACK_API_RATE_LIMIT_RETRY_COUNT"
) from ex
raise SupersetException(f"Failed to list channels: {ex}") from ex
except SlackClientError as ex:
raise SupersetException(f"Failed to list channels: {ex}") from ex raise SupersetException(f"Failed to list channels: {ex}") from ex
if types and not len(types) == len(SlackChannelTypes): if types and not len(types) == len(SlackChannelTypes):