fix: loading examples in CI returns http error "too many requests" (#33412)

This commit is contained in:
Maxime Beauchemin
2025-05-13 08:36:12 -07:00
committed by GitHub
parent 21ca26acd7
commit 7f14e434c8
12 changed files with 71 additions and 41 deletions

View File

@@ -43,7 +43,11 @@ Environment knobs
from __future__ import annotations
import os
import time
from typing import Any
from urllib.error import HTTPError
import pandas as pd
from superset import app, db
from superset.connectors.sqla.models import SqlaTable
@@ -119,3 +123,33 @@ def get_example_url(filepath: str) -> str:
paths like ``datasets/examples/slack/messages.csv``.
"""
return f"{BASE_URL}{filepath}"
def read_example_data(
    filepath: str,
    max_attempts: int = 5,
    wait_seconds: float = 60,
    **kwargs: Any,
) -> pd.DataFrame:
    """Load a CSV or JSON example dataset, retrying on HTTP 429.

    The file is fetched from the URL built by ``get_example_url`` and parsed
    with ``pd.read_json`` when ``filepath`` ends in ``.json`` or ``.json.gz``,
    otherwise with ``pd.read_csv``.

    :param filepath: path of the example file, passed to ``get_example_url``
    :param max_attempts: total number of download attempts; must be >= 1
    :param wait_seconds: delay before the first retry; it doubles on every
        subsequent retry
    :param kwargs: forwarded unchanged to the pandas reader
    :returns: the parsed DataFrame
    :raises ValueError: if ``max_attempts`` is less than 1
    :raises HTTPError: for any HTTP error other than 429, or for a 429 on the
        final attempt
    """
    # Without this guard the loop below would never run and the function
    # would silently return None instead of a DataFrame.
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1, got {max_attempts}")

    url = get_example_url(filepath)
    if filepath.endswith((".json", ".json.gz")):
        reader = pd.read_json
    else:
        reader = pd.read_csv

    for attempt in range(1, max_attempts + 1):
        try:
            return reader(url, **kwargs)
        except HTTPError as ex:
            # Only rate limiting is retried, and never after the last attempt.
            if ex.code != 429 or attempt == max_attempts:
                raise
            # Exponential backoff: wait_seconds, 2x, 4x, ...
            sleep_time = wait_seconds * (2 ** (attempt - 1))
            # Adjacent f-strings form one message; passing them as separate
            # print() arguments would insert an extra space between parts.
            print(
                f"HTTP 429 received from {url}. "
                f"Retrying in {sleep_time:.1f}s "
                f"(attempt {attempt}/{max_attempts})..."
            )
            time.sleep(sleep_time)

    # Every iteration either returns or raises on the final attempt.
    raise AssertionError("unreachable: retry loop exited without a result")