Mirror of https://github.com/apache/superset.git, synced 2026-04-18 23:55:00 +00:00
fix: loading examples in CI returns http error "too many requests" (#33412)
commit 7f14e434c8 (committed by GitHub)
parent 21ca26acd7
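Summary of the diff below: the example-data loaders stop calling pd.read_csv / pd.read_json directly against the example-data mirror and instead go through a new shared read_example_data helper that retries HTTP 429 ("Too Many Requests") responses with exponential backoff; the per-module get_example_url imports and the now-unused pandas imports are dropped accordingly.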
@@ -16,7 +16,6 @@
 # under the License.
 import logging

-import pandas as pd
 import polyline
 from sqlalchemy import inspect, String, Text

@@ -25,7 +24,7 @@ from superset.sql_parse import Table
 from superset.utils import json

 from ..utils.database import get_example_database
-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data

 logger = logging.getLogger(__name__)

@@ -38,8 +37,9 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("bart-lines.json.gz")
-        df = pd.read_json(url, encoding="latin-1", compression="gzip")
+        df = read_example_data(
+            "bart-lines.json.gz", encoding="latin-1", compression="gzip"
+        )
         df["path_json"] = df.path.map(json.dumps)
         df["polyline"] = df.path.map(polyline.encode)
         del df["path"]

@@ -33,11 +33,11 @@ from superset.utils.core import DatasourceType

 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )

@@ -57,8 +57,8 @@ def gen_filter(


 def load_data(tbl_name: str, database: Database, sample: bool = False) -> None:
-    url = get_example_url("birth_names2.json.gz")
-    pdf = pd.read_json(url, compression="gzip")
+    pdf = read_example_data("birth_names2.json.gz", compression="gzip")

     # TODO(bkyryliuk): move load examples data into the pytest fixture
     if database.backend == "presto":
         pdf.ds = pd.to_datetime(pdf.ds, unit="ms")

@@ -17,7 +17,6 @@
 import datetime
 import logging

-import pandas as pd
 from sqlalchemy import BigInteger, Date, inspect, String
 from sqlalchemy.sql import column

@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )

 logger = logging.getLogger(__name__)

@@ -49,8 +48,9 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("birth_france_data_for_country_map.csv")
-        data = pd.read_csv(url, encoding="utf-8")
+        data = read_example_data(
+            "birth_france_data_for_country_map.csv", encoding="utf-8"
+        )
         data["dttm"] = datetime.datetime.now().date()
         data.to_sql(
             tbl_name,

@@ -17,7 +17,6 @@
 import logging
 import textwrap

-import pandas as pd
 from sqlalchemy import Float, inspect, String
 from sqlalchemy.sql import column

@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )

 logger = logging.getLogger(__name__)

@@ -51,8 +50,7 @@ def load_energy(
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("energy.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("energy.json.gz", compression="gzip")
         pdf = pdf.head(100) if sample else pdf
         pdf.to_sql(
             tbl_name,

@@ -23,7 +23,7 @@ import superset.utils.database as database_utils
 from superset import db
 from superset.sql_parse import Table

-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data

 logger = logging.getLogger(__name__)

@@ -37,12 +37,14 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None:
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        flight_data_url = get_example_url("flight_data.csv.gz")
-        pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip")
+        pdf = read_example_data(
+            "flight_data.csv.gz", encoding="latin-1", compression="gzip"
+        )

         # Loading airports info to join and get lat/long
-        airports_url = get_example_url("airports.csv.gz")
-        airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip")
+        airports = read_example_data(
+            "airports.csv.gz", encoding="latin-1", compression="gzip"
+        )
         airports = airports.set_index("IATA_CODE")

         pdf[  # pylint: disable=unsupported-assignment-operation,useless-suppression

@@ -43,7 +43,11 @@ Environment knobs
 from __future__ import annotations

 import os
+import time
 from typing import Any
+from urllib.error import HTTPError
+
+import pandas as pd

 from superset import app, db
 from superset.connectors.sqla.models import SqlaTable

@@ -119,3 +123,33 @@ def get_example_url(filepath: str) -> str:
     paths like ``datasets/examples/slack/messages.csv``.
     """
     return f"{BASE_URL}{filepath}"
+
+
+def read_example_data(
+    filepath: str,
+    max_attempts: int = 5,
+    wait_seconds: float = 60,
+    **kwargs: Any,
+) -> pd.DataFrame:
+    """Load CSV or JSON from example data mirror with retry/backoff."""
+    from superset.examples.helpers import get_example_url
+
+    url = get_example_url(filepath)
+    is_json = filepath.endswith(".json") or filepath.endswith(".json.gz")
+
+    for attempt in range(1, max_attempts + 1):
+        try:
+            if is_json:
+                return pd.read_json(url, **kwargs)
+            return pd.read_csv(url, **kwargs)
+        except HTTPError as e:
+            if e.code == 429 and attempt < max_attempts:
+                sleep_time = wait_seconds * (2 ** (attempt - 1))
+                print(
+                    f"HTTP 429 received from {url}. "
+                    f"Retrying in {sleep_time:.1f}s "
+                    f"(attempt {attempt}/{max_attempts})..."
+                )
+                time.sleep(sleep_time)
+            else:
+                raise
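For a sense of the retry schedule introduced above: with the defaults wait_seconds=60 and max_attempts=5, a request that keeps receiving HTTP 429 sleeps 60s, 120s, 240s, and 480s between attempts (15 minutes of waiting in the worst case) before the final failure re-raises. A minimal standalone sketch of that backoff arithmetic, not part of the commit:

# Standalone sketch (not from the commit): the backoff schedule that
# read_example_data computes for its default arguments.
max_attempts = 5
wait_seconds = 60.0

for attempt in range(1, max_attempts + 1):
    if attempt < max_attempts:
        # wait_seconds * 2^(attempt-1): 60s, 120s, 240s, 480s
        sleep_time = wait_seconds * (2 ** (attempt - 1))
        print(f"attempt {attempt}: on HTTP 429, sleep {sleep_time:.0f}s and retry")
    else:
        print(f"attempt {attempt}: final attempt, HTTPError propagates")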
@@ -19,7 +19,6 @@ import logging
 import random

 import geohash
-import pandas as pd
 from sqlalchemy import DateTime, Float, inspect, String

 import superset.utils.database as database_utils

@@ -29,11 +28,11 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )

 logger = logging.getLogger(__name__)

@@ -48,8 +47,9 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("san_francisco.csv.gz")
-        pdf = pd.read_csv(url, encoding="utf-8", compression="gzip")
+        pdf = read_example_data(
+            "san_francisco.csv.gz", encoding="utf-8", compression="gzip"
+        )
         start = datetime.datetime.now().replace(
             hour=0, minute=0, second=0, microsecond=0
         )

@@ -27,11 +27,11 @@ from superset.utils.core import DatasourceType

 from ..utils.database import get_example_database
 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
 )

 logger = logging.getLogger(__name__)

@@ -48,8 +48,10 @@ def load_multiformat_time_series(  # pylint: disable=too-many-locals
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("multiformat_time_series.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data(
+            "multiformat_time_series.json.gz", compression="gzip"
+        )

         # TODO(bkyryliuk): move load examples data into the pytest fixture
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")

@@ -17,7 +17,6 @@

 import logging

-import pandas as pd
 from sqlalchemy import inspect, String, Text

 import superset.utils.database as database_utils

@@ -25,7 +24,7 @@ from superset import db
 from superset.sql_parse import Table
 from superset.utils import json

-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data

 logger = logging.getLogger(__name__)

@@ -38,8 +37,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) ->
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("paris_iris.json.gz")
-        df = pd.read_json(url, compression="gzip")
+        df = read_example_data("paris_iris.json.gz", compression="gzip")
         df["features"] = df.features.map(json.dumps)

         df.to_sql(

@@ -26,10 +26,10 @@ from superset.sql_parse import Table
 from superset.utils.core import DatasourceType

 from .helpers import (
-    get_example_url,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
+    read_example_data,
 )

 logger = logging.getLogger(__name__)

@@ -46,8 +46,7 @@ def load_random_time_series_data(
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("random_time_series.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("random_time_series.json.gz", compression="gzip")
         if database.backend == "presto":
             pdf.ds = pd.to_datetime(pdf.ds, unit="s")
             pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M:%S")

@@ -17,7 +17,6 @@

 import logging

-import pandas as pd
 from sqlalchemy import BigInteger, Float, inspect, Text

 import superset.utils.database as database_utils

@@ -25,7 +24,7 @@ from superset import db
 from superset.sql_parse import Table
 from superset.utils import json

-from .helpers import get_example_url, get_table_connector_registry
+from .helpers import get_table_connector_registry, read_example_data

 logger = logging.getLogger(__name__)

@@ -40,8 +39,7 @@ def load_sf_population_polygons(
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("sf_population.json.gz")
-        df = pd.read_json(url, compression="gzip")
+        df = read_example_data("sf_population.json.gz", compression="gzip")
         df["contour"] = df.contour.map(json.dumps)

         df.to_sql(

@@ -25,12 +25,12 @@ import superset.utils.database
 from superset import app, db
 from superset.connectors.sqla.models import BaseDatasource, SqlMetric
 from superset.examples.helpers import (
-    get_example_url,
     get_examples_folder,
     get_slice_json,
     get_table_connector_registry,
     merge_slice,
     misc_dash_slices,
+    read_example_data,
     update_slice_ids,
 )
 from superset.models.dashboard import Dashboard

@@ -55,8 +55,7 @@ def load_world_bank_health_n_pop(  # pylint: disable=too-many-locals
     table_exists = database.has_table(Table(tbl_name, schema))

     if not only_metadata and (not table_exists or force):
-        url = get_example_url("countries.json.gz")
-        pdf = pd.read_json(url, compression="gzip")
+        pdf = read_example_data("countries.json.gz", compression="gzip")
         pdf.columns = [col.replace(".", "_") for col in pdf.columns]
         if database.backend == "presto":
             pdf.year = pd.to_datetime(pdf.year)
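Usage note for the call sites above: the first argument to read_example_data is the file path relative to the example-data mirror (resolved internally via get_example_url), and any remaining keyword arguments such as encoding and compression are forwarded unchanged to pd.read_json (for .json / .json.gz files) or pd.read_csv (everything else).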