feat: Use DuckDB for examples data in Docker development environment (#34831)

Co-authored-by: Claude <noreply@anthropic.com>
Co-authored-by: Beto Dealmeida <roberto@dealmeida.net>
This commit is contained in:
Maxime Beauchemin
2025-08-25 12:37:28 -07:00
committed by GitHub
parent 9c9588cce6
commit 47414e18d4
15 changed files with 142 additions and 44 deletions

View File

@@ -102,7 +102,7 @@ jobs:
docker history $IMAGE_TAG docker history $IMAGE_TAG
- name: docker-compose sanity check - name: docker-compose sanity check
if: (steps.check.outputs.python || steps.check.outputs.frontend || steps.check.outputs.docker) && (matrix.build_preset == 'dev' || matrix.build_preset == 'lean') if: (steps.check.outputs.python || steps.check.outputs.frontend || steps.check.outputs.docker) && matrix.build_preset == 'dev'
shell: bash shell: bash
run: | run: |
export SUPERSET_BUILD_TARGET=${{ matrix.build_preset }} export SUPERSET_BUILD_TARGET=${{ matrix.build_preset }}

View File

@@ -145,6 +145,9 @@ RUN if [ "$BUILD_TRANSLATIONS" = "true" ]; then \
###################################################################### ######################################################################
FROM python-base AS python-common FROM python-base AS python-common
# Build arg controlling whether a pre-built examples.duckdb is downloaded into
# /app/data at image build time. Only the exact string "true" enables the
# download; the default "false" leaves /app/data empty.
ARG LOAD_EXAMPLES_DUCKDB="false"
ENV SUPERSET_HOME="/app/superset_home" \ ENV SUPERSET_HOME="/app/superset_home" \
HOME="/app/superset_home" \ HOME="/app/superset_home" \
SUPERSET_ENV="production" \ SUPERSET_ENV="production" \
@@ -196,6 +199,18 @@ RUN /app/docker/apt-install.sh \
libecpg-dev \ libecpg-dev \
libldap2-dev libldap2-dev
# Always create /app/data owned by the superset user. When the build arg
# LOAD_EXAMPLES_DUCKDB is "true", also pre-populate it with examples.duckdb.
# curl flags: -f makes an HTTP error (404/500) fail the build instead of saving
# the error page as examples.duckdb; -L follows redirects; --retry 3 retries
# transient network failures.
RUN mkdir -p /app/data && \
    if [ "$LOAD_EXAMPLES_DUCKDB" = "true" ]; then \
        echo "Downloading pre-built examples.duckdb..." && \
        curl -fL --retry 3 -o /app/data/examples.duckdb \
            "https://raw.githubusercontent.com/apache-superset/examples-data/master/examples.duckdb"; \
    fi && \
    chown -R superset:superset /app/data
# Copy compiled things from previous stages # Copy compiled things from previous stages
COPY --from=superset-node /app/superset/static/assets superset/static/assets COPY --from=superset-node /app/superset/static/assets superset/static/assets
@@ -267,6 +282,15 @@ USER superset
###################################################################### ######################################################################
FROM lean AS ci FROM lean AS ci
USER root USER root
RUN uv pip install .[postgres] RUN uv pip install .[postgres,duckdb]
USER superset
CMD ["/app/docker/entrypoints/docker-ci.sh"]
######################################################################
# Showtime image - lean + DuckDB for examples database
######################################################################
FROM lean AS showtime
USER root
RUN uv pip install .[duckdb]
USER superset USER superset
CMD ["/app/docker/entrypoints/docker-ci.sh"] CMD ["/app/docker/entrypoints/docker-ci.sh"]

View File

@@ -77,6 +77,7 @@ x-common-build: &common-build
INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false} INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false}
INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false} INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false}
BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false} BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false}
LOAD_EXAMPLES_DUCKDB: ${LOAD_EXAMPLES_DUCKDB:-true}
services: services:
db-light: db-light:
@@ -91,9 +92,7 @@ services:
- db_home_light:/var/lib/postgresql/data - db_home_light:/var/lib/postgresql/data
- ./docker/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d - ./docker/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d
environment: environment:
# Override database name to avoid conflicts
POSTGRES_DB: superset_light POSTGRES_DB: superset_light
# Increase max connections for test runs
command: postgres -c max_connections=200 command: postgres -c max_connections=200
superset-light: superset-light:
@@ -106,7 +105,6 @@ services:
<<: *common-build <<: *common-build
command: ["/app/docker/docker-bootstrap.sh", "app"] command: ["/app/docker/docker-bootstrap.sh", "app"]
restart: unless-stopped restart: unless-stopped
# No host port mapping - accessed via webpack dev server proxy
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
user: *superset-user user: *superset-user
@@ -115,15 +113,10 @@ services:
condition: service_completed_successfully condition: service_completed_successfully
volumes: *superset-volumes volumes: *superset-volumes
environment: environment:
# Override DB connection for light service
DATABASE_HOST: db-light DATABASE_HOST: db-light
DATABASE_DB: superset_light DATABASE_DB: superset_light
POSTGRES_DB: superset_light POSTGRES_DB: superset_light
EXAMPLES_HOST: db-light SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb"
EXAMPLES_DB: superset_light
EXAMPLES_USER: superset
EXAMPLES_PASSWORD: superset
# Use light-specific config that disables Redis
SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py
superset-init-light: superset-init-light:
@@ -135,21 +128,16 @@ services:
required: true required: true
- path: docker/.env-local # optional override - path: docker/.env-local # optional override
required: false required: false
user: *superset-user
depends_on: depends_on:
db-light: db-light:
condition: service_started condition: service_started
user: *superset-user
volumes: *superset-volumes volumes: *superset-volumes
environment: environment:
# Override DB connection for light service
DATABASE_HOST: db-light DATABASE_HOST: db-light
DATABASE_DB: superset_light DATABASE_DB: superset_light
POSTGRES_DB: superset_light POSTGRES_DB: superset_light
EXAMPLES_HOST: db-light SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb"
EXAMPLES_DB: superset_light
EXAMPLES_USER: superset
EXAMPLES_PASSWORD: superset
# Use light-specific config that disables Redis
SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py
healthcheck: healthcheck:
disable: true disable: true
@@ -199,15 +187,12 @@ services:
user: *superset-user user: *superset-user
volumes: *superset-volumes volumes: *superset-volumes
environment: environment:
# Test-specific database configuration
DATABASE_HOST: db-light DATABASE_HOST: db-light
DATABASE_DB: test DATABASE_DB: test
POSTGRES_DB: test POSTGRES_DB: test
# Point to test database
SUPERSET__SQLALCHEMY_DATABASE_URI: postgresql+psycopg2://superset:superset@db-light:5432/test SUPERSET__SQLALCHEMY_DATABASE_URI: postgresql+psycopg2://superset:superset@db-light:5432/test
# Use the light test config that doesn't require Redis SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb"
SUPERSET_CONFIG: superset_test_config_light SUPERSET_CONFIG: superset_test_config_light
# Python path includes test directory
PYTHONPATH: /app/pythonpath:/app/docker/pythonpath_dev:/app PYTHONPATH: /app/pythonpath:/app/docker/pythonpath_dev:/app
volumes: volumes:

View File

@@ -42,6 +42,7 @@ x-common-build: &common-build
INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false} INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false}
INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false} INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false}
BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false} BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false}
LOAD_EXAMPLES_DUCKDB: ${LOAD_EXAMPLES_DUCKDB:-true}
services: services:
nginx: nginx:
@@ -107,6 +108,8 @@ services:
superset-init: superset-init:
condition: service_completed_successfully condition: service_completed_successfully
volumes: *superset-volumes volumes: *superset-volumes
environment:
SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb"
superset-websocket: superset-websocket:
container_name: superset_websocket container_name: superset_websocket
@@ -158,6 +161,8 @@ services:
condition: service_started condition: service_started
user: *superset-user user: *superset-user
volumes: *superset-volumes volumes: *superset-volumes
environment:
SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb"
healthcheck: healthcheck:
disable: true disable: true

View File

@@ -69,6 +69,8 @@ echo_step "3" "Complete" "Setting up roles and perms"
if [ "$SUPERSET_LOAD_EXAMPLES" = "yes" ]; then if [ "$SUPERSET_LOAD_EXAMPLES" = "yes" ]; then
# Load some data to play with # Load some data to play with
echo_step "4" "Starting" "Loading examples" echo_step "4" "Starting" "Loading examples"
# If Cypress run which consumes superset_test_config load required data for tests # If Cypress run which consumes superset_test_config load required data for tests
if [ "$CYPRESS_CONFIG" == "true" ]; then if [ "$CYPRESS_CONFIG" == "true" ]; then
superset load_examples --load-test-data superset load_examples --load-test-data

View File

@@ -49,12 +49,18 @@ SQLALCHEMY_DATABASE_URI = (
f"{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_DB}" f"{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_DB}"
) )
SQLALCHEMY_EXAMPLES_URI = ( # Use environment variable if set, otherwise construct from components
f"{DATABASE_DIALECT}://" # This MUST take precedence over any other configuration
f"{EXAMPLES_USER}:{EXAMPLES_PASSWORD}@" SQLALCHEMY_EXAMPLES_URI = os.getenv(
f"{EXAMPLES_HOST}:{EXAMPLES_PORT}/{EXAMPLES_DB}" "SUPERSET__SQLALCHEMY_EXAMPLES_URI",
(
f"{DATABASE_DIALECT}://"
f"{EXAMPLES_USER}:{EXAMPLES_PASSWORD}@"
f"{EXAMPLES_HOST}:{EXAMPLES_PORT}/{EXAMPLES_DB}"
),
) )
REDIS_HOST = os.getenv("REDIS_HOST", "redis") REDIS_HOST = os.getenv("REDIS_HOST", "redis")
REDIS_PORT = os.getenv("REDIS_PORT", "6379") REDIS_PORT = os.getenv("REDIS_PORT", "6379")
REDIS_CELERY_DB = os.getenv("REDIS_CELERY_DB", "0") REDIS_CELERY_DB = os.getenv("REDIS_CELERY_DB", "0")

View File

@@ -130,7 +130,7 @@ denodo = ["denodo-sqlalchemy~=1.0.6"]
dremio = ["sqlalchemy-dremio>=1.2.1, <4"] dremio = ["sqlalchemy-dremio>=1.2.1, <4"]
drill = ["sqlalchemy-drill>=1.1.4, <2"] drill = ["sqlalchemy-drill>=1.1.4, <2"]
druid = ["pydruid>=0.6.5,<0.7"] druid = ["pydruid>=0.6.5,<0.7"]
duckdb = ["duckdb-engine>=0.12.1, <0.13"] duckdb = ["duckdb-engine>=0.17.0"]
dynamodb = ["pydynamodb>=0.4.2"] dynamodb = ["pydynamodb>=0.4.2"]
solr = ["sqlalchemy-solr >= 0.2.0"] solr = ["sqlalchemy-solr >= 0.2.0"]
elasticsearch = ["elasticsearch-dbapi>=0.2.9, <0.3.0"] elasticsearch = ["elasticsearch-dbapi>=0.2.9, <0.3.0"]

View File

@@ -239,6 +239,7 @@ numpy==1.26.4
# bottleneck # bottleneck
# numexpr # numexpr
# pandas # pandas
# pyarrow
odfpy==1.4.1 odfpy==1.4.1
# via pandas # via pandas
openapi-schema-validator==0.6.3 openapi-schema-validator==0.6.3

View File

@@ -16,5 +16,5 @@
# specific language governing permissions and limitations # specific language governing permissions and limitations
# under the License. # under the License.
# #
-e .[development,bigquery,druid,gevent,gsheets,mysql,postgres,presto,prophet,trino,thumbnails] -e .[development,bigquery,druid,duckdb,gevent,gsheets,mysql,postgres,presto,prophet,trino,thumbnails]
-e ./superset-cli[test] -e ./superset-cli[test]

View File

@@ -177,6 +177,10 @@ dnspython==2.7.0
# email-validator # email-validator
docker==7.0.0 docker==7.0.0
# via apache-superset # via apache-superset
duckdb==1.3.2
# via duckdb-engine
duckdb-engine==0.17.0
# via apache-superset
email-validator==2.2.0 email-validator==2.2.0
# via # via
# -c requirements/base-constraint.txt # -c requirements/base-constraint.txt
@@ -480,6 +484,7 @@ numpy==1.26.4
# pandas # pandas
# pandas-gbq # pandas-gbq
# prophet # prophet
# pyarrow
oauthlib==3.2.2 oauthlib==3.2.2
# via requests-oauthlib # via requests-oauthlib
odfpy==1.4.1 odfpy==1.4.1
@@ -513,6 +518,7 @@ packaging==25.0
# db-dtypes # db-dtypes
# deprecation # deprecation
# docker # docker
# duckdb-engine
# google-cloud-bigquery # google-cloud-bigquery
# gunicorn # gunicorn
# limits # limits
@@ -810,6 +816,7 @@ sqlalchemy==1.4.54
# -c requirements/base-constraint.txt # -c requirements/base-constraint.txt
# alembic # alembic
# apache-superset # apache-superset
# duckdb-engine
# flask-appbuilder # flask-appbuilder
# flask-sqlalchemy # flask-sqlalchemy
# marshmallow-sqlalchemy # marshmallow-sqlalchemy

View File

@@ -30,8 +30,8 @@ from superset.extensions import (
) )
from superset.security import SupersetSecurityManager # noqa: F401 from superset.security import SupersetSecurityManager # noqa: F401
# All of the fields located here should be considered legacy. The correct way # All of the fields located here should be considered legacy. The correct way to
# to declare "global" dependencies is to define it in extensions.py, # declare "global" dependencies is to define it in extensions.py,
# then initialize it in app.create_app(). These fields will be removed # then initialize it in app.create_app(). These fields will be removed
# in subsequent PRs as things are migrated towards the factory pattern # in subsequent PRs as things are migrated towards the factory pattern
cache = cache_manager.cache cache = cache_manager.cache

View File

@@ -660,6 +660,7 @@ DEFAULT_FEATURE_FLAGS.update(
} }
) )
# This function can be overridden to customize the name of the user agent # This function can be overridden to customize the name of the user agent
# triggering the query. # triggering the query.
USER_AGENT_FUNC: Callable[[Database, QuerySource | None], str] | None = None USER_AGENT_FUNC: Callable[[Database, QuerySource | None], str] | None = None
@@ -2209,3 +2210,15 @@ elif importlib.util.find_spec("superset_config"):
except Exception: except Exception:
logger.exception("Found but failed to import local superset_config") logger.exception("Found but failed to import local superset_config")
raise raise
# Environment overrides are applied last, after any superset_config module has
# been imported, so that these variables win over config file assignments.
ENV_VAR_KEYS = {
    "SUPERSET__SQLALCHEMY_DATABASE_URI",
    "SUPERSET__SQLALCHEMY_EXAMPLES_URI",
}

for env_var in ENV_VAR_KEYS:
    if env_var not in os.environ:
        continue
    # Strip the "SUPERSET__" marker to obtain the config key name.
    config_var = env_var.replace("SUPERSET__", "")
    globals()[config_var] = os.environ[env_var]

View File

@@ -33,7 +33,7 @@ from sqlalchemy.engine.url import URL
from superset.constants import TimeGrain from superset.constants import TimeGrain
from superset.databases.utils import make_url_safe from superset.databases.utils import make_url_safe
from superset.db_engine_specs.base import BaseEngineSpec from superset.db_engine_specs.base import BaseEngineSpec, LimitMethod
from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.errors import ErrorLevel, SupersetError, SupersetErrorType
from superset.utils.core import GenericDataType, get_user_agent, QuerySource from superset.utils.core import GenericDataType, get_user_agent, QuerySource
@@ -261,6 +261,39 @@ class DuckDBEngineSpec(DuckDBParametersMixin, BaseEngineSpec):
return f"""'{dttm.isoformat(sep=" ", timespec="microseconds")}'""" return f"""'{dttm.isoformat(sep=" ", timespec="microseconds")}'"""
return None return None
@classmethod
def fetch_data(
    cls, cursor: Any, limit: int | None = None
) -> list[tuple[Any, ...]]:
    """
    Fetch rows from the cursor while keeping ``cursor.description`` intact.

    Works around a duckdb-engine bug in which ``cursor.description`` becomes
    ``None`` once the rows have been fetched, although the native DuckDB
    cursor keeps it.
    See: https://github.com/Mause/duckdb_engine/issues/1322

    :param cursor: DB-API cursor on which a query has already been executed
    :param limit: row cap, used only when the spec's limit method is
        ``LimitMethod.FETCH_MANY`` and the value is truthy
    :returns: the fetched rows
    :raises Exception: any fetch error, mapped through
        ``get_dbapi_mapped_exception``
    """
    # Snapshot the description first; fetching is what wipes it.
    saved_description = cursor.description

    if cls.arraysize:
        cursor.arraysize = cls.arraysize

    try:
        if cls.limit_method == LimitMethod.FETCH_MANY and limit:
            rows = cursor.fetchmany(limit)
        else:
            rows = cursor.fetchall()
    except Exception as ex:
        raise cls.get_dbapi_mapped_exception(ex) from ex

    # Put the snapshot back so downstream code can still read column metadata.
    cursor.description = saved_description
    return rows
@classmethod @classmethod
def get_table_names( def get_table_names(
cls, database: Database, inspector: Inspector, schema: str | None cls, database: Database, inspector: Inspector, schema: str | None

View File

@@ -58,9 +58,14 @@ CSV_FILE_WITH_NULLS = [
def _setup_csv_upload(allowed_schemas: list[str] | None = None): def _setup_csv_upload(allowed_schemas: list[str] | None = None):
upload_db = get_or_create_db( # Use main database URI for schema-related tests (PostgreSQL-specific)
CSV_UPLOAD_DATABASE, app.config["SQLALCHEMY_EXAMPLES_URI"] # Use examples URI for general upload tests
) if allowed_schemas:
db_uri = app.config["SQLALCHEMY_DATABASE_URI"]
else:
db_uri = app.config["SQLALCHEMY_EXAMPLES_URI"]
upload_db = get_or_create_db(CSV_UPLOAD_DATABASE, db_uri)
upload_db.allow_file_upload = True upload_db.allow_file_upload = True
extra = upload_db.get_extra() extra = upload_db.get_extra()
allowed_schemas = allowed_schemas or [] allowed_schemas = allowed_schemas or []

View File

@@ -37,8 +37,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
# Mock successful validation # Mock successful validation
mock_validate.return_value = {"valid": True, "errors": []} mock_validate.return_value = {"valid": True, "errors": []}
# Use a test datasource ID # Use the birth_names dataset for testing
datasource_id = 1 datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",
@@ -61,7 +62,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
# Mock successful validation # Mock successful validation
mock_validate.return_value = {"valid": True, "errors": []} mock_validate.return_value = {"valid": True, "errors": []}
datasource_id = 1 # Assuming we have a datasource with ID 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",
@@ -84,7 +87,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
# Mock successful validation # Mock successful validation
mock_validate.return_value = {"valid": True, "errors": []} mock_validate.return_value = {"valid": True, "errors": []}
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",
@@ -107,7 +112,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
# Mock successful validation # Mock successful validation
mock_validate.return_value = {"valid": True, "errors": []} mock_validate.return_value = {"valid": True, "errors": []}
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",
@@ -126,7 +133,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
"""Test validation of invalid SQL expression""" """Test validation of invalid SQL expression"""
self.login("admin") self.login("admin")
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
with patch( with patch(
"superset.connectors.sqla.models.SqlaTable.validate_expression" "superset.connectors.sqla.models.SqlaTable.validate_expression"
@@ -154,7 +163,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
"""Test that HAVING clause fails for non-aggregated columns""" """Test that HAVING clause fails for non-aggregated columns"""
self.login("admin") self.login("admin")
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
with patch( with patch(
"superset.connectors.sqla.models.SqlaTable.validate_expression" "superset.connectors.sqla.models.SqlaTable.validate_expression"
@@ -189,7 +200,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
"""Test validation of empty expression""" """Test validation of empty expression"""
self.login("admin") self.login("admin")
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",
@@ -205,7 +218,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
"""Test validation with missing required parameters""" """Test validation with missing required parameters"""
self.login("admin") self.login("admin")
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
# Missing expression_type - defaults to "where" # Missing expression_type - defaults to "where"
rv = self.client.post( rv = self.client.post(
@@ -242,7 +257,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase):
# Create a user without admin privileges # Create a user without admin privileges
self.login("gamma") self.login("gamma")
datasource_id = 1 # Use the birth_names dataset for testing
datasource = self.get_birth_names_dataset()
datasource_id = datasource.id
rv = self.client.post( rv = self.client.post(
f"/api/v1/datasource/table/{datasource_id}/validate_expression/", f"/api/v1/datasource/table/{datasource_id}/validate_expression/",