diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 91417cc86a5..a4106c13b61 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -102,7 +102,7 @@ jobs: docker history $IMAGE_TAG - name: docker-compose sanity check - if: (steps.check.outputs.python || steps.check.outputs.frontend || steps.check.outputs.docker) && (matrix.build_preset == 'dev' || matrix.build_preset == 'lean') + if: (steps.check.outputs.python || steps.check.outputs.frontend || steps.check.outputs.docker) && matrix.build_preset == 'dev' shell: bash run: | export SUPERSET_BUILD_TARGET=${{ matrix.build_preset }} diff --git a/Dockerfile b/Dockerfile index 6dee6b31058..2fde626695d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -145,6 +145,9 @@ RUN if [ "$BUILD_TRANSLATIONS" = "true" ]; then \ ###################################################################### FROM python-base AS python-common +# Build arg to pre-populate examples DuckDB file +ARG LOAD_EXAMPLES_DUCKDB="false" + ENV SUPERSET_HOME="/app/superset_home" \ HOME="/app/superset_home" \ SUPERSET_ENV="production" \ @@ -196,6 +199,18 @@ RUN /app/docker/apt-install.sh \ libecpg-dev \ libldap2-dev +# Pre-load examples DuckDB file if requested +RUN if [ "$LOAD_EXAMPLES_DUCKDB" = "true" ]; then \ + mkdir -p /app/data && \ + echo "Downloading pre-built examples.duckdb..." && \ + curl -L -o /app/data/examples.duckdb \ + "https://raw.githubusercontent.com/apache-superset/examples-data/master/examples.duckdb" && \ + chown -R superset:superset /app/data; \ + else \ + mkdir -p /app/data && \ + chown -R superset:superset /app/data; \ + fi + # Copy compiled things from previous stages COPY --from=superset-node /app/superset/static/assets superset/static/assets @@ -267,6 +282,15 @@ USER superset ###################################################################### FROM lean AS ci USER root -RUN uv pip install .[postgres] +RUN uv pip install .[postgres,duckdb] +USER superset +CMD ["/app/docker/entrypoints/docker-ci.sh"] + +###################################################################### +# Showtime image - lean + DuckDB for examples database +###################################################################### +FROM lean AS showtime +USER root +RUN uv pip install .[duckdb] USER superset CMD ["/app/docker/entrypoints/docker-ci.sh"] diff --git a/docker-compose-light.yml b/docker-compose-light.yml index 70c4f6225d6..ad10a93ef7b 100644 --- a/docker-compose-light.yml +++ b/docker-compose-light.yml @@ -77,6 +77,7 @@ x-common-build: &common-build INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false} INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false} BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false} + LOAD_EXAMPLES_DUCKDB: ${LOAD_EXAMPLES_DUCKDB:-true} services: db-light: @@ -91,9 +92,7 @@ services: - db_home_light:/var/lib/postgresql/data - ./docker/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d environment: - # Override database name to avoid conflicts POSTGRES_DB: superset_light - # Increase max connections for test runs command: postgres -c max_connections=200 superset-light: @@ -106,7 +105,6 @@ services: <<: *common-build command: ["/app/docker/docker-bootstrap.sh", "app"] restart: unless-stopped - # No host port mapping - accessed via webpack dev server proxy extra_hosts: - "host.docker.internal:host-gateway" user: *superset-user @@ -115,15 +113,10 @@ services: condition: service_completed_successfully volumes: *superset-volumes environment: - # Override DB connection for light service DATABASE_HOST: db-light DATABASE_DB: superset_light POSTGRES_DB: superset_light - EXAMPLES_HOST: db-light - EXAMPLES_DB: superset_light - EXAMPLES_USER: superset - EXAMPLES_PASSWORD: superset - # Use light-specific config that disables Redis + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb" SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py superset-init-light: @@ -135,21 +128,16 @@ services: required: true - path: docker/.env-local # optional override required: false + user: *superset-user depends_on: db-light: condition: service_started - user: *superset-user volumes: *superset-volumes environment: - # Override DB connection for light service DATABASE_HOST: db-light DATABASE_DB: superset_light POSTGRES_DB: superset_light - EXAMPLES_HOST: db-light - EXAMPLES_DB: superset_light - EXAMPLES_USER: superset - EXAMPLES_PASSWORD: superset - # Use light-specific config that disables Redis + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb" SUPERSET_CONFIG_PATH: /app/docker/pythonpath_dev/superset_config_docker_light.py healthcheck: disable: true @@ -199,15 +187,12 @@ services: user: *superset-user volumes: *superset-volumes environment: - # Test-specific database configuration DATABASE_HOST: db-light DATABASE_DB: test POSTGRES_DB: test - # Point to test database SUPERSET__SQLALCHEMY_DATABASE_URI: postgresql+psycopg2://superset:superset@db-light:5432/test - # Use the light test config that doesn't require Redis + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb" SUPERSET_CONFIG: superset_test_config_light - # Python path includes test directory PYTHONPATH: /app/pythonpath:/app/docker/pythonpath_dev:/app volumes: diff --git a/docker-compose.yml b/docker-compose.yml index 08b0c40701c..8c89fad9185 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,6 +42,7 @@ x-common-build: &common-build INCLUDE_CHROMIUM: ${INCLUDE_CHROMIUM:-false} INCLUDE_FIREFOX: ${INCLUDE_FIREFOX:-false} BUILD_TRANSLATIONS: ${BUILD_TRANSLATIONS:-false} + LOAD_EXAMPLES_DUCKDB: ${LOAD_EXAMPLES_DUCKDB:-true} services: nginx: @@ -107,6 +108,8 @@ services: superset-init: condition: service_completed_successfully volumes: *superset-volumes + environment: + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb" superset-websocket: container_name: superset_websocket @@ -158,6 +161,8 @@ services: condition: service_started user: *superset-user volumes: *superset-volumes + environment: + SUPERSET__SQLALCHEMY_EXAMPLES_URI: "duckdb:////app/data/examples.duckdb" healthcheck: disable: true diff --git a/docker/docker-init.sh b/docker/docker-init.sh index f9bd09ed14d..e4b25b5b187 100755 --- a/docker/docker-init.sh +++ b/docker/docker-init.sh @@ -69,6 +69,8 @@ echo_step "3" "Complete" "Setting up roles and perms" if [ "$SUPERSET_LOAD_EXAMPLES" = "yes" ]; then # Load some data to play with echo_step "4" "Starting" "Loading examples" + + # If Cypress run which consumes superset_test_config – load required data for tests if [ "$CYPRESS_CONFIG" == "true" ]; then superset load_examples --load-test-data diff --git a/docker/pythonpath_dev/superset_config.py b/docker/pythonpath_dev/superset_config.py index 2de8f037947..6d80d254a6a 100644 --- a/docker/pythonpath_dev/superset_config.py +++ b/docker/pythonpath_dev/superset_config.py @@ -49,12 +49,18 @@ SQLALCHEMY_DATABASE_URI = ( f"{DATABASE_HOST}:{DATABASE_PORT}/{DATABASE_DB}" ) -SQLALCHEMY_EXAMPLES_URI = ( - f"{DATABASE_DIALECT}://" - f"{EXAMPLES_USER}:{EXAMPLES_PASSWORD}@" - f"{EXAMPLES_HOST}:{EXAMPLES_PORT}/{EXAMPLES_DB}" +# Use environment variable if set, otherwise construct from components +# This MUST take precedence over any other configuration +SQLALCHEMY_EXAMPLES_URI = os.getenv( + "SUPERSET__SQLALCHEMY_EXAMPLES_URI", + ( + f"{DATABASE_DIALECT}://" + f"{EXAMPLES_USER}:{EXAMPLES_PASSWORD}@" + f"{EXAMPLES_HOST}:{EXAMPLES_PORT}/{EXAMPLES_DB}" + ), ) + REDIS_HOST = os.getenv("REDIS_HOST", "redis") REDIS_PORT = os.getenv("REDIS_PORT", "6379") REDIS_CELERY_DB = os.getenv("REDIS_CELERY_DB", "0") diff --git a/pyproject.toml b/pyproject.toml index 952e605d396..c2b605bf5e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,7 +130,7 @@ denodo = ["denodo-sqlalchemy~=1.0.6"] dremio = ["sqlalchemy-dremio>=1.2.1, <4"] drill = ["sqlalchemy-drill>=1.1.4, <2"] druid = ["pydruid>=0.6.5,<0.7"] -duckdb = ["duckdb-engine>=0.12.1, <0.13"] +duckdb = ["duckdb-engine>=0.17.0"] dynamodb = ["pydynamodb>=0.4.2"] solr = ["sqlalchemy-solr >= 0.2.0"] elasticsearch = ["elasticsearch-dbapi>=0.2.9, <0.3.0"] diff --git a/requirements/base.txt b/requirements/base.txt index 7584a9afbf7..9d150f7cfea 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -239,6 +239,7 @@ numpy==1.26.4 # bottleneck # numexpr # pandas + # pyarrow odfpy==1.4.1 # via pandas openapi-schema-validator==0.6.3 diff --git a/requirements/development.in b/requirements/development.in index 53a48f2f2dc..a53e266c570 100644 --- a/requirements/development.in +++ b/requirements/development.in @@ -16,5 +16,5 @@ # specific language governing permissions and limitations # under the License. # --e .[development,bigquery,druid,gevent,gsheets,mysql,postgres,presto,prophet,trino,thumbnails] +-e .[development,bigquery,druid,duckdb,gevent,gsheets,mysql,postgres,presto,prophet,trino,thumbnails] -e ./superset-cli[test] diff --git a/requirements/development.txt b/requirements/development.txt index 1c1f8f9c046..50d9e828834 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -177,6 +177,10 @@ dnspython==2.7.0 # email-validator docker==7.0.0 # via apache-superset +duckdb==1.3.2 + # via duckdb-engine +duckdb-engine==0.17.0 + # via apache-superset email-validator==2.2.0 # via # -c requirements/base-constraint.txt @@ -480,6 +484,7 @@ numpy==1.26.4 # pandas # pandas-gbq # prophet + # pyarrow oauthlib==3.2.2 # via requests-oauthlib odfpy==1.4.1 @@ -513,6 +518,7 @@ packaging==25.0 # db-dtypes # deprecation # docker + # duckdb-engine # google-cloud-bigquery # gunicorn # limits @@ -810,6 +816,7 @@ sqlalchemy==1.4.54 # -c requirements/base-constraint.txt # alembic # apache-superset + # duckdb-engine # flask-appbuilder # flask-sqlalchemy # marshmallow-sqlalchemy diff --git a/superset/__init__.py b/superset/__init__.py index 450b5f104ff..91fcbf5ad8f 100644 --- a/superset/__init__.py +++ b/superset/__init__.py @@ -30,8 +30,8 @@ from superset.extensions import ( ) from superset.security import SupersetSecurityManager # noqa: F401 -# All of the fields located here should be considered legacy. The correct way -# to declare "global" dependencies is to define it in extensions.py, +# All of the fields located here should be considered legacy. The correct way to +# declare "global" dependencies is to define it in extensions.py, # then initialize it in app.create_app(). These fields will be removed # in subsequent PRs as things are migrated towards the factory pattern cache = cache_manager.cache diff --git a/superset/config.py b/superset/config.py index d80483762e6..9e1e9f724d2 100644 --- a/superset/config.py +++ b/superset/config.py @@ -660,6 +660,7 @@ DEFAULT_FEATURE_FLAGS.update( } ) + # This function can be overridden to customize the name of the user agent # triggering the query. USER_AGENT_FUNC: Callable[[Database, QuerySource | None], str] | None = None @@ -2209,3 +2210,15 @@ elif importlib.util.find_spec("superset_config"): except Exception: logger.exception("Found but failed to import local superset_config") raise + +# Final environment variable processing - must be at the very end +# to override any config file assignments +ENV_VAR_KEYS = { + "SUPERSET__SQLALCHEMY_DATABASE_URI", + "SUPERSET__SQLALCHEMY_EXAMPLES_URI", +} + +for env_var in ENV_VAR_KEYS: + if env_var in os.environ: + config_var = env_var.replace("SUPERSET__", "") + globals()[config_var] = os.environ[env_var] diff --git a/superset/db_engine_specs/duckdb.py b/superset/db_engine_specs/duckdb.py index b213f56013d..9bf98426f70 100644 --- a/superset/db_engine_specs/duckdb.py +++ b/superset/db_engine_specs/duckdb.py @@ -33,7 +33,7 @@ from sqlalchemy.engine.url import URL from superset.constants import TimeGrain from superset.databases.utils import make_url_safe -from superset.db_engine_specs.base import BaseEngineSpec +from superset.db_engine_specs.base import BaseEngineSpec, LimitMethod from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.utils.core import GenericDataType, get_user_agent, QuerySource @@ -261,6 +261,39 @@ class DuckDBEngineSpec(DuckDBParametersMixin, BaseEngineSpec): return f"""'{dttm.isoformat(sep=" ", timespec="microseconds")}'""" return None + @classmethod + def fetch_data(cls, cursor: Any, limit: int | None = None) -> list[tuple[Any, ...]]: + """ + Override fetch_data to work around duckdb-engine cursor.description bug. + + The duckdb-engine SQLAlchemy driver has a bug where cursor.description + becomes None after calling fetchall(), even though the native DuckDB cursor + preserves this information correctly. + + See: https://github.com/Mause/duckdb_engine/issues/1322 + + This method captures the cursor description before fetchall() and restores + it afterward to prevent downstream processing failures. + """ + # Capture description BEFORE fetchall() invalidates it + description = cursor.description + + # Execute fetchall() (which will clear cursor.description in duckdb-engine) + if cls.arraysize: + cursor.arraysize = cls.arraysize + try: + if cls.limit_method == LimitMethod.FETCH_MANY and limit: + data = cursor.fetchmany(limit) + else: + data = cursor.fetchall() + except Exception as ex: + raise cls.get_dbapi_mapped_exception(ex) from ex + + # Restore the captured description for downstream processing + cursor.description = description + + return data + @classmethod def get_table_names( cls, database: Database, inspector: Inspector, schema: str | None diff --git a/tests/integration_tests/databases/commands/upload_test.py b/tests/integration_tests/databases/commands/upload_test.py index 295c6b21bf9..95e21960605 100644 --- a/tests/integration_tests/databases/commands/upload_test.py +++ b/tests/integration_tests/databases/commands/upload_test.py @@ -58,9 +58,14 @@ CSV_FILE_WITH_NULLS = [ def _setup_csv_upload(allowed_schemas: list[str] | None = None): - upload_db = get_or_create_db( - CSV_UPLOAD_DATABASE, app.config["SQLALCHEMY_EXAMPLES_URI"] - ) + # Use main database URI for schema-related tests (PostgreSQL-specific) + # Use examples URI for general upload tests + if allowed_schemas: + db_uri = app.config["SQLALCHEMY_DATABASE_URI"] + else: + db_uri = app.config["SQLALCHEMY_EXAMPLES_URI"] + + upload_db = get_or_create_db(CSV_UPLOAD_DATABASE, db_uri) upload_db.allow_file_upload = True extra = upload_db.get_extra() allowed_schemas = allowed_schemas or [] diff --git a/tests/integration_tests/datasource/test_validate_expression_api.py b/tests/integration_tests/datasource/test_validate_expression_api.py index 4d7502bad03..0f140ab11be 100644 --- a/tests/integration_tests/datasource/test_validate_expression_api.py +++ b/tests/integration_tests/datasource/test_validate_expression_api.py @@ -37,8 +37,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): # Mock successful validation mock_validate.return_value = {"valid": True, "errors": []} - # Use a test datasource ID - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/", @@ -61,7 +62,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): # Mock successful validation mock_validate.return_value = {"valid": True, "errors": []} - datasource_id = 1 # Assuming we have a datasource with ID 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/", @@ -84,7 +87,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): # Mock successful validation mock_validate.return_value = {"valid": True, "errors": []} - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/", @@ -107,7 +112,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): # Mock successful validation mock_validate.return_value = {"valid": True, "errors": []} - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/", @@ -126,7 +133,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): """Test validation of invalid SQL expression""" self.login("admin") - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id with patch( "superset.connectors.sqla.models.SqlaTable.validate_expression" @@ -154,7 +163,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): """Test that HAVING clause fails for non-aggregated columns""" self.login("admin") - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id with patch( "superset.connectors.sqla.models.SqlaTable.validate_expression" @@ -189,7 +200,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): """Test validation of empty expression""" self.login("admin") - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/", @@ -205,7 +218,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): """Test validation with missing required parameters""" self.login("admin") - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id # Missing expression_type - defaults to "where" rv = self.client.post( @@ -242,7 +257,9 @@ class TestDatasourceValidateExpressionApi(SupersetTestCase): # Create a user without admin privileges self.login("gamma") - datasource_id = 1 + # Use the birth_names dataset for testing + datasource = self.get_birth_names_dataset() + datasource_id = datasource.id rv = self.client.post( f"/api/v1/datasource/table/{datasource_id}/validate_expression/",