From a43f3a421b38e9566c19deeca98f7479d28653ed Mon Sep 17 00:00:00 2001
From: Claude Code
Date: Mon, 18 May 2026 23:08:14 -0500
Subject: [PATCH] ci: run E2E backend under gunicorn instead of flask dev server
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both `cypress-run-all` and `playwright-run` started the Superset backend
with `flask run --no-debugger -p $port`. The Flask development server is
single-threaded and has no crash-recovery, so heavy tests — most notably
`playwright/tests/dashboard/export.spec.ts:61` (Export YAML) and
`dashboard-list.spec.ts:266` (Import zip) — can knock the backend offline
for the rest of the run. Subsequent tests then cascade-fail with
`ECONNREFUSED`, `socket hang up`, `Missing CSRF token`, and
`page.goto: net::ERR_ABORTED; maybe frame was detached`.

Across the last 50 master runs of the E2E workflow, 6 failed (12%), every
single one with this signature.

Switch both runners to gunicorn with the same shape used in
`docker/entrypoints/run-server.sh`:

- `--workers 4 --worker-class gthread --threads 20` — concurrency that
  matches what the real product runs.
- `--timeout 120` — kill stuck workers instead of letting them hang the
  entire suite.
- `--max-requests 500 --max-requests-jitter 50` — recycle workers
  periodically so memory accumulation from long suites doesn't OOM the
  process.
- `--access-logfile - --error-logfile -` — keep the same per-run log
  capture pattern.

Only frontend (JS) coverage is captured in E2E (verified — bashlib.sh only
instruments the JS assets), so multi-worker gunicorn doesn't break the
existing coverage path.
--- .github/workflows/bashlib.sh | 47 ++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/.github/workflows/bashlib.sh b/.github/workflows/bashlib.sh index 76f44d28f1b..5874ba68642 100644 --- a/.github/workflows/bashlib.sh +++ b/.github/workflows/bashlib.sh @@ -175,9 +175,12 @@ cypress-run-all() { local APP_ROOT=$2 cd "$GITHUB_WORKSPACE/superset-frontend/cypress-base" - # Start Flask and run it in background - # --no-debugger means disable the interactive debugger on the 500 page - # so errors can print to stderr. + # Start the Superset backend via gunicorn (not `flask run`). The Flask + # development server is single-threaded and has no crash-recovery, so + # heavy tests (dashboard import/export, SQL Lab) can knock it offline + # for the rest of the run — surfacing as `ECONNREFUSED` / `socket hang up` + # / `Missing CSRF token` cascades. Gunicorn gives us multiple workers, + # a request timeout, and worker-recycling under load. local flasklog="${HOME}/flask.log" local port=8081 CYPRESS_BASE_URL="http://localhost:${port}" @@ -187,7 +190,18 @@ cypress-run-all() { fi export CYPRESS_BASE_URL - nohup flask run --no-debugger -p $port >"$flasklog" 2>&1 "$flasklog" 2>&1 "$flasklog" 2>&1 "$flasklog" 2>&1 /dev/null 2>&1; then - say "Flask server is ready" + say "gunicorn server is ready" break fi sleep 1 @@ -254,8 +281,8 @@ playwright-run() { done if [ $timeout -eq 0 ]; then - echo "::error::Flask server failed to start within 60 seconds" - echo "::group::Flask startup log" + echo "::error::gunicorn server failed to start within 60 seconds" + echo "::group::Server startup log" cat "$flasklog" echo "::endgroup::" return 1