fix(bigquery): address review comments on memory-limited fetch

- Use has_app_context()/has_request_context() guards so fetch_data is safe to call outside a Flask request (fixes RuntimeError on g writes and current_app access in non-request paths) - Replace sys.getsizeof(str(batch)) with per-row getsizeof sum for more accurate memory estimation without the str() allocation - Fix false-positive truncation: fetch remaining+1 rows and check len > remaining to confirm more data exists beyond the cap - Reset g.bq_memory_limited/g.bq_memory_limited_row_count after reading in get_df_payload to prevent flag leaking across multiple queries in the same request - Wrap warning string in _() for i18n - Add warning field to ChartDataResponseResult Marshmallow schema - Pass noDuplicate: true to addWarningToast to suppress duplicate toasts when a multi-query chart has multiple truncated responses Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-21 15:55:10 +00:00 · 2026-04-07 13:16:58 -07:00
parent ffe7addcd2
commit 1894b77b44
4 changed files with 38 additions and 21 deletions
--- a/superset-frontend/src/components/Chart/chartAction.ts
+++ b/superset-frontend/src/components/Chart/chartAction.ts
@@ -810,7 +810,7 @@ export function exploreJSON(
        );
        (queriesResponse as QueryData[]).forEach(response => {
          if (response.warning) {
-            dispatch(addWarningToast(response.warning));
+            dispatch(addWarningToast(response.warning, { noDuplicate: true }));
          }
        });
        return dispatch(
--- a/superset/charts/schemas.py
+++ b/superset/charts/schemas.py
@@ -1561,6 +1561,10 @@ class ChartDataResponseResult(Schema):
        required=False,
        allow_none=True,
    )
+    warning = fields.String(
+        metadata={"description": "Warning message when results were truncated"},
+        allow_none=True,
+    )


 class ChartDataResponseSchema(Schema):
--- a/superset/common/query_context_processor.py
+++ b/superset/common/query_context_processor.py
@@ -193,11 +193,17 @@ class QueryContextProcessor:
        warning: str | None = None
        if getattr(g, "bq_memory_limited", False):
            row_count = getattr(g, "bq_memory_limited_row_count", len(cache.df))
+            # Reset flags immediately so subsequent queries in the same request
+            # don't inherit this warning
+            g.bq_memory_limited = False
+            g.bq_memory_limited_row_count = 0
            chart_id = (self._query_context.form_data or {}).get("slice_id", "")
            prefix = f"Chart {chart_id}: " if chart_id else ""
-            warning = (
-                f"{prefix}Results truncated to {row_count:,} rows"
-                " due to memory constraints."
+            warning = _(
+                "%(prefix)sResults truncated to %(row_count)s rows"
+                " due to memory constraints.",
+                prefix=prefix,
+                row_count=f"{row_count:,}",
            )

        return {
--- a/superset/db_engine_specs/bigquery.py
+++ b/superset/db_engine_specs/bigquery.py
@@ -28,7 +28,7 @@ from typing import Any, TYPE_CHECKING, TypedDict
 import pandas as pd
 from apispec import APISpec
 from apispec.ext.marshmallow import MarshmallowPlugin
-from flask import current_app, g
+from flask import current_app, g, has_app_context, has_request_context
 from flask_babel import gettext as __
 from marshmallow import fields, Schema
 from marshmallow.exceptions import ValidationError
@@ -305,7 +305,7 @@ class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-met
        return None

    @classmethod
-    def fetch_data(cls, cursor: Any, limit: int | None = None) -> list[tuple[Any, ...]]:
+    def fetch_data(cls, cursor: Any, limit: int | None = None) -> list[tuple[Any, ...]]:  # noqa: C901
        """
        Progressive fetch for BigQuery to prevent browser memory overload.

@@ -314,7 +314,7 @@ class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-met
        Falls back to the parent implementation on any error.
        """
        max_mb: int = (
-            current_app.config.get("BQ_FETCH_MAX_MB", 200) if current_app else 200
+            current_app.config.get("BQ_FETCH_MAX_MB", 200) if has_app_context() else 200
        )
        max_bytes = max_mb * 1024 * 1024

@@ -323,16 +323,17 @@ class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-met
            first_batch: list[Any] = cursor.fetchmany(initial_batch_size)

            if not first_batch:
-                g.bq_memory_limited = False
-                g.bq_memory_limited_row_count = 0
+                if has_request_context():
+                    g.bq_memory_limited = False
+                    g.bq_memory_limited_row_count = 0
                return []

            # Support BigQuery Row objects (PR #4071)
            if type(first_batch[0]).__name__ == "Row":
                first_batch = [r.values() for r in first_batch]

-            # Estimate how many rows fit in the memory budget
-            first_batch_bytes = sys.getsizeof(str(first_batch))
+            # Estimate how many rows fit in the memory budget using per-row sizes
+            first_batch_bytes = sum(sys.getsizeof(row) for row in first_batch)
            rows_fetched = len(first_batch)
            avg_bytes_per_row = first_batch_bytes / rows_fetched
            total_rows_for_target = int(max_bytes / avg_bytes_per_row)
@@ -347,21 +348,26 @@ class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-met
                memory_limited = (
                    remaining_rows <= 0 and rows_fetched == initial_batch_size
                )
-                g.bq_memory_limited = memory_limited
-                g.bq_memory_limited_row_count = len(first_batch)
+                if has_request_context():
+                    g.bq_memory_limited = memory_limited
+                    g.bq_memory_limited_row_count = len(first_batch)
                return first_batch

-            # Fetch the rest up to the budget
-            second_batch: list[Any] = cursor.fetchmany(remaining_rows) or []
+            # Fetch one extra row to confirm truncation without false positives
+            second_batch: list[Any] = cursor.fetchmany(remaining_rows + 1) or []
            if second_batch and type(second_batch[0]).__name__ == "Row":
                second_batch = [r.values() for r in second_batch]

+            # Truncation is confirmed only when more rows exist beyond the budget
+            memory_limited = len(second_batch) > remaining_rows
+            if memory_limited:
+                second_batch = second_batch[:remaining_rows]
+
            data = first_batch + second_batch

-            # If we received exactly what we asked for, more rows may exist
-            memory_limited = len(second_batch) == remaining_rows
-            g.bq_memory_limited = memory_limited
-            g.bq_memory_limited_row_count = len(data)
+            if has_request_context():
+                g.bq_memory_limited = memory_limited
+                g.bq_memory_limited_row_count = len(data)
            return data

        except Exception:  # pylint: disable=broad-except
@@ -369,8 +375,9 @@ class BigQueryEngineSpec(BaseEngineSpec):  # pylint: disable=too-many-public-met
            data = super().fetch_data(cursor, limit)
            if data and type(data[0]).__name__ == "Row":
                data = [r.values() for r in data]  # type: ignore
-            g.bq_memory_limited = False
-            g.bq_memory_limited_row_count = len(data) if data else 0
+            if has_request_context():
+                g.bq_memory_limited = False
+                g.bq_memory_limited_row_count = len(data) if data else 0
            return data

    @staticmethod