fix(bigquery): limit result set size to prevent browser memory crashes

Implement memory-aware progressive fetching in BigQuery's fetch_data
method. Large result sets (950+ MB) previously crashed Chrome by loading
everything into memory at once. The fix samples an initial batch to
estimate row size, then fetches only as many rows as fit within the
BQ_FETCH_MAX_MB config limit (default 200 MB). A warning toast is shown
to users when results are truncated.

This is always-on with no feature flag -- operators control the budget
via the BQ_FETCH_MAX_MB config constant.

Originally by @ethan-l-geotab in apache#36387.

Co-authored-by: ethan-l-geotab <ethanliong@geotab.com>
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Claude Code
2026-03-11 13:20:26 -07:00
parent ffe60bd960
commit 1773531807
7 changed files with 346 additions and 9 deletions

View File

@@ -1379,3 +1379,125 @@ def test_get_df_payload_invalidates_cache_missing_applied_filter_columns():
assert mock_cache.is_loaded is False, (
"Cache should be inv when no applied_filter_columns and query has filters"
)
def test_get_df_payload_bq_memory_limited_warning():
    """
    Verify that get_df_payload reports a warning for truncated BigQuery data.

    The Flask ``g`` object seen by the query context processor is replaced
    with a mock whose ``bq_memory_limited`` flag is True and whose
    ``bq_memory_limited_row_count`` is 5000. The resulting payload's
    ``warning`` entry must mention the chart id (taken here from
    ``form_data["slice_id"]``), the formatted row count, and the memory
    constraint.
    """
    from superset.common.query_object import QueryObject

    query_context = MagicMock(force=False, form_data={"slice_id": 42})

    datasource = MagicMock(
        column_names=["col1"],
        uid="test_ds",
        cache_timeout=None,
        changed_on=None,
    )
    datasource.get_extra_cache_keys.return_value = []
    datasource.data = MagicMock()
    datasource.data.get.return_value = {}

    processor = QueryContextProcessor(query_context)
    processor._qc_datasource = datasource

    query_obj = QueryObject(datasource=datasource, columns=["col1"])

    # Stand-in for an already-loaded cache entry holding three rows.
    cached = MagicMock(
        is_loaded=True,
        df=pd.DataFrame({"col1": [1, 2, 3]}),
        query="SELECT col1 FROM table",
        error_message=None,
        status="success",
        applied_filter_columns=["col1"],
        applied_template_filters=[],
        rejected_filter_columns=[],
        annotation_data={},
        is_cached=True,
        sql_rowcount=3,
        cache_dttm="2024-01-01T00:00:00",
        queried_dttm="2024-01-01T00:00:00",
    )

    # Build every patcher first so they can be entered without deep nesting.
    cache_manager_patch = patch(
        "superset.common.query_context_processor.QueryCacheManager"
    )
    validate_patch = patch.object(query_obj, "validate", return_value=None)
    cache_key_patch = patch.object(processor, "query_cache_key", return_value="key")
    timeout_patch = patch.object(processor, "get_cache_timeout", return_value=3600)
    g_patch = patch("superset.common.query_context_processor.g")

    with cache_manager_patch as mock_cache_manager:
        mock_cache_manager.get.return_value = cached
        with validate_patch, cache_key_patch, timeout_patch, g_patch as mock_g:
            # Simulate the BigQuery memory-limited flag being set on Flask g.
            mock_g.bq_memory_limited = True
            mock_g.bq_memory_limited_row_count = 5000
            result = processor.get_df_payload(query_obj, force_cached=False)

    warning = result["warning"]
    assert warning is not None
    assert "Chart 42" in warning
    assert "5,000 rows" in warning
    assert "memory constraints" in warning
def test_get_df_payload_no_warning_when_not_memory_limited():
    """
    Verify that get_df_payload emits no warning for untruncated results.

    The Flask ``g`` object seen by the query context processor is replaced
    with a mock whose ``bq_memory_limited`` flag is False; the payload's
    ``warning`` entry must then be None.
    """
    from superset.common.query_object import QueryObject

    query_context = MagicMock(force=False, form_data={})

    datasource = MagicMock(
        column_names=["col1"],
        uid="test_ds",
        cache_timeout=None,
        changed_on=None,
    )
    datasource.get_extra_cache_keys.return_value = []
    datasource.data = MagicMock()
    datasource.data.get.return_value = {}

    processor = QueryContextProcessor(query_context)
    processor._qc_datasource = datasource

    query_obj = QueryObject(datasource=datasource, columns=["col1"])

    # Stand-in for an already-loaded cache entry holding two rows.
    cached = MagicMock(
        is_loaded=True,
        df=pd.DataFrame({"col1": [1, 2]}),
        query="SELECT col1 FROM table",
        error_message=None,
        status="success",
        applied_filter_columns=["col1"],
        applied_template_filters=[],
        rejected_filter_columns=[],
        annotation_data={},
        is_cached=True,
        sql_rowcount=2,
        cache_dttm="2024-01-01T00:00:00",
        queried_dttm="2024-01-01T00:00:00",
    )

    # Build every patcher first so they can be entered without deep nesting.
    cache_manager_patch = patch(
        "superset.common.query_context_processor.QueryCacheManager"
    )
    validate_patch = patch.object(query_obj, "validate", return_value=None)
    cache_key_patch = patch.object(processor, "query_cache_key", return_value="key")
    timeout_patch = patch.object(processor, "get_cache_timeout", return_value=3600)
    g_patch = patch("superset.common.query_context_processor.g")

    with cache_manager_patch as mock_cache_manager:
        mock_cache_manager.get.return_value = cached
        with validate_patch, cache_key_patch, timeout_patch, g_patch as mock_g:
            # Set the flag to False explicitly: an attribute left unset on a
            # MagicMock would be an auto-created (truthy) child mock.
            mock_g.bq_memory_limited = False
            result = processor.get_df_payload(query_obj, force_cached=False)

    assert result["warning"] is None