test(examples): add tests for UUID threading and security bypass (#37557)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Joe Li
2026-02-12 14:12:12 -08:00
committed by GitHub
parent 0d5ddb3674
commit 6328e51620
6 changed files with 824 additions and 1 deletions

View File

@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

View File

@@ -0,0 +1,204 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for data_loading.py UUID extraction functionality."""
from pathlib import Path
from tempfile import TemporaryDirectory
from unittest.mock import patch
import yaml
def test_get_dataset_config_from_yaml_extracts_uuid():
    """Check that uuid, table_name and schema written to dataset.yaml are returned."""
    from superset.examples.data_loading import get_dataset_config_from_yaml

    expected_uuid = "12345678-1234-1234-1234-123456789012"
    yaml_payload = {
        "table_name": "test_table",
        "uuid": expected_uuid,
        "schema": "public",
    }

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        (root / "dataset.yaml").write_text(yaml.dump(yaml_payload))

        result = get_dataset_config_from_yaml(root)

        assert result["uuid"] == expected_uuid
        assert result["table_name"] == "test_table"
        assert result["schema"] == "public"
def test_get_dataset_config_from_yaml_without_uuid():
    """Check that a dataset.yaml with no uuid key yields uuid=None."""
    from superset.examples.data_loading import get_dataset_config_from_yaml

    # Deliberately no "uuid" entry in the YAML document.
    yaml_payload = {
        "table_name": "test_table",
        "schema": "public",
    }

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        (root / "dataset.yaml").write_text(yaml.dump(yaml_payload))

        result = get_dataset_config_from_yaml(root)

        assert result["uuid"] is None
        assert result["table_name"] == "test_table"
def test_get_dataset_config_from_yaml_no_file():
    """Check that a directory without dataset.yaml yields None for each key."""
    from superset.examples.data_loading import get_dataset_config_from_yaml

    with TemporaryDirectory() as tmpdir:
        # The directory is left empty on purpose: no dataset.yaml is written.
        result = get_dataset_config_from_yaml(Path(tmpdir))

        for key in ("uuid", "table_name", "schema"):
            assert result[key] is None
def test_get_dataset_config_from_yaml_treats_main_schema_as_none():
    """Check that a schema of 'main' in dataset.yaml is reported as None."""
    from superset.examples.data_loading import get_dataset_config_from_yaml

    yaml_payload = {
        "table_name": "test_table",
        "schema": "main",  # SQLite default schema
    }

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        (root / "dataset.yaml").write_text(yaml.dump(yaml_payload))

        result = get_dataset_config_from_yaml(root)

        assert result["schema"] is None
def test_get_multi_dataset_config_extracts_uuid():
    """Check that uuid and table_name are read from datasets/test_dataset.yaml."""
    from superset.examples.data_loading import _get_multi_dataset_config

    expected_uuid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
    yaml_payload = {
        "table_name": "custom_table_name",
        "uuid": expected_uuid,
        "schema": "public",
    }

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        yaml_dir = root / "datasets"
        yaml_dir.mkdir()
        (yaml_dir / "test_dataset.yaml").write_text(yaml.dump(yaml_payload))

        # Only the path is passed along; the parquet file itself is never created.
        parquet_path = root / "data" / "test_dataset.parquet"
        result = _get_multi_dataset_config(root, "test_dataset", parquet_path)

        assert result["uuid"] == expected_uuid
        assert result["table_name"] == "custom_table_name"
def test_get_multi_dataset_config_without_yaml():
    """Check the result when no datasets/test_dataset.yaml exists.

    The uuid must be absent or None, and table_name must equal the dataset
    name that was passed in.
    """
    from superset.examples.data_loading import _get_multi_dataset_config

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        # No "datasets" directory and no parquet file are created here.
        parquet_path = root / "data" / "test_dataset.parquet"

        result = _get_multi_dataset_config(root, "test_dataset", parquet_path)

        assert result.get("uuid") is None
        assert result["table_name"] == "test_dataset"
def test_get_multi_dataset_config_treats_main_schema_as_none():
    """Check that a 'main' schema in datasets/test_dataset.yaml becomes None."""
    from superset.examples.data_loading import _get_multi_dataset_config

    yaml_payload = {
        "table_name": "test_table",
        "schema": "main",
    }

    with TemporaryDirectory() as tmpdir:
        root = Path(tmpdir)
        yaml_dir = root / "datasets"
        yaml_dir.mkdir()
        (yaml_dir / "test_dataset.yaml").write_text(yaml.dump(yaml_payload))

        parquet_path = root / "data" / "test_dataset.parquet"
        result = _get_multi_dataset_config(root, "test_dataset", parquet_path)

        assert result["schema"] is None
def test_discover_datasets_passes_uuid_to_loader():
    """Check that discover_datasets forwards the YAML uuid to create_generic_loader.

    get_examples_directory is patched to point at a temporary tree that holds a
    single example (data.parquet plus dataset.yaml), and create_generic_loader
    is replaced by a mock so its keyword arguments can be inspected.
    """
    from superset.examples.data_loading import discover_datasets

    expected_uuid = "12345678-1234-1234-1234-123456789012"

    with TemporaryDirectory() as tmpdir:
        examples_root = Path(tmpdir)

        # One example directory with an empty data.parquet and a dataset.yaml.
        example = examples_root / "test_example"
        example.mkdir()
        (example / "data.parquet").touch()
        (example / "dataset.yaml").write_text(
            yaml.dump({"table_name": "test_table", "uuid": expected_uuid})
        )

        with patch(
            "superset.examples.data_loading.get_examples_directory",
            return_value=examples_root,
        ), patch(
            "superset.examples.data_loading.create_generic_loader",
            return_value=lambda: None,
        ) as loader_factory:
            discover_datasets()

        loader_factory.assert_called_once()
        assert loader_factory.call_args.kwargs["uuid"] == expected_uuid

View File

@@ -0,0 +1,233 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for generic_loader.py UUID threading functionality."""
from unittest.mock import MagicMock, patch
@patch("superset.examples.generic_loader.get_example_database")
@patch("superset.examples.generic_loader.db")
def test_load_parquet_table_sets_uuid_on_new_table(mock_db, mock_get_db):
    """Check that the supplied uuid is on the table returned by load_parquet_table.

    The metadata lookup is mocked to find nothing, so no pre-existing table
    object is available to be returned.
    """
    from superset.examples.generic_loader import load_parquet_table

    expected_uuid = "12345678-1234-1234-1234-123456789012"

    fake_database = MagicMock()
    fake_database.id = 1
    fake_database.has_table.return_value = True
    mock_get_db.return_value = fake_database

    # Let get_sqla_engine() be entered as a context manager yielding an engine.
    engine_ctx = fake_database.get_sqla_engine.return_value
    engine_ctx.__enter__.return_value = MagicMock()
    engine_ctx.__exit__.return_value = False

    # Simulate table not found in metadata
    metadata_query = mock_db.session.query.return_value
    metadata_query.filter_by.return_value.first.return_value = None

    fake_inspector = MagicMock(default_schema_name="public")
    with patch(
        "superset.examples.generic_loader.inspect", return_value=fake_inspector
    ):
        tbl = load_parquet_table(
            parquet_file="test_data",
            table_name="test_table",
            database=fake_database,
            only_metadata=True,
            uuid=expected_uuid,
        )

    assert tbl.uuid == expected_uuid
@patch("superset.examples.generic_loader.get_example_database")
@patch("superset.examples.generic_loader.db")
def test_load_parquet_table_early_return_does_not_modify_existing_uuid(
    mock_db, mock_get_db
):
    """Check that an existing table is returned untouched, uuid included.

    The database mock reports the physical table as present and no force
    flag is passed, which this test relies on to reach the loader's
    early-return path. The metadata lookup yields a table whose uuid is
    None; that same object must come back with uuid still None even though
    a uuid argument was supplied.
    """
    from superset.examples.generic_loader import load_parquet_table

    supplied_uuid = "12345678-1234-1234-1234-123456789012"

    fake_database = MagicMock()
    fake_database.id = 1
    fake_database.has_table.return_value = True  # Triggers early return
    mock_get_db.return_value = fake_database

    # Let get_sqla_engine() be entered as a context manager yielding an engine.
    engine_ctx = fake_database.get_sqla_engine.return_value
    engine_ctx.__enter__.return_value = MagicMock()
    engine_ctx.__exit__.return_value = False

    # Simulate existing table without UUID
    existing_table = MagicMock()
    existing_table.uuid = None
    metadata_query = mock_db.session.query.return_value
    metadata_query.filter_by.return_value.first.return_value = existing_table

    fake_inspector = MagicMock(default_schema_name="public")
    with patch(
        "superset.examples.generic_loader.inspect", return_value=fake_inspector
    ):
        tbl = load_parquet_table(
            parquet_file="test_data",
            table_name="test_table",
            database=fake_database,
            only_metadata=True,
            uuid=supplied_uuid,
        )

    # Early return path returns existing table as-is
    assert tbl is existing_table
    # UUID was not modified (still None)
    assert tbl.uuid is None
@patch("superset.examples.generic_loader.get_example_database")
@patch("superset.examples.generic_loader.db")
def test_load_parquet_table_preserves_existing_uuid(mock_db, mock_get_db):
    """Check that a table's existing uuid is kept when a different one is passed."""
    from superset.examples.generic_loader import load_parquet_table

    existing_uuid = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
    supplied_uuid = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"

    fake_database = MagicMock()
    fake_database.id = 1
    fake_database.has_table.return_value = True
    mock_get_db.return_value = fake_database

    # Let get_sqla_engine() be entered as a context manager yielding an engine.
    engine_ctx = fake_database.get_sqla_engine.return_value
    engine_ctx.__enter__.return_value = MagicMock()
    engine_ctx.__exit__.return_value = False

    # Simulate existing table with different UUID
    existing_table = MagicMock()
    existing_table.uuid = existing_uuid
    metadata_query = mock_db.session.query.return_value
    metadata_query.filter_by.return_value.first.return_value = existing_table

    fake_inspector = MagicMock(default_schema_name="public")
    with patch(
        "superset.examples.generic_loader.inspect", return_value=fake_inspector
    ):
        tbl = load_parquet_table(
            parquet_file="test_data",
            table_name="test_table",
            database=fake_database,
            only_metadata=True,
            uuid=supplied_uuid,
        )

    # Should preserve original UUID
    assert tbl.uuid == existing_uuid
@patch("superset.examples.generic_loader.get_example_database")
@patch("superset.examples.generic_loader.db")
def test_load_parquet_table_works_without_uuid(mock_db, mock_get_db):
    """Check that omitting the uuid argument leaves the returned table's uuid None."""
    from superset.examples.generic_loader import load_parquet_table

    fake_database = MagicMock()
    fake_database.id = 1
    fake_database.has_table.return_value = True
    mock_get_db.return_value = fake_database

    # Let get_sqla_engine() be entered as a context manager yielding an engine.
    engine_ctx = fake_database.get_sqla_engine.return_value
    engine_ctx.__enter__.return_value = MagicMock()
    engine_ctx.__exit__.return_value = False

    # Simulate table not found
    metadata_query = mock_db.session.query.return_value
    metadata_query.filter_by.return_value.first.return_value = None

    fake_inspector = MagicMock(default_schema_name="public")
    with patch(
        "superset.examples.generic_loader.inspect", return_value=fake_inspector
    ):
        # Note: no uuid keyword is supplied in this call.
        tbl = load_parquet_table(
            parquet_file="test_data",
            table_name="test_table",
            database=fake_database,
            only_metadata=True,
        )

    # UUID should remain None
    assert tbl.uuid is None
def test_create_generic_loader_passes_uuid():
    """Check that a loader from create_generic_loader forwards its uuid.

    The loader is built first; load_parquet_table is then patched and the
    loader invoked, so the uuid keyword it receives can be inspected.
    """
    from superset.examples.generic_loader import create_generic_loader

    expected_uuid = "12345678-1234-1234-1234-123456789012"
    loader = create_generic_loader(
        parquet_file="test_data",
        table_name="test_table",
        uuid=expected_uuid,
    )

    with patch(
        "superset.examples.generic_loader.load_parquet_table",
        return_value=MagicMock(),
    ) as fake_load:
        loader(only_metadata=True)

    # Verify UUID was passed through
    fake_load.assert_called_once()
    assert fake_load.call_args.kwargs["uuid"] == expected_uuid
def test_create_generic_loader_without_uuid():
    """Check that a loader built without a uuid passes uuid=None (backward compat)."""
    from superset.examples.generic_loader import create_generic_loader

    # No uuid keyword is given when building the loader.
    loader = create_generic_loader(
        parquet_file="test_data",
        table_name="test_table",
    )

    with patch(
        "superset.examples.generic_loader.load_parquet_table",
        return_value=MagicMock(),
    ) as fake_load:
        loader(only_metadata=True)

    fake_load.assert_called_once()
    assert fake_load.call_args.kwargs["uuid"] is None

View File

@@ -0,0 +1,206 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Tests for examples/utils.py - YAML config loading and content assembly."""
from pathlib import Path
from tempfile import TemporaryDirectory
from unittest.mock import MagicMock, patch
import yaml
def _create_example_tree(base_dir: Path) -> Path:
    """Build a small examples tree at base_dir/superset/examples/.

    The tree holds the shared database and metadata configs under ``_shared``
    and one example, ``test_example``, with a dataset, a dashboard and one chart.

    Returns the 'superset' directory (what files("superset") would return).
    """
    dataset_uuid = "14f48794-ebfa-4f60-a26a-582c49132f1b"

    superset_root = base_dir / "superset"
    examples_root = superset_root / "examples"

    # _shared configs; parents=True also creates superset/ and examples/.
    shared = examples_root / "_shared"
    shared.mkdir(parents=True)
    database_yaml = (
        "database_name: examples\n"
        "sqlalchemy_uri: __SQLALCHEMY_EXAMPLES_URI__\n"
        "uuid: a2dc77af-e654-49bb-b321-40f6b559a1ee\n"
        "version: '1.0.0'\n"
    )
    (shared / "database.yaml").write_text(database_yaml)
    metadata_yaml = "version: '1.0.0'\ntimestamp: '2020-12-11T22:52:56.534241+00:00'\n"
    (shared / "metadata.yaml").write_text(metadata_yaml)

    # An example with dataset, dashboard, and chart
    example = examples_root / "test_example"
    example.mkdir()

    dataset_doc = {
        "table_name": "test_table",
        "schema": "main",
        "uuid": dataset_uuid,
        "database_uuid": "a2dc77af-e654-49bb-b321-40f6b559a1ee",
        "version": "1.0.0",
    }
    (example / "dataset.yaml").write_text(yaml.dump(dataset_doc))

    dashboard_doc = {
        "dashboard_title": "Test Dashboard",
        "uuid": "dddddddd-dddd-dddd-dddd-dddddddddddd",
        "version": "1.0.0",
    }
    (example / "dashboard.yaml").write_text(yaml.dump(dashboard_doc))

    charts = example / "charts"
    charts.mkdir()
    chart_doc = {
        "slice_name": "Test Chart",
        "uuid": "cccccccc-cccc-cccc-cccc-cccccccccccc",
        "dataset_uuid": dataset_uuid,
        "version": "1.0.0",
    }
    (charts / "test_chart.yaml").write_text(yaml.dump(chart_doc))

    return superset_root
def test_load_contents_builds_correct_import_structure():
    """Check the keys and content that load_contents() produces for a sample tree.

    With ``files`` patched to return the temporary 'superset' directory and
    ``flask.current_app`` patched to carry a known SQLALCHEMY_EXAMPLES_URI,
    the result must contain the database, metadata, dataset, dashboard and
    chart entries under their expected key prefixes. The database entry must
    have the URI placeholder replaced, and the dataset entry must have its
    'main' schema rewritten to null.
    """
    from superset.examples.utils import load_contents

    examples_uri = "sqlite:///path/to/examples.db"
    fake_app = MagicMock(config={"SQLALCHEMY_EXAMPLES_URI": examples_uri})

    with TemporaryDirectory() as tmpdir:
        superset_root = _create_example_tree(Path(tmpdir))

        with patch(
            "superset.examples.utils.files", return_value=superset_root
        ), patch("flask.current_app", fake_app):
            contents = load_contents()

        # Verify database config is present with placeholder replaced
        assert "databases/examples.yaml" in contents
        database_text = contents["databases/examples.yaml"]
        assert "__SQLALCHEMY_EXAMPLES_URI__" not in database_text
        assert examples_uri in database_text

        # Verify metadata is present
        assert "metadata.yaml" in contents

        # Verify dataset, dashboard and chart are discovered with correct key prefixes
        assert "datasets/examples/test_example.yaml" in contents
        assert "dashboards/test_example.yaml" in contents
        assert "charts/test_example/test_chart.yaml" in contents

        # Verify schema normalization happened (main -> null)
        dataset_text = contents["datasets/examples/test_example.yaml"]
        assert "schema: main" not in dataset_text
        assert "schema: null" in dataset_text
def test_load_contents_replaces_sqlalchemy_examples_uri_placeholder():
    """Check that _load_shared_configs swaps the URI placeholder for the real URI.

    The shared database.yaml written by the fixture contains
    __SQLALCHEMY_EXAMPLES_URI__; after loading, the configured URI must be
    present in "databases/examples.yaml" and the placeholder must be gone.
    """
    from superset.examples.utils import _load_shared_configs

    configured_uri = "postgresql://user:pass@host/db"
    fake_app = MagicMock(config={"SQLALCHEMY_EXAMPLES_URI": configured_uri})

    with TemporaryDirectory() as tmpdir:
        superset_root = _create_example_tree(Path(tmpdir))

        with patch(
            "superset.examples.utils.files", return_value=superset_root
        ), patch("flask.current_app", fake_app):
            contents = _load_shared_configs(Path("examples"))

        assert "databases/examples.yaml" in contents
        database_text = contents["databases/examples.yaml"]
        assert configured_uri in database_text
        assert "__SQLALCHEMY_EXAMPLES_URI__" not in database_text
@patch("superset.examples.utils.ImportExamplesCommand")
@patch("superset.examples.utils.load_contents")
def test_load_examples_from_configs_wires_command_correctly(
    mock_load_contents,
    mock_command_cls,
):
    """Check how load_examples_from_configs(force_data=True) wires its collaborators.

    load_contents must be called once with False. ImportExamplesCommand must be
    built once from the loaded contents with overwrite=True and force_data=True,
    and the resulting command's run() must be called once.
    """
    from superset.examples.utils import load_examples_from_configs

    loaded = {"databases/examples.yaml": "content"}
    mock_load_contents.return_value = loaded
    command_instance = MagicMock()
    mock_command_cls.return_value = command_instance

    load_examples_from_configs(force_data=True)

    mock_load_contents.assert_called_once_with(False)
    mock_command_cls.assert_called_once_with(
        {"databases/examples.yaml": "content"},
        overwrite=True,
        force_data=True,
    )
    command_instance.run.assert_called_once()
@patch("superset.examples.utils.ImportExamplesCommand")
@patch("superset.examples.utils.load_contents")
def test_load_examples_from_configs_defaults(
    mock_load_contents,
    mock_command_cls,
):
    """Check the defaults used when load_examples_from_configs() takes no arguments.

    load_contents must receive False, and ImportExamplesCommand must be built
    with overwrite=True and force_data=False before run() is called once.
    """
    from superset.examples.utils import load_examples_from_configs

    mock_load_contents.return_value = {}
    command_instance = MagicMock()
    mock_command_cls.return_value = command_instance

    load_examples_from_configs()

    mock_load_contents.assert_called_once_with(False)
    mock_command_cls.assert_called_once_with(
        {},
        overwrite=True,
        force_data=False,
    )
    command_instance.run.assert_called_once()