From 0d50fd676be3277a5e783bc50794f482ec773c9f Mon Sep 17 00:00:00 2001 From: Richard Fogaca Nienkotter <63572350+richardfogaca@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:35:08 -0300 Subject: [PATCH] fix(mcp): hide user directory metadata from responses (#39576) Co-authored-by: Elizabeth Thompson Co-authored-by: Claude Sonnet 4.6 --- superset/mcp_service/app.py | 28 ++-- superset/mcp_service/chart/schemas.py | 30 +--- .../mcp_service/chart/tool/generate_chart.py | 11 +- .../mcp_service/chart/tool/get_chart_info.py | 3 +- .../mcp_service/common/schema_discovery.py | 25 ++- superset/mcp_service/dashboard/schemas.py | 58 +------ .../tool/add_chart_to_existing_dashboard.py | 17 +- .../dashboard/tool/generate_dashboard.py | 15 +- .../dashboard/tool/get_dashboard_info.py | 7 +- superset/mcp_service/database/schemas.py | 21 +-- superset/mcp_service/dataset/schemas.py | 24 +-- superset/mcp_service/mcp_core.py | 77 ++++++--- superset/mcp_service/privacy.py | 52 +++++++ superset/mcp_service/system/schemas.py | 4 +- .../mcp_service/utils/permissions_utils.py | 17 +- superset/mcp_service/utils/token_utils.py | 4 +- .../chart/tool/test_generate_chart.py | 2 +- .../dashboard/tool/test_dashboard_tools.py | 147 ++++++++++++++++++ .../database/tool/test_database_tools.py | 50 ++++++ .../system/tool/test_get_current_user.py | 42 ++--- .../system/tool/test_get_schema.py | 34 ++++ .../mcp_service/system/tool/test_mcp_core.py | 57 +++++++ .../unit_tests/mcp_service/test_mcp_config.py | 13 ++ tests/unit_tests/mcp_service/test_privacy.py | 84 ++++++++++ .../mcp_service/test_schema_discovery.py | 43 +++++ .../utils/test_permissions_utils.py | 73 +++++++++ .../mcp_service/utils/test_token_utils.py | 2 + 27 files changed, 698 insertions(+), 242 deletions(-) create mode 100644 superset/mcp_service/privacy.py create mode 100644 tests/unit_tests/mcp_service/test_privacy.py create mode 100644 tests/unit_tests/mcp_service/test_schema_discovery.py create mode 100644 tests/unit_tests/mcp_service/utils/test_permissions_utils.py diff --git a/superset/mcp_service/app.py b/superset/mcp_service/app.py index 74ef1c91954..e9e24277362 100644 --- a/superset/mcp_service/app.py +++ b/superset/mcp_service/app.py @@ -142,15 +142,6 @@ To create a chart: "config": {{...}}, "save_chart": true }}) -> save permanently -To find your own charts/dashboards/databases: -1. get_instance_info -> get current_user.id -2. list_charts(request={{"filters": [{{"col": "created_by_fk", - "opr": "eq", "value": current_user.id}}]}}) -3. Or: list_dashboards(request={{"filters": [{{"col": "created_by_fk", - "opr": "eq", "value": current_user.id}}]}}) -4. Or: list_databases(request={{"filters": [{{"col": "created_by_fk", - "opr": "eq", "value": current_user.id}}]}}) - To explore data with SQL: 1. list_datasets(request={{}}) -> find a dataset and note its database_id 2. execute_sql(request={{"database_id": , "sql": "SELECT ..."}}) @@ -211,13 +202,6 @@ Query Examples: list_charts(request={{"filters": [{{"col": "viz_type", "opr": "sw", "value": "echarts_timeseries"}}]}}) - Search by name: list_charts(request={{"search": "sales"}}) -- My charts (use current_user.id from get_instance_info): - list_charts(request={{"filters": [{{"col": "created_by_fk", "opr": "eq", "value": }}]}}) -- My dashboards: - list_dashboards(request={{"filters": [{{"col": "created_by_fk", "opr": "eq", "value": }}]}}) -- My databases: - list_databases(request={{"filters": [{{"col": "created_by_fk", "opr": "eq", "value": }}]}}) - To modify an existing chart (add filters, change metrics, etc.): 1. get_chart_info(request={{"identifier": }}) -> examine current configuration @@ -274,6 +258,18 @@ Permission Awareness: - get_instance_info returns current_user.roles (e.g., ["Admin"], ["Alpha"], ["Viewer"]). - ALWAYS check the user's roles BEFORE suggesting write operations (creating datasets, charts, dashboards, or running SQL). +- Do NOT disclose dashboard access lists, dashboard owners, chart owners, dataset + owners, workspace admins, or other users' names, usernames, email addresses, + contact details, roles, admin status, ownership, or access-list information. +- Do NOT infer access-list answers from dashboard metadata such as published status, + role restrictions, empty owner lists, or schema fields. +- Do NOT use execute_sql to query user, role, owner, or access-list tables for this + information. +- You may reference the current user's own identity details when appropriate, such + as confirming their own username. +- If a user asks who can view/edit/access content, who owns content, who is an + admin, who to contact for access, or what role another user has, say that you + cannot provide that information and direct them to their workspace admin. - Common roles and their typical capabilities: - Admin: Full access to all features - Alpha: Can create and modify charts, dashboards, datasets, and run SQL diff --git a/superset/mcp_service/chart/schemas.py b/superset/mcp_service/chart/schemas.py index e309d528d76..001a149d19d 100644 --- a/superset/mcp_service/chart/schemas.py +++ b/superset/mcp_service/chart/schemas.py @@ -51,11 +51,10 @@ from superset.mcp_service.common.cache_schemas import ( ) from superset.mcp_service.common.error_schemas import ChartGenerationError from superset.mcp_service.constants import DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE +from superset.mcp_service.privacy import filter_user_directory_fields from superset.mcp_service.system.schemas import ( PaginationInfo, - serialize_user_object, TagInfo, - UserInfo, ) from superset.mcp_service.utils.sanitization import ( sanitize_filter_value, @@ -102,17 +101,12 @@ class ChartInfo(BaseModel): url: str | None = Field(None, description="Chart explore page URL") description: str | None = Field(None, description="Chart description") cache_timeout: int | None = Field(None, description="Cache timeout") - changed_by: str | None = Field(None, description="Last modifier (username)") - changed_by_name: str | None = Field( - None, description="Last modifier (display name)" - ) changed_on: str | datetime | None = Field( None, description="Last modification timestamp" ) changed_on_humanized: str | None = Field( None, description="Humanized modification time" ) - created_by: str | None = Field(None, description="Chart creator (username)") created_on: str | datetime | None = Field(None, description="Creation timestamp") created_on_humanized: str | None = Field( None, description="Humanized creation time" @@ -125,7 +119,6 @@ class ChartInfo(BaseModel): ) uuid: str | None = Field(None, description="Chart UUID") tags: List[TagInfo] = Field(default_factory=list, description="Chart tags") - owners: List[UserInfo] = Field(default_factory=list, description="Chart owners") # Filters extracted from form_data for easy inspection filters: ChartFiltersInfo | None = Field( @@ -164,7 +157,7 @@ class ChartInfo(BaseModel): model_config = ConfigDict(from_attributes=True, ser_json_timedelta="iso8601") - @model_serializer(mode="wrap", when_used="json") + @model_serializer(mode="wrap") def _filter_fields_by_context(self, serializer: Any, info: Any) -> Dict[str, Any]: """Filter fields based on serialization context. @@ -172,7 +165,7 @@ class ChartInfo(BaseModel): Otherwise, include all fields (default behavior). """ # Get full serialization - data = serializer(self) + data = filter_user_directory_fields(serializer(self)) # Check if we have a context with select_columns if info.context and isinstance(info.context, dict): @@ -181,7 +174,6 @@ class ChartInfo(BaseModel): # Filter to only requested fields return {k: v for k, v in data.items() if k in select_columns} - # No filtering - return all fields return data @@ -420,13 +412,8 @@ def serialize_chart_object(chart: ChartLike | None) -> ChartInfo | None: cache_timeout=getattr(chart, "cache_timeout", None), form_data=chart_form_data, filters=filters_info, - changed_by=getattr(chart, "changed_by_name", None) - or (str(chart.changed_by) if getattr(chart, "changed_by", None) else None), - changed_by_name=getattr(chart, "changed_by_name", None), changed_on=getattr(chart, "changed_on", None), changed_on_humanized=_humanize_timestamp(getattr(chart, "changed_on", None)), - created_by=getattr(chart, "created_by_name", None) - or (str(chart.created_by) if getattr(chart, "created_by", None) else None), created_on=getattr(chart, "created_on", None), created_on_humanized=_humanize_timestamp(getattr(chart, "created_on", None)), uuid=str(getattr(chart, "uuid", "")) if getattr(chart, "uuid", None) else None, @@ -436,13 +423,6 @@ def serialize_chart_object(chart: ChartLike | None) -> ChartInfo | None: ] if getattr(chart, "tags", None) else [], - owners=[ - info - for owner in getattr(chart, "owners", []) - if (info := serialize_user_object(owner)) is not None - ] - if getattr(chart, "owners", None) - else [], ) @@ -458,12 +438,10 @@ class ChartFilter(ColumnOperator): "slice_name", "viz_type", "datasource_name", - "created_by_fk", ] = Field( ..., description="Column to filter on. Use get_schema(model_type='chart') for " - "available filter columns. Use created_by_fk with the user ID from " - "get_instance_info's current_user to find charts created by a specific user.", + "available filter columns.", ) opr: ColumnOperatorEnum = Field( ..., diff --git a/superset/mcp_service/chart/tool/generate_chart.py b/superset/mcp_service/chart/tool/generate_chart.py index 7ed0a5cc6b0..7c999b8a5c3 100644 --- a/superset/mcp_service/chart/tool/generate_chart.py +++ b/superset/mcp_service/chart/tool/generate_chart.py @@ -735,18 +735,17 @@ async def generate_chart( # noqa: C901 from superset.models.slice import Slice # Re-fetch with eager-loaded relationships to avoid detached - # instance errors when serialize_chart_object accesses .tags - # and .owners. The preceding commit may invalidate the session + # instance errors when serialize_chart_object accesses .tags. + # The preceding commit may invalidate the session # in multi-tenant environments; on failure, build a minimal # chart_data dict from scalar attributes that are already loaded - # — relationship fields (owners, tags) would trigger - # lazy-loading on the same dead session. + # — relationship fields like tags would trigger lazy-loading on + # the same dead session. try: chart = ( ChartDAO.find_by_id( chart.id, query_options=[ - joinedload(Slice.owners), joinedload(Slice.tags), ], ) @@ -781,7 +780,7 @@ async def generate_chart( # noqa: C901 # Safely serialize chart_info - handle both Pydantic models and dicts if chart_data is None and chart_info is not None: if hasattr(chart_info, "model_dump"): - chart_data = chart_info.model_dump() + chart_data = chart_info.model_dump(mode="json") elif isinstance(chart_info, dict): chart_data = chart_info else: diff --git a/superset/mcp_service/chart/tool/get_chart_info.py b/superset/mcp_service/chart/tool/get_chart_info.py index fb8720a1c1b..f508a62c45e 100644 --- a/superset/mcp_service/chart/tool/get_chart_info.py +++ b/superset/mcp_service/chart/tool/get_chart_info.py @@ -172,9 +172,8 @@ async def get_chart_info( # branch returned above). assert request.identifier is not None - # Eager load owners and tags to avoid N+1 queries during serialization + # Eager load tags to avoid N+1 queries during serialization. eager_options = [ - subqueryload(Slice.owners), subqueryload(Slice.tags), ] diff --git a/superset/mcp_service/common/schema_discovery.py b/superset/mcp_service/common/schema_discovery.py index 5ca5a1a7424..71d7dc22f81 100644 --- a/superset/mcp_service/common/schema_discovery.py +++ b/superset/mcp_service/common/schema_discovery.py @@ -30,6 +30,7 @@ from pydantic import BaseModel, Field from sqlalchemy.inspection import inspect from superset.mcp_service.constants import ModelType +from superset.mcp_service.privacy import USER_DIRECTORY_FIELDS class ColumnMetadata(BaseModel): @@ -127,8 +128,6 @@ _COLUMN_DESCRIPTIONS: dict[str, str] = { "uuid": "Unique UUID identifier", "created_on": "Timestamp when the resource was created", "changed_on": "Timestamp when the resource was last modified", - "created_by_fk": "User ID of the creator", - "changed_by_fk": "User ID of the last modifier", "description": "User-provided description text", "cache_timeout": "Cache timeout override in seconds", "perm": "Permission string for access control", @@ -146,7 +145,6 @@ _COLUMN_DESCRIPTIONS: dict[str, str] = { "params": "JSON string of chart parameters/configuration", "query_context": "JSON string of the query context for data fetching", "last_saved_at": "Timestamp of the last explicit save", - "last_saved_by_fk": "User ID who last saved this chart", # Dataset-specific "table_name": "Name of the database table or view", "schema": "Database schema name", @@ -221,6 +219,8 @@ def get_columns_from_model( # Add extra columns (computed fields, relationships, etc.) if extra_columns: for name, metadata in extra_columns.items(): + if exclude_columns and name in exclude_columns: + continue # Check if already added from model columns if not any(c.name == name for c in columns): columns.append(metadata) @@ -569,7 +569,12 @@ def get_chart_columns() -> list[ColumnMetadata]: """Get column metadata for Chart model dynamically.""" from superset.models.slice import Slice - return get_columns_from_model(Slice, CHART_DEFAULT_COLUMNS, CHART_EXTRA_COLUMNS) + return get_columns_from_model( + Slice, + CHART_DEFAULT_COLUMNS, + CHART_EXTRA_COLUMNS, + exclude_columns=set(USER_DIRECTORY_FIELDS), + ) def get_dataset_columns() -> list[ColumnMetadata]: @@ -577,7 +582,10 @@ def get_dataset_columns() -> list[ColumnMetadata]: from superset.connectors.sqla.models import SqlaTable return get_columns_from_model( - SqlaTable, DATASET_DEFAULT_COLUMNS, DATASET_EXTRA_COLUMNS + SqlaTable, + DATASET_DEFAULT_COLUMNS, + DATASET_EXTRA_COLUMNS, + exclude_columns=set(USER_DIRECTORY_FIELDS), ) @@ -586,7 +594,10 @@ def get_dashboard_columns() -> list[ColumnMetadata]: from superset.models.dashboard import Dashboard return get_columns_from_model( - Dashboard, DASHBOARD_DEFAULT_COLUMNS, DASHBOARD_EXTRA_COLUMNS + Dashboard, + DASHBOARD_DEFAULT_COLUMNS, + DASHBOARD_EXTRA_COLUMNS, + exclude_columns=set(USER_DIRECTORY_FIELDS), ) @@ -607,7 +618,7 @@ def get_database_columns() -> list[ColumnMetadata]: Database, DATABASE_DEFAULT_COLUMNS, DATABASE_EXTRA_COLUMNS, - exclude_columns=DATABASE_EXCLUDE_COLUMNS, + exclude_columns=DATABASE_EXCLUDE_COLUMNS | set(USER_DIRECTORY_FIELDS), ) diff --git a/superset/mcp_service/dashboard/schemas.py b/superset/mcp_service/dashboard/schemas.py index 6973245ad64..cecdfe21370 100644 --- a/superset/mcp_service/dashboard/schemas.py +++ b/superset/mcp_service/dashboard/schemas.py @@ -27,7 +27,7 @@ Example usage: id=1, dashboard_title="Sales Dashboard", published=True, - owners=[UserInfo(id=1, username="admin")], + tags=[TagInfo(id=1, name="sales")], charts=[DashboardChartSummary(id=1, slice_name="Sales Chart")] ) @@ -86,12 +86,11 @@ if TYPE_CHECKING: from superset.daos.base import ColumnOperator, ColumnOperatorEnum from superset.mcp_service.common.cache_schemas import MetadataCacheControl from superset.mcp_service.constants import DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE +from superset.mcp_service.privacy import filter_user_directory_fields from superset.mcp_service.system.schemas import ( PaginationInfo, RoleInfo, - serialize_user_object, TagInfo, - UserInfo, ) from superset.mcp_service.utils.sanitization import ( _remove_dangerous_unicode, @@ -117,10 +116,6 @@ class DashboardError(BaseModel): return cls(error=error, error_type=error_type, timestamp=datetime.now()) -# serialize_user_object is imported from system.schemas and re-exported here -# for backward compatibility with dashboard tool modules. - - def serialize_tag_object(tag: Any) -> TagInfo | None: """Serialize a tag object to TagInfo""" if not tag: @@ -159,17 +154,13 @@ class DashboardFilter(ColumnOperator): col: Literal[ "dashboard_title", "published", - "created_by_fk", - "owner", "favorite", ] = Field( ..., description=( "Column to filter on. Use " "get_schema(model_type='dashboard') for available " - "filter columns. Use created_by_fk with the user " - "ID from get_instance_info's current_user to find " - "dashboards created by a specific user." + "filter columns." ), ) opr: ColumnOperatorEnum = Field( @@ -349,16 +340,12 @@ class DashboardInfo(BaseModel): external_url: str | None = None created_on: str | datetime | None = None changed_on: str | datetime | None = None - created_by: str | None = None - changed_by: str | None = None uuid: str | None = None url: str | None = None created_on_humanized: str | None = None changed_on_humanized: str | None = None chart_count: int = 0 - owners: List[UserInfo] = Field(default_factory=list) tags: List[TagInfo] = Field(default_factory=list) - roles: List[RoleInfo] = Field(default_factory=list) charts: List[DashboardChartSummary] = Field(default_factory=list) # Structured filter information extracted from json_metadata @@ -413,7 +400,7 @@ class DashboardInfo(BaseModel): model_config = ConfigDict(from_attributes=True, ser_json_timedelta="iso8601") - @model_serializer(mode="wrap", when_used="json") + @model_serializer(mode="wrap") def _filter_fields_by_context(self, serializer: Any, info: Any) -> Dict[str, Any]: """Filter fields based on serialization context. @@ -421,7 +408,7 @@ class DashboardInfo(BaseModel): Otherwise, include all fields (default behavior). """ # Get full serialization - data = serializer(self) + data = filter_user_directory_fields(serializer(self)) # Check if we have a context with select_columns if info.context and isinstance(info.context, dict): @@ -430,7 +417,6 @@ class DashboardInfo(BaseModel): # Filter to only requested fields return {k: v for k, v in data.items() if k in select_columns} - # No filtering - return all fields return data @@ -678,12 +664,6 @@ def dashboard_serializer(dashboard: "Dashboard") -> DashboardInfo: external_url=dashboard.external_url, created_on=dashboard.created_on, changed_on=dashboard.changed_on, - created_by=getattr(dashboard.created_by, "username", None) - if dashboard.created_by - else None, - changed_by=getattr(dashboard.changed_by, "username", None) - if dashboard.changed_by - else None, uuid=str(dashboard.uuid) if dashboard.uuid else None, url=absolute_url, created_on_humanized=dashboard.created_on_humanized, @@ -699,24 +679,11 @@ def dashboard_serializer(dashboard: "Dashboard") -> DashboardInfo: getattr(dashboard, "json_metadata", None), getattr(dashboard, "position_json", None), ), - owners=[ - info - for owner in dashboard.owners - if (info := serialize_user_object(owner)) is not None - ] - if dashboard.owners - else [], tags=[ TagInfo.model_validate(tag, from_attributes=True) for tag in dashboard.tags ] if dashboard.tags else [], - roles=[ - RoleInfo.model_validate(role, from_attributes=True) - for role in dashboard.roles - ] - if dashboard.roles - else [], charts=[ summary for chart in dashboard.slices @@ -757,12 +724,10 @@ def serialize_dashboard_object(dashboard: Any) -> DashboardInfo: slug=slug or "", url=dashboard_url, published=getattr(dashboard, "published", None), - changed_by=getattr(dashboard, "changed_by_name", None), changed_on=getattr(dashboard, "changed_on", None), changed_on_humanized=_humanize_timestamp( getattr(dashboard, "changed_on", None) ), - created_by=getattr(dashboard, "created_by_name", None), created_on=getattr(dashboard, "created_on", None), created_on_humanized=_humanize_timestamp( getattr(dashboard, "created_on", None) @@ -780,25 +745,12 @@ def serialize_dashboard_object(dashboard: Any) -> DashboardInfo: if getattr(dashboard, "uuid", None) else None, chart_count=len(getattr(dashboard, "slices", [])), - owners=[ - info - for owner in getattr(dashboard, "owners", []) - if (info := serialize_user_object(owner)) is not None - ] - if getattr(dashboard, "owners", None) - else [], tags=[ TagInfo.model_validate(tag, from_attributes=True) for tag in getattr(dashboard, "tags", []) ] if getattr(dashboard, "tags", None) else [], - roles=[ - RoleInfo.model_validate(role, from_attributes=True) - for role in getattr(dashboard, "roles", []) - ] - if getattr(dashboard, "roles", None) - else [], charts=[ summary for chart in getattr(dashboard, "slices", []) diff --git a/superset/mcp_service/dashboard/tool/add_chart_to_existing_dashboard.py b/superset/mcp_service/dashboard/tool/add_chart_to_existing_dashboard.py index da511c29eb3..a323c42d918 100644 --- a/superset/mcp_service/dashboard/tool/add_chart_to_existing_dashboard.py +++ b/superset/mcp_service/dashboard/tool/add_chart_to_existing_dashboard.py @@ -470,10 +470,10 @@ def add_chart_to_existing_dashboard( # Re-fetch the dashboard with eager-loaded relationships to avoid # "Instance is not bound to a Session" errors when serializing - # chart .tags and .owners. The preceding command.run() commit may + # chart tags. The preceding command.run() commit may # invalidate the session in multi-tenant environments; on failure, # return a minimal response using only scalar attributes that are - # already loaded — relationship fields (owners, tags, slices) would + # already loaded — relationship fields (tags, slices) would # trigger lazy-loading on the same dead session. from sqlalchemy.orm import subqueryload @@ -486,9 +486,7 @@ def add_chart_to_existing_dashboard( DashboardDAO.find_by_id( updated_dashboard.id, query_options=[ - subqueryload(Dashboard.slices).subqueryload(Slice.owners), subqueryload(Dashboard.slices).subqueryload(Slice.tags), - subqueryload(Dashboard.owners), subqueryload(Dashboard.tags), ], ) @@ -531,7 +529,6 @@ def add_chart_to_existing_dashboard( # Convert to response format from superset.mcp_service.dashboard.schemas import ( serialize_tag_object, - serialize_user_object, ) dashboard_info = DashboardInfo( @@ -542,16 +539,12 @@ def add_chart_to_existing_dashboard( published=updated_dashboard.published, created_on=updated_dashboard.created_on, changed_on=updated_dashboard.changed_on, - created_by=updated_dashboard.created_by_name or None, - changed_by=updated_dashboard.changed_by_name or None, + created_by=None, + changed_by=None, uuid=str(updated_dashboard.uuid) if updated_dashboard.uuid else None, url=f"{get_superset_base_url()}/superset/dashboard/{updated_dashboard.id}/", chart_count=len(updated_dashboard.slices), - owners=[ - serialize_user_object(owner) - for owner in getattr(updated_dashboard, "owners", []) - if serialize_user_object(owner) is not None - ], + owners=[], tags=[ serialize_tag_object(tag) for tag in getattr(updated_dashboard, "tags", []) diff --git a/superset/mcp_service/dashboard/tool/generate_dashboard.py b/superset/mcp_service/dashboard/tool/generate_dashboard.py index aa1bb900a31..0086f9514e7 100644 --- a/superset/mcp_service/dashboard/tool/generate_dashboard.py +++ b/superset/mcp_service/dashboard/tool/generate_dashboard.py @@ -350,7 +350,7 @@ def generate_dashboard( # noqa: C901 # environments, causing "Can't reconnect until invalid transaction # is rolled back". Wrap the DAO re-fetch in try/except; on failure, # return a minimal response using only scalar attributes that are - # already loaded — relationship fields (owners, tags, slices) would + # already loaded — relationship fields (tags, slices) would # trigger lazy-loading on the same dead session. from superset.daos.dashboard import DashboardDAO @@ -359,9 +359,7 @@ def generate_dashboard( # noqa: C901 DashboardDAO.find_by_id( dashboard.id, query_options=[ - subqueryload(Dashboard.slices).subqueryload(Slice.owners), subqueryload(Dashboard.slices).subqueryload(Slice.tags), - subqueryload(Dashboard.owners), subqueryload(Dashboard.tags), ], ) @@ -399,7 +397,6 @@ def generate_dashboard( # noqa: C901 from superset.mcp_service.dashboard.schemas import ( serialize_chart_summary, serialize_tag_object, - serialize_user_object, ) dashboard_info = DashboardInfo( @@ -410,16 +407,12 @@ def generate_dashboard( # noqa: C901 published=dashboard.published, created_on=dashboard.created_on, changed_on=dashboard.changed_on, - created_by=dashboard.created_by_name or None, - changed_by=dashboard.changed_by_name or None, + created_by=None, + changed_by=None, uuid=str(dashboard.uuid) if dashboard.uuid else None, url=f"{get_superset_base_url()}/superset/dashboard/{dashboard.id}/", chart_count=len(request.chart_ids), - owners=[ - serialize_user_object(owner) - for owner in getattr(dashboard, "owners", []) - if serialize_user_object(owner) is not None - ], + owners=[], tags=[ serialize_tag_object(tag) for tag in getattr(dashboard, "tags", []) diff --git a/superset/mcp_service/dashboard/tool/get_dashboard_info.py b/superset/mcp_service/dashboard/tool/get_dashboard_info.py index 46f5ae4eedd..71e60279003 100644 --- a/superset/mcp_service/dashboard/tool/get_dashboard_info.py +++ b/superset/mcp_service/dashboard/tool/get_dashboard_info.py @@ -108,15 +108,10 @@ async def get_dashboard_info( from superset.models.dashboard import Dashboard from superset.models.slice import Slice - # Eager load slices (charts), owners, tags, and roles to avoid N+1 - # queries. Also eager load owners/tags on each slice since the - # dashboard serializer calls serialize_chart_object for every chart. + # Eager load slices and tags to avoid N+1 queries during serialization. eager_options = [ - subqueryload(Dashboard.slices).subqueryload(Slice.owners), subqueryload(Dashboard.slices).subqueryload(Slice.tags), - subqueryload(Dashboard.owners), subqueryload(Dashboard.tags), - subqueryload(Dashboard.roles), ] with event_logger.log_context(action="mcp.get_dashboard_info.lookup"): diff --git a/superset/mcp_service/database/schemas.py b/superset/mcp_service/database/schemas.py index d93ea630b30..0a425b95e1d 100644 --- a/superset/mcp_service/database/schemas.py +++ b/superset/mcp_service/database/schemas.py @@ -38,6 +38,7 @@ from pydantic import ( from superset.daos.base import ColumnOperator, ColumnOperatorEnum from superset.mcp_service.common.cache_schemas import MetadataCacheControl from superset.mcp_service.constants import DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE +from superset.mcp_service.privacy import filter_user_directory_fields from superset.mcp_service.system.schemas import PaginationInfo from superset.mcp_service.utils.schema_utils import ( parse_json_or_list, @@ -58,14 +59,10 @@ class DatabaseFilter(ColumnOperator): "database_name", "expose_in_sqllab", "allow_file_upload", - "created_by_fk", - "changed_by_fk", ] = Field( ..., description="Column to filter on. Use get_schema(model_type='database') for " - "available filter columns. Use created_by_fk with the user " - "ID from get_instance_info's current_user to find " - "databases created by a specific user.", + "available filter columns.", ) opr: ColumnOperatorEnum = Field( ..., @@ -119,14 +116,12 @@ class DatabaseInfo(BaseModel): None, description="URL of the external management system" ) extra: Dict[str, Any | None] | None = Field(None, description="Extra configuration") - changed_by: str | None = Field(None, description="Last modifier (username)") changed_on: str | datetime | None = Field( None, description="Last modification timestamp" ) changed_on_humanized: str | None = Field( None, description="Humanized modification time" ) - created_by: str | None = Field(None, description="Database creator (username)") created_on: str | datetime | None = Field(None, description="Creation timestamp") created_on_humanized: str | None = Field( None, description="Humanized creation time" @@ -137,14 +132,14 @@ class DatabaseInfo(BaseModel): populate_by_name=True, ) - @model_serializer(mode="wrap", when_used="json") + @model_serializer(mode="wrap") def _filter_fields_by_context(self, serializer: Any, info: Any) -> Dict[str, Any]: """Filter fields based on serialization context. If context contains 'select_columns', only include those fields. Otherwise, include all fields (default behavior). """ - data = serializer(self) + data = filter_user_directory_fields(serializer(self)) if info.context and isinstance(info.context, dict): select_columns = info.context.get("select_columns") @@ -349,16 +344,8 @@ def serialize_database_object(database: Any) -> DatabaseInfo | None: is_managed_externally=getattr(database, "is_managed_externally", None), external_url=getattr(database, "external_url", None), extra=_parse_json_field(database, "extra"), - changed_by=getattr(database, "changed_by_name", None) - or ( - str(database.changed_by) if getattr(database, "changed_by", None) else None - ), changed_on=getattr(database, "changed_on", None), changed_on_humanized=_humanize_timestamp(getattr(database, "changed_on", None)), - created_by=getattr(database, "created_by_name", None) - or ( - str(database.created_by) if getattr(database, "created_by", None) else None - ), created_on=getattr(database, "created_on", None), created_on_humanized=_humanize_timestamp(getattr(database, "created_on", None)), ) diff --git a/superset/mcp_service/dataset/schemas.py b/superset/mcp_service/dataset/schemas.py index ff25404427e..3a8216c1d4a 100644 --- a/superset/mcp_service/dataset/schemas.py +++ b/superset/mcp_service/dataset/schemas.py @@ -38,11 +38,10 @@ from pydantic import ( from superset.daos.base import ColumnOperator, ColumnOperatorEnum from superset.mcp_service.common.cache_schemas import MetadataCacheControl from superset.mcp_service.constants import DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE +from superset.mcp_service.privacy import filter_user_directory_fields from superset.mcp_service.system.schemas import ( PaginationInfo, - serialize_user_object, TagInfo, - UserInfo, ) from superset.utils import json @@ -59,7 +58,6 @@ class DatasetFilter(ColumnOperator): "table_name", "schema", "database_name", - "owner", ] = Field( ..., description="Column to filter on. Use get_schema(model_type='dataset') for " @@ -109,22 +107,17 @@ class DatasetInfo(BaseModel): certification_details: str | None = Field( None, description="Certification details or reason" ) - changed_by: str | None = Field(None, description="Last modifier (username)") changed_on: str | datetime | None = Field( None, description="Last modification timestamp" ) changed_on_humanized: str | None = Field( None, description="Humanized modification time" ) - created_by: str | None = Field(None, description="Dataset creator (username)") created_on: str | datetime | None = Field(None, description="Creation timestamp") created_on_humanized: str | None = Field( None, description="Humanized creation time" ) tags: List[TagInfo] = Field(default_factory=list, description="Dataset tags") - owners: List[UserInfo] = Field( - default_factory=list, description="DatasetInfo owners" - ) is_virtual: bool | None = Field( None, description="Whether the dataset is virtual (uses SQL)" ) @@ -158,7 +151,7 @@ class DatasetInfo(BaseModel): populate_by_name=True, # Allow both 'schema' (alias) and 'schema_name' (field) ) - @model_serializer(mode="wrap", when_used="json") + @model_serializer(mode="wrap") def _filter_fields_by_context(self, serializer: Any, info: Any) -> Dict[str, Any]: """Filter fields based on serialization context. @@ -166,7 +159,7 @@ class DatasetInfo(BaseModel): Otherwise, include all fields (default behavior). """ # Get full serialization - data = serializer(self) + data = filter_user_directory_fields(serializer(self)) # Normalize alias: Pydantic serializes as 'schema_name' (field name) # but the DAO column and API convention is 'schema' @@ -448,12 +441,8 @@ def serialize_dataset_object(dataset: Any) -> DatasetInfo | None: description=getattr(dataset, "description", None), certified_by=getattr(dataset, "certified_by", None), certification_details=getattr(dataset, "certification_details", None), - changed_by=getattr(dataset, "changed_by_name", None) - or (str(dataset.changed_by) if getattr(dataset, "changed_by", None) else None), changed_on=getattr(dataset, "changed_on", None), changed_on_humanized=_humanize_timestamp(getattr(dataset, "changed_on", None)), - created_by=getattr(dataset, "created_by_name", None) - or (str(dataset.created_by) if getattr(dataset, "created_by", None) else None), created_on=getattr(dataset, "created_on", None), created_on_humanized=_humanize_timestamp(getattr(dataset, "created_on", None)), tags=[ @@ -462,13 +451,6 @@ def serialize_dataset_object(dataset: Any) -> DatasetInfo | None: ] if getattr(dataset, "tags", None) else [], - owners=[ - info - for owner in getattr(dataset, "owners", []) - if (info := serialize_user_object(owner)) is not None - ] - if getattr(dataset, "owners", None) - else [], is_virtual=getattr(dataset, "is_virtual", None), database_id=getattr(dataset, "database_id", None), uuid=str(getattr(dataset, "uuid", "")) diff --git a/superset/mcp_service/mcp_core.py b/superset/mcp_service/mcp_core.py index 4b4aba29928..7a5a6dabda0 100644 --- a/superset/mcp_service/mcp_core.py +++ b/superset/mcp_service/mcp_core.py @@ -26,6 +26,10 @@ from pydantic import BaseModel from superset.daos.base import BaseDAO from superset.mcp_service.constants import ModelType +from superset.mcp_service.privacy import ( + filter_user_directory_columns, + USER_DIRECTORY_FIELDS, +) from superset.mcp_service.utils import _is_uuid # Type variables for generic model tools @@ -116,12 +120,16 @@ class ModelListCore(BaseCore, Generic[L]): self.output_schema = output_schema self.item_serializer = item_serializer self.filter_type = filter_type - self.default_columns = list(default_columns) # Copy to prevent mutation - self.search_columns = list(search_columns) # Copy to prevent mutation + self.default_columns = filter_user_directory_columns(default_columns) + self.search_columns = filter_user_directory_columns(search_columns) self.list_field_name = list_field_name self.output_list_schema = output_list_schema - self._all_columns = list(all_columns) if all_columns else list(default_columns) - self._sortable_columns = list(sortable_columns) if sortable_columns else [] + self._all_columns = filter_user_directory_columns( + all_columns if all_columns else default_columns + ) + self._sortable_columns = filter_user_directory_columns( + sortable_columns if sortable_columns else [] + ) @property def all_columns(self) -> List[str]: @@ -133,6 +141,39 @@ class ModelListCore(BaseCore, Generic[L]): """Return a copy of sortable_columns to prevent external mutation.""" return list(self._sortable_columns) + def _get_columns_to_load( + self, select_columns: Any | None + ) -> tuple[List[str], List[str]]: + """Return requested and loaded columns after privacy filtering.""" + if not select_columns: + return self.default_columns, list(self.default_columns) + + from superset.mcp_service.utils.schema_utils import parse_json_or_list + + parsed_columns = parse_json_or_list(select_columns, param_name="select_columns") + columns_to_load = filter_user_directory_columns(parsed_columns) + if not columns_to_load: + raise ValueError("select_columns contains no valid columns") + + return columns_to_load, list(columns_to_load) + + def _validate_order_column(self, order_column: str | None) -> None: + """Reject privacy-filtered or unknown sort columns. + + Validation is skipped when no sortable_columns were declared, to preserve + backward-compatible passthrough behaviour for tools that rely on DAO-level + sort handling. + """ + if ( + order_column + and self._sortable_columns + and order_column not in self._sortable_columns + ): + raise ValueError( + f"Invalid order_column '{order_column}'. " + f"Allowed columns: {', '.join(self._sortable_columns)}" + ) + def run_tool( self, filters: Any | None = None, @@ -150,22 +191,13 @@ class ModelListCore(BaseCore, Generic[L]): # Parse filters using generic utility (accepts JSON string or object) from superset.mcp_service.utils.schema_utils import ( - parse_json_or_list, parse_json_or_passthrough, ) filters = parse_json_or_passthrough(filters, param_name="filters") # Parse select_columns using generic utility (accepts JSON, list, or CSV) - if select_columns: - select_columns = parse_json_or_list( - select_columns, param_name="select_columns" - ) - columns_to_load = list(select_columns) - columns_requested = select_columns - else: - columns_to_load = list(self.default_columns) - columns_requested = self.default_columns + columns_requested, columns_to_load = self._get_columns_to_load(select_columns) # Ensure computed columns have their dependencies loaded. # Humanized timestamps are derived from their raw counterparts — @@ -178,6 +210,8 @@ class ModelListCore(BaseCore, Generic[L]): if computed in columns_to_load and dependency not in columns_to_load: columns_to_load.append(dependency) + self._validate_order_column(order_column) + # Query the DAO items: List[Any] items, total_count = self.dao_class.list( @@ -567,13 +601,18 @@ class ModelGetSchemaCore(BaseCore, Generic[S]): self.model_type = model_type self.dao_class = dao_class self.output_schema = output_schema - self.select_columns = select_columns - self.sortable_columns = sortable_columns - self.default_columns = default_columns - self.search_columns = search_columns + self.select_columns = [ + column + for column in select_columns + if getattr(column, "name", None) not in USER_DIRECTORY_FIELDS + ] + self.sortable_columns = filter_user_directory_columns(sortable_columns) + self.default_columns = filter_user_directory_columns(default_columns) + self.search_columns = filter_user_directory_columns(search_columns) self.default_sort = default_sort self.default_sort_direction = default_sort_direction - self.exclude_filter_columns = exclude_filter_columns or set() + self.exclude_filter_columns = set(exclude_filter_columns or set()) + self.exclude_filter_columns.update(USER_DIRECTORY_FIELDS) def _get_filter_columns(self) -> Dict[str, List[str]]: """Get filterable columns and operators from the DAO.""" diff --git a/superset/mcp_service/privacy.py b/superset/mcp_service/privacy.py new file mode 100644 index 00000000000..dd26b9ee92e --- /dev/null +++ b/superset/mcp_service/privacy.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Privacy helpers for MCP user-directory and access-list metadata.""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +USER_DIRECTORY_FIELDS = frozenset( + { + "changed_by", + "changed_by_fk", + "changed_by_name", + "created_by", + "created_by_fk", + "created_by_name", + "last_saved_by", + "last_saved_by_fk", + "last_saved_by_name", + "owner", + "owners", + "roles", + } +) + + +def filter_user_directory_fields(data: dict[str, Any]) -> dict[str, Any]: + """Remove fields that expose users, roles, owners, or access metadata.""" + return { + key: value for key, value in data.items() if key not in USER_DIRECTORY_FIELDS + } + + +def filter_user_directory_columns(columns: Iterable[str]) -> list[str]: + """Remove user-directory columns while preserving order.""" + return [column for column in columns if column not in USER_DIRECTORY_FIELDS] diff --git a/superset/mcp_service/system/schemas.py b/superset/mcp_service/system/schemas.py index ae0fab91e62..582ae84aef9 100644 --- a/superset/mcp_service/system/schemas.py +++ b/superset/mcp_service/system/schemas.py @@ -118,9 +118,7 @@ class InstanceInfo(BaseModel): popular_content: PopularContent current_user: UserInfo | None = Field( None, - description=( - "Use current_user.id with created_by_fk filter to find your own assets." - ), + description="Information about the authenticated user.", ) feature_availability: FeatureAvailability timestamp: datetime diff --git a/superset/mcp_service/utils/permissions_utils.py b/superset/mcp_service/utils/permissions_utils.py index 62d9c3d0840..c4b24afc50d 100644 --- a/superset/mcp_service/utils/permissions_utils.py +++ b/superset/mcp_service/utils/permissions_utils.py @@ -26,6 +26,8 @@ from typing import Any, List, Optional, Set from flask_appbuilder.security.sqla.models import User from pydantic import BaseModel +from superset.mcp_service.privacy import USER_DIRECTORY_FIELDS + logger = logging.getLogger(__name__) # Define sensitive fields by object type @@ -34,24 +36,17 @@ SENSITIVE_FIELDS = { "sql", # Raw SQL queries may contain sensitive logic "extra", # May contain connection strings or credentials "database_id", # Internal database references - "changed_by_fk", # Internal user references - "created_by_fk", # Internal user references }, "chart": { "query_context", # May contain sensitive filters or parameters "cache_key", # Internal cache references - "changed_by_fk", # Internal user references - "created_by_fk", # Internal user references }, "dashboard": { "css", # May contain sensitive styling info - "changed_by_fk", # Internal user references - "created_by_fk", # Internal user references }, "common": { "uuid", # Internal identifiers (keep for some use cases) - "changed_by_fk", # Internal user references - "created_by_fk", # Internal user references + *USER_DIRECTORY_FIELDS, }, } @@ -139,7 +134,7 @@ def get_allowed_fields( user = get_current_user() # Get sensitive fields for this object type - sensitive_fields = SENSITIVE_FIELDS.get(object_type, set()) + sensitive_fields = set(SENSITIVE_FIELDS.get(object_type, set())) sensitive_fields.update(SENSITIVE_FIELDS.get("common", set())) # If no user, only allow non-sensitive fields @@ -155,6 +150,8 @@ def get_allowed_fields( if requested_fields: for field in requested_fields: + if field in USER_DIRECTORY_FIELDS: + continue if field not in sensitive_fields: # Non-sensitive field, always allowed allowed_fields.add(field) @@ -229,6 +226,8 @@ def filter_sensitive_data( # Filter the dictionary filtered_data = {} for key, value in data.items(): + if key in USER_DIRECTORY_FIELDS: + continue if key in allowed_fields: filtered_data[key] = value else: diff --git a/superset/mcp_service/utils/token_utils.py b/superset/mcp_service/utils/token_utils.py index 423bbc6a224..00e6664e729 100644 --- a/superset/mcp_service/utils/token_utils.py +++ b/superset/mcp_service/utils/token_utils.py @@ -255,8 +255,8 @@ def generate_size_reduction_suggestions( current_filters = query_params.get("filters") if not current_filters and "list_" in tool_name: suggestions.append( - "Add filters to narrow down results (e.g., filter by owner, " - "date range, or specific attributes)" + "Add filters to narrow down results (e.g., date range, search term, " + "or specific non-user attributes)" ) # Suggestion 4: Tool-specific suggestions diff --git a/tests/unit_tests/mcp_service/chart/tool/test_generate_chart.py b/tests/unit_tests/mcp_service/chart/tool/test_generate_chart.py index 2a7f1854dfb..7da62a81e82 100644 --- a/tests/unit_tests/mcp_service/chart/tool/test_generate_chart.py +++ b/tests/unit_tests/mcp_service/chart/tool/test_generate_chart.py @@ -395,7 +395,7 @@ class TestChartSerializationEagerLoading: assert result.id == 42 assert result.slice_name == "Test Chart" assert result.tags == [] - assert result.owners == [] + assert "owners" not in result.model_dump() def test_serialize_chart_object_with_certification_fields(self): """serialize_chart_object correctly serializes non-None certification values.""" diff --git a/tests/unit_tests/mcp_service/dashboard/tool/test_dashboard_tools.py b/tests/unit_tests/mcp_service/dashboard/tool/test_dashboard_tools.py index e2af01c4e06..d47c7651bd2 100644 --- a/tests/unit_tests/mcp_service/dashboard/tool/test_dashboard_tools.py +++ b/tests/unit_tests/mcp_service/dashboard/tool/test_dashboard_tools.py @@ -363,6 +363,153 @@ async def test_get_dashboard_info_access_denied(mock_info, mcp_server): assert result.data["error_type"] == "not_found" +@patch("superset.daos.dashboard.DashboardDAO.find_by_id") +@pytest.mark.asyncio +async def test_get_dashboard_info_does_not_expose_access_list_or_roles( + mock_info, mcp_server +): + creator = Mock() + creator.username = "workspace-admin" + + owner_role = Mock() + owner_role.name = "Primary Contributor" + owner = Mock() + owner.id = 2 + owner.username = "daniel" + owner.first_name = "Daniel" + owner.last_name = "Watania" + owner.email = "daniel.watania@preset.io" + owner.active = True + owner.roles = [owner_role] + + dashboard_role = Mock() + dashboard_role.id = 3 + dashboard_role.name = "PresetAlpha" + dashboard_role.permissions = [] + + chart = Mock() + chart.id = 10 + chart.slice_name = "Chart with owner" + chart.viz_type = "table" + chart.datasource_name = "examples" + chart.datasource_type = "table" + chart.description = None + chart.cache_timeout = None + chart.changed_by_name = None + chart.changed_by = None + chart.changed_on = None + chart.created_by_name = None + chart.created_by = None + chart.created_on = None + chart.uuid = None + chart.tags = [] + chart.owners = [owner] + + dashboard = Mock() + dashboard.id = 1 + dashboard.dashboard_title = "Customer Success Home Dashboard" + dashboard.slug = "customer-success-home" + dashboard.description = None + dashboard.css = None + dashboard.certified_by = None + dashboard.certification_details = None + dashboard.json_metadata = None + dashboard.position_json = None + dashboard.published = True + dashboard.is_managed_externally = False + dashboard.external_url = None + dashboard.created_on = None + dashboard.changed_on = None + dashboard.created_by = creator + dashboard.changed_by = creator + dashboard.uuid = None + dashboard.url = "/dashboard/1" + dashboard.created_on_humanized = None + dashboard.changed_on_humanized = None + dashboard.slices = [chart] + dashboard.owners = [owner] + dashboard.tags = [] + dashboard.roles = [dashboard_role] + + mock_info.return_value = dashboard + + async with Client(mcp_server) as client: + result = await client.call_tool( + "get_dashboard_info", {"request": {"identifier": 1}} + ) + + assert result.data["dashboard_title"] == "Customer Success Home Dashboard" + assert "created_by" not in result.data + assert "changed_by" not in result.data + assert "owners" not in result.data + assert "roles" not in result.data + assert "created_by" not in result.data["charts"][0] + assert "changed_by" not in result.data["charts"][0] + assert "owners" not in result.data["charts"][0] + + +@patch("superset.daos.dashboard.DashboardDAO.list") +@pytest.mark.asyncio +async def test_list_dashboards_omits_requested_user_directory_fields( + mock_list, mcp_server +): + dashboard = Mock() + dashboard.id = 1 + dashboard.dashboard_title = "Customer Success Home Dashboard" + dashboard.slug = "customer-success-home" + dashboard.url = "/dashboard/1" + dashboard.published = True + dashboard.changed_by_name = "workspace-admin" + dashboard.changed_on = None + dashboard.changed_on_humanized = None + dashboard.created_by_name = "workspace-admin" + dashboard.created_on = None + dashboard.created_on_humanized = None + dashboard.tags = [] + dashboard.owners = [Mock()] + dashboard.slices = [] + dashboard.description = None + dashboard.css = None + dashboard.certified_by = None + dashboard.certification_details = None + dashboard.json_metadata = None + dashboard.position_json = None + dashboard.is_managed_externally = False + dashboard.external_url = None + dashboard.uuid = "test-dashboard-uuid-1" + dashboard.roles = [Mock()] + dashboard._mapping = {} + mock_list.return_value = ([dashboard], 1) + + async with Client(mcp_server) as client: + request = ListDashboardsRequest( + page=1, + page_size=10, + select_columns=[ + "id", + "dashboard_title", + "owners", + "roles", + "created_by", + "changed_by", + ], + ) + result = await client.call_tool( + "list_dashboards", {"request": request.model_dump()} + ) + + data = json.loads(result.content[0].text) + dashboard_data = data["dashboards"][0] + assert dashboard_data == { + "id": 1, + "dashboard_title": "Customer Success Home Dashboard", + } + for field in ("owners", "roles", "created_by", "changed_by"): + assert field not in data["columns_requested"] + assert field not in data["columns_loaded"] + assert field not in data["columns_available"] + + # TODO (Phase 3+): Add tests for get_dashboard_available_filters tool diff --git a/tests/unit_tests/mcp_service/database/tool/test_database_tools.py b/tests/unit_tests/mcp_service/database/tool/test_database_tools.py index 78ae81c9962..b98413fc269 100644 --- a/tests/unit_tests/mcp_service/database/tool/test_database_tools.py +++ b/tests/unit_tests/mcp_service/database/tool/test_database_tools.py @@ -165,6 +165,54 @@ async def test_list_databases_with_filters(mock_list, mcp_server): assert len(data["databases"]) == 1 +@patch("superset.daos.database.DatabaseDAO.list") +@pytest.mark.asyncio +async def test_list_databases_does_not_expose_user_directory_fields( + mock_list, mcp_server +) -> None: + """Test database listing does not expose creator/modifier fields.""" + database = create_mock_database() + database._mapping = { + "id": database.id, + "database_name": database.database_name, + "created_by": database.created_by_name, + "created_by_fk": 1, + "changed_by": database.changed_by_name, + "changed_by_fk": 1, + } + mock_list.return_value = ([database], 1) + + async with Client(mcp_server) as client: + request = ListDatabasesRequest( + page=1, + page_size=10, + select_columns=[ + "id", + "database_name", + "created_by", + "created_by_fk", + "changed_by", + "changed_by_fk", + ], + ) + result = await client.call_tool( + "list_databases", {"request": request.model_dump()} + ) + + data = json.loads(result.content[0].text) + assert data["columns_requested"] == ["id", "database_name"] + assert data["columns_loaded"] == ["id", "database_name"] + assert data["databases"] == [{"id": 1, "database_name": "examples"}] + + +def test_database_filter_rejects_user_directory_fields() -> None: + """Test user directory fields cannot be used for database filters.""" + with pytest.raises(ValueError, match="created_by_fk"): + ListDatabasesRequest( + filters=[{"col": "created_by_fk", "opr": "eq", "value": 1}], + ) + + @patch("superset.daos.database.DatabaseDAO.list") @pytest.mark.asyncio async def test_list_databases_api_error(mock_list, mcp_server): @@ -192,6 +240,8 @@ async def test_get_database_info_basic(mock_find, mcp_server): assert data["id"] == 1 assert data["database_name"] == "examples" assert data["backend"] == "postgresql" + assert "created_by" not in data + assert "changed_by" not in data @patch("superset.daos.database.DatabaseDAO.find_by_id") diff --git a/tests/unit_tests/mcp_service/system/tool/test_get_current_user.py b/tests/unit_tests/mcp_service/system/tool/test_get_current_user.py index 01ac357f3eb..4b57fa13f69 100644 --- a/tests/unit_tests/mcp_service/system/tool/test_get_current_user.py +++ b/tests/unit_tests/mcp_service/system/tool/test_get_current_user.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -"""Tests for current_user in get_instance_info and created_by_fk filtering.""" +"""Tests for current_user in get_instance_info and user-directory filtering.""" from unittest.mock import Mock, patch @@ -304,24 +304,14 @@ class TestGetInstanceInfoCurrentUserViaMCP: # --------------------------------------------------------------------------- -# Filter schema tests: created_by_fk +# Filter schema tests: user-directory fields # --------------------------------------------------------------------------- -def test_chart_filter_accepts_created_by_fk(): - """Test that ChartFilter accepts created_by_fk as a valid column.""" - f = ChartFilter(col="created_by_fk", opr="eq", value=42) - assert f.col == "created_by_fk" - assert f.opr == "eq" - assert f.value == 42 - - -def test_chart_filter_created_by_fk_with_ne_operator(): - """Test created_by_fk with 'ne' (not equal) operator.""" - f = ChartFilter(col="created_by_fk", opr="ne", value=1) - assert f.col == "created_by_fk" - assert f.opr == "ne" - assert f.value == 1 +def test_chart_filter_rejects_created_by_fk() -> None: + """Test that ChartFilter rejects user-directory columns.""" + with pytest.raises(ValidationError): + ChartFilter(col="created_by_fk", opr="eq", value=42) def test_chart_filter_rejects_invalid_column(): @@ -330,20 +320,10 @@ def test_chart_filter_rejects_invalid_column(): ChartFilter(col="nonexistent_column", opr="eq", value=42) -def test_dashboard_filter_accepts_created_by_fk(): - """Test that DashboardFilter accepts created_by_fk as a valid column.""" - f = DashboardFilter(col="created_by_fk", opr="eq", value=42) - assert f.col == "created_by_fk" - assert f.opr == "eq" - assert f.value == 42 - - -def test_dashboard_filter_created_by_fk_with_ne_operator(): - """Test created_by_fk with 'ne' (not equal) operator on dashboards.""" - f = DashboardFilter(col="created_by_fk", opr="ne", value=1) - assert f.col == "created_by_fk" - assert f.opr == "ne" - assert f.value == 1 +def test_dashboard_filter_rejects_created_by_fk(): + """Test that DashboardFilter rejects user-directory columns.""" + with pytest.raises(ValidationError): + DashboardFilter(col="created_by_fk", opr="eq", value=42) def test_dashboard_filter_rejects_invalid_column(): @@ -366,6 +346,6 @@ def test_chart_filter_existing_columns_still_work(): def test_dashboard_filter_existing_columns_still_work(): """Test that pre-existing dashboard filter columns are not broken.""" - for col in ("dashboard_title", "published", "created_by_fk"): + for col in ("dashboard_title", "published", "favorite"): f = DashboardFilter(col=col, opr="eq", value="test") assert f.col == col diff --git a/tests/unit_tests/mcp_service/system/tool/test_get_schema.py b/tests/unit_tests/mcp_service/system/tool/test_get_schema.py index 20abfdd7999..bd139da23dc 100644 --- a/tests/unit_tests/mcp_service/system/tool/test_get_schema.py +++ b/tests/unit_tests/mcp_service/system/tool/test_get_schema.py @@ -305,6 +305,40 @@ class TestGetSchemaToolViaClient: assert uuid_col is not None assert uuid_col["is_default"] is False + @patch("superset.daos.dashboard.DashboardDAO.get_filterable_columns_and_operators") + @pytest.mark.asyncio + async def test_get_schema_omits_user_directory_columns( + self, mock_filters, mcp_server + ): + """Test that schema discovery does not advertise user/access fields.""" + mock_filters.return_value = { + "dashboard_title": ["eq", "ilike"], + "owner": ["rel_m_m"], + "published": ["eq"], + } + + async with Client(mcp_server) as client: + result = await client.call_tool( + "get_schema", {"request": {"model_type": "dashboard"}} + ) + + data = json.loads(result.content[0].text) + info = data["schema_info"] + select_column_names = {column["name"] for column in info["select_columns"]} + + for field in ( + "owners", + "roles", + "created_by", + "created_by_fk", + "changed_by", + "changed_by_fk", + "owner", + ): + assert field not in select_column_names + assert field not in info["filter_columns"] + assert field not in info["sortable_columns"] + class TestGetSchemaEdgeCases: """Test edge cases for get_schema tool.""" diff --git a/tests/unit_tests/mcp_service/system/tool/test_mcp_core.py b/tests/unit_tests/mcp_service/system/tool/test_mcp_core.py index 945b5fea084..54c4906eaea 100644 --- a/tests/unit_tests/mcp_service/system/tool/test_mcp_core.py +++ b/tests/unit_tests/mcp_service/system/tool/test_mcp_core.py @@ -27,6 +27,7 @@ from superset.mcp_service.mcp_core import ( ModelGetSchemaCore, ModelListCore, ) +from superset.mcp_service.privacy import USER_DIRECTORY_FIELDS # Dummy Pydantic output schema @@ -125,6 +126,62 @@ def test_model_list_tool_with_filters_and_columns(): assert "id" in result.columns_loaded +def test_model_list_tool_rejects_only_user_directory_select_columns(): + tool = ModelListCore( + dao_class=DummyDAO, + output_schema=DummyOutputSchema, + item_serializer=dummy_serializer, + filter_type=None, + default_columns=["id", "name"], + search_columns=["name"], + list_field_name="items", + output_list_schema=DummyListSchema, + ) + + with pytest.raises(ValueError, match="contains no valid columns"): + tool.run_tool(select_columns=["created_by", "owners"]) + + +def test_model_list_tool_rejects_private_order_column(): + tool = ModelListCore( + dao_class=DummyDAO, + output_schema=DummyOutputSchema, + item_serializer=dummy_serializer, + filter_type=None, + default_columns=["id", "name"], + search_columns=["name"], + list_field_name="items", + output_list_schema=DummyListSchema, + sortable_columns=["id", "name", "created_by_fk"], + ) + + with pytest.raises(ValueError, match="order_column"): + tool.run_tool(order_column="created_by_fk") + + +def test_model_list_tool_allows_order_column_when_sortable_columns_not_declared(): + """When sortable_columns is not provided, order_column is passed through to the DAO + without validation (backward-compatible behaviour).""" + tool = ModelListCore( + dao_class=DummyDAO, + output_schema=DummyOutputSchema, + item_serializer=dummy_serializer, + filter_type=None, + default_columns=["id", "name"], + search_columns=["name"], + list_field_name="items", + output_list_schema=DummyListSchema, + # sortable_columns intentionally omitted + ) + # Should not raise even though "name" is not in the (empty) sortable list + tool.run_tool(order_column="name") + + +def test_user_directory_fields_include_last_saved_relationships(): + assert "last_saved_by" in USER_DIRECTORY_FIELDS + assert "last_saved_by_name" in USER_DIRECTORY_FIELDS + + def test_model_list_tool_empty_result(): class EmptyDAO: @classmethod diff --git a/tests/unit_tests/mcp_service/test_mcp_config.py b/tests/unit_tests/mcp_service/test_mcp_config.py index 73f8cf45cbf..f308c60ae04 100644 --- a/tests/unit_tests/mcp_service/test_mcp_config.py +++ b/tests/unit_tests/mcp_service/test_mcp_config.py @@ -64,6 +64,19 @@ def test_get_default_instructions_mentions_feature_availability(): assert "accessible menus" in instructions +def test_get_default_instructions_forbid_disclosing_other_user_access_or_roles() -> ( + None +): + """Test that instructions route access-list questions to workspace admins.""" + instructions = get_default_instructions() + + assert "Do NOT disclose dashboard access lists" in instructions + assert "other users' names, usernames, email addresses" in instructions + assert "current user's own identity details" in instructions + assert "Do NOT use execute_sql to query user, role, owner" in instructions + assert "direct them to their workspace admin" in instructions + + def test_init_fastmcp_server_with_default_app_name(): """Test that default APP_NAME produces Superset branding.""" # Mock Flask app config with default APP_NAME diff --git a/tests/unit_tests/mcp_service/test_privacy.py b/tests/unit_tests/mcp_service/test_privacy.py new file mode 100644 index 00000000000..df29c71e61b --- /dev/null +++ b/tests/unit_tests/mcp_service/test_privacy.py @@ -0,0 +1,84 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tests for MCP user-directory privacy filtering.""" + +import pytest + +from superset.mcp_service.chart.schemas import ChartInfo +from superset.mcp_service.dashboard.schemas import DashboardInfo +from superset.mcp_service.database.schemas import DatabaseInfo +from superset.mcp_service.dataset.schemas import DatasetInfo + + +@pytest.mark.parametrize( + "model", + [ + ChartInfo( + id=1, + slice_name="Revenue", + created_by="creator", + changed_by="modifier", + owners=[], + ), + DashboardInfo( + id=1, + dashboard_title="Executive Dashboard", + created_by="creator", + changed_by="modifier", + owners=[], + roles=[], + ), + DatasetInfo( + id=1, + table_name="sales", + created_by="creator", + changed_by="modifier", + owners=[], + ), + DatabaseInfo( + id=1, + database_name="warehouse", + created_by="creator", + changed_by="modifier", + ), + ], +) +def test_user_directory_fields_removed_from_python_and_json_dumps(model): + """Privacy fields are stripped regardless of Pydantic serialization mode.""" + for mode in (None, "json"): + data = model.model_dump() if mode is None else model.model_dump(mode=mode) + + for field in ("created_by", "changed_by", "owners", "roles"): + assert field not in data + + +@pytest.mark.parametrize( + ("schema_cls", "omitted_fields"), + [ + (ChartInfo, {"created_by", "changed_by", "changed_by_name", "owners"}), + (DashboardInfo, {"created_by", "changed_by", "owners", "roles"}), + (DatasetInfo, {"created_by", "changed_by", "owners"}), + (DatabaseInfo, {"created_by", "changed_by"}), + ], +) +def test_user_directory_fields_removed_from_json_schema(schema_cls, omitted_fields): + """Privacy-only response fields should not appear in the published schema.""" + properties = schema_cls.model_json_schema().get("properties", {}) + + for field in omitted_fields: + assert field not in properties diff --git a/tests/unit_tests/mcp_service/test_schema_discovery.py b/tests/unit_tests/mcp_service/test_schema_discovery.py new file mode 100644 index 00000000000..13abe35b467 --- /dev/null +++ b/tests/unit_tests/mcp_service/test_schema_discovery.py @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tests for MCP schema discovery helpers.""" + +from superset.mcp_service.common.schema_discovery import ( + CHART_EXTRA_COLUMNS, + ColumnMetadata, + get_columns_from_model, +) +from superset.models.slice import Slice + + +def test_get_columns_from_model_excludes_matching_extra_columns(): + columns = get_columns_from_model( + Slice, + default_columns=["id"], + extra_columns={ + "owners": ColumnMetadata(**CHART_EXTRA_COLUMNS["owners"].model_dump()), + "url": ColumnMetadata(**CHART_EXTRA_COLUMNS["url"].model_dump()), + }, + exclude_columns={"owners"}, + ) + + column_names = {column.name for column in columns} + + assert "id" in column_names + assert "url" in column_names + assert "owners" not in column_names diff --git a/tests/unit_tests/mcp_service/utils/test_permissions_utils.py b/tests/unit_tests/mcp_service/utils/test_permissions_utils.py new file mode 100644 index 00000000000..160fccb6344 --- /dev/null +++ b/tests/unit_tests/mcp_service/utils/test_permissions_utils.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tests for MCP field-level permission helpers.""" + +from unittest.mock import Mock + +from superset.mcp_service.utils.permissions_utils import ( + apply_field_permissions_to_columns, + filter_sensitive_data, + get_allowed_fields, +) + + +def test_get_allowed_fields_always_denies_user_directory_fields(): + user = Mock() + user.roles = [] + + allowed_fields = get_allowed_fields( + "dashboard", + user=user, + requested_fields=["id", "dashboard_title", "owners", "roles", "created_by"], + ) + + assert allowed_fields == {"id", "dashboard_title"} + + +def test_filter_sensitive_data_strips_user_directory_fields_even_if_allowed(): + data = { + "id": 1, + "dashboard_title": "Executive Dashboard", + "owners": [{"username": "admin"}], + "roles": [{"name": "Admin"}], + "created_by": "admin", + } + + filtered = filter_sensitive_data( + data, + "dashboard", + allowed_fields=set(data), + ) + + assert filtered == { + "id": 1, + "dashboard_title": "Executive Dashboard", + } + + +def test_apply_field_permissions_to_columns_omits_user_directory_fields(): + user = Mock() + user.roles = [] + + columns = apply_field_permissions_to_columns( + ["id", "slice_name", "owners", "changed_by_fk"], + "chart", + user=user, + ) + + assert columns == ["id", "slice_name"] diff --git a/tests/unit_tests/mcp_service/utils/test_token_utils.py b/tests/unit_tests/mcp_service/utils/test_token_utils.py index 2e540902c93..9a49264bd93 100644 --- a/tests/unit_tests/mcp_service/utils/test_token_utils.py +++ b/tests/unit_tests/mcp_service/utils/test_token_utils.py @@ -220,6 +220,8 @@ class TestGenerateSizeReductionSuggestions: token_limit=25000, ) assert any("filter" in s.lower() for s in suggestions) + assert not any("owner" in s.lower() for s in suggestions) + assert any("non-user attributes" in s for s in suggestions) def test_tool_specific_suggestions_execute_sql(self) -> None: """Should provide SQL-specific suggestions for execute_sql."""