From e17cf3c8086f7393e4dca820c44afb8a2d56b2f4 Mon Sep 17 00:00:00 2001 From: Amin Ghadersohi Date: Thu, 9 Apr 2026 17:25:46 -0400 Subject: [PATCH] fix(mcp): wire up compact schema serialization for search_tools results (#39229) --- superset/mcp_service/mcp_config.py | 14 +- superset/mcp_service/server.py | 120 ++++++- .../mcp_service/test_tool_search_transform.py | 340 ++++++++++++++++++ 3 files changed, 472 insertions(+), 2 deletions(-) diff --git a/superset/mcp_service/mcp_config.py b/superset/mcp_service/mcp_config.py index 7142e69bc48..caa1d1fc2d0 100644 --- a/superset/mcp_service/mcp_config.py +++ b/superset/mcp_service/mcp_config.py @@ -248,9 +248,19 @@ MCP_RESPONSE_SIZE_CONFIG: Dict[str, Any] = { # - "bm25": Natural language search using BM25 ranking (recommended) # - "regex": Pattern-based search using regular expressions # +# Schema Compaction: +# ------------------ +# When compact_schemas=True, search results strip $defs sections and replace +# $ref pointers with {"type": "object"}, and truncate tool descriptions. +# This reduces per-search token cost by ~40-60%. Full schemas remain +# available when the tool is actually invoked via call_tool. +# # Rollback: # --------- -# Set enabled=False in superset_config.py for instant rollback. +# - Set enabled=False to disable tool search entirely (full catalog exposed). +# - Set compact_schemas=False to disable schema compaction only (full $defs +# and descriptions in search results, tool search still active). +# - Set max_description_length=0 to disable description truncation only. # ============================================================================= MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = { "enabled": True, # Enabled by default — reduces initial context by ~70% @@ -262,6 +272,8 @@ MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = { ], "search_tool_name": "search_tools", # Name of the search tool "call_tool_name": "call_tool", # Name of the call proxy tool + "compact_schemas": True, # Strip $defs and simplify $ref in search results + "max_description_length": 300, # Truncate tool descriptions (0 = no truncation) } diff --git a/superset/mcp_service/server.py b/superset/mcp_service/server.py index 03f089ddff0..8f0d6feccaf 100644 --- a/superset/mcp_service/server.py +++ b/superset/mcp_service/server.py @@ -175,6 +175,84 @@ def _strip_titles(obj: Any, in_properties_map: bool = False) -> Any: return obj +def _simplify_optional_union(result: dict[str, Any]) -> dict[str, Any]: + """Collapse ``anyOf``/``oneOf`` with exactly one non-null variant. + + Pydantic encodes ``Optional[X]`` as ``{"anyOf": [, {"type": "null"}]}``. + This replaces the union with the non-null variant while preserving any + ``description`` or ``default`` from the parent node. + """ + for union_key in ("anyOf", "oneOf"): + variants = result.get(union_key) + if not isinstance(variants, list) or len(variants) != 2: + continue + non_null = [v for v in variants if v.get("type") != "null"] + if len(non_null) != 1: + continue + simplified = dict(non_null[0]) + for keep in ("description", "default"): + if keep in result and keep not in simplified: + simplified[keep] = result[keep] + result.pop(union_key) + result.pop("description", None) + result.pop("default", None) + result.update(simplified) + return result + + +def _compact_schema(obj: Any) -> Any: + """Collapse ``$defs`` and ``$ref`` pointers in a JSON Schema. + + Search results only need enough schema detail for the LLM to identify + which tool to call and construct a basic invocation. Full schemas + (with all nested model definitions) are still available when the tool + is actually invoked via ``call_tool``. + + Transformations applied: + + * ``$defs`` sections are removed entirely. + * ``{"$ref": "..."}`` is replaced with ``{"type": "object"}``. + * ``anyOf``/``oneOf`` lists containing only a ``$ref`` and + ``{"type": "null"}`` (Pydantic's Optional encoding) are collapsed + to the simplified non-null variant. + """ + if isinstance(obj, list): + return [_compact_schema(item) for item in obj] + if not isinstance(obj, dict): + return obj + + # Direct $ref → generic object type + if "$ref" in obj: + replacement: dict[str, Any] = {"type": "object"} + if desc := obj.get("description"): + replacement["description"] = desc + return replacement + + result: dict[str, Any] = {} + for key, value in obj.items(): + if key == "$defs": + continue + result[key] = _compact_schema(value) + + return _simplify_optional_union(result) + + +def _truncate_description(text: str, max_length: int) -> str: + """Truncate a tool description for search results. + + Cuts at the last sentence boundary before *max_length*, or at + *max_length* with an ellipsis if no sentence boundary is found. + """ + if not text or len(text) <= max_length: + return text + # Try to cut at the last sentence boundary + truncated = text[:max_length] + last_period = truncated.rfind(". ") + if last_period > max_length // 2: + return truncated[: last_period + 1] + return truncated.rstrip() + "..." + + def _serialize_tools_without_output_schema( tools: Sequence[Any], ) -> list[dict[str, Any]]: @@ -194,6 +272,46 @@ def _serialize_tools_without_output_schema( return results +def _create_search_result_serializer( + config: dict[str, Any], +) -> Any: + """Build a search-result serializer from the tool-search config. + + When ``compact_schemas`` is enabled (default), the serializer applies + additional compaction on top of the base serialization: + + * ``$defs`` sections and ``$ref`` pointers are collapsed + (see :func:`_compact_schema`). + * Tool descriptions are truncated to ``max_description_length`` chars. + + This reduces per-search-call token cost by ~40-60 % while keeping + enough detail for the LLM to identify the right tool and construct + a basic invocation. + """ + compact = config.get("compact_schemas", True) + # Description truncation defaults to 300 when compact_schemas is on, + # but is disabled when compact_schemas is off (unless explicitly set). + max_desc_default = 300 if compact else 0 + max_desc = config.get("max_description_length", max_desc_default) + + if not compact and not max_desc: + return _serialize_tools_without_output_schema + + def _serializer(tools: Sequence[Any]) -> list[dict[str, Any]]: + results = _serialize_tools_without_output_schema(tools) + for data in results: + if compact: + if input_schema := data.get("inputSchema"): + data["inputSchema"] = _compact_schema(input_schema) + if max_desc and "description" in data: + data["description"] = _truncate_description( + data["description"], max_desc + ) + return results + + return _serializer + + def _fix_call_tool_arguments(tool: Any) -> Any: """Fix anyOf schema in call_tool ``arguments`` for MCP bridge compatibility. @@ -270,7 +388,7 @@ def _apply_tool_search_transform(mcp_instance: Any, config: dict[str, Any]) -> N "always_visible": config.get("always_visible", []), "search_tool_name": config.get("search_tool_name", "search_tools"), "call_tool_name": config.get("call_tool_name", "call_tool"), - "search_result_serializer": _serialize_tools_without_output_schema, + "search_result_serializer": _create_search_result_serializer(config), } def _make_normalizing_call_tool(transform: Any) -> Tool: diff --git a/tests/unit_tests/mcp_service/test_tool_search_transform.py b/tests/unit_tests/mcp_service/test_tool_search_transform.py index 6000e1c721d..dff91c76f81 100644 --- a/tests/unit_tests/mcp_service/test_tool_search_transform.py +++ b/tests/unit_tests/mcp_service/test_tool_search_transform.py @@ -25,9 +25,12 @@ from fastmcp.server.transforms.search import BM25SearchTransform, RegexSearchTra from superset.mcp_service.mcp_config import MCP_TOOL_SEARCH_CONFIG from superset.mcp_service.server import ( _apply_tool_search_transform, + _compact_schema, + _create_search_result_serializer, _fix_call_tool_arguments, _normalize_call_tool_arguments, _serialize_tools_without_output_schema, + _truncate_description, ) from superset.utils import json @@ -300,3 +303,340 @@ def test_normalize_ignores_keys_not_in_schema(): result = _normalize_call_tool_arguments(arguments, schema) assert isinstance(result["unknown_key"], dict) + + +# -- _compact_schema tests -- + + +def test_compact_schema_removes_defs(): + """$defs section is stripped from the schema.""" + schema = { + "type": "object", + "properties": { + "filters": {"items": {"$ref": "#/$defs/MyFilter"}, "type": "array"}, + }, + "$defs": { + "MyFilter": { + "type": "object", + "properties": {"col": {"type": "string"}}, + } + }, + } + + result = _compact_schema(schema) + + assert "$defs" not in result + assert result["properties"]["filters"]["items"] == {"type": "object"} + + +def test_compact_schema_replaces_ref_with_object(): + """Direct $ref is replaced with {"type": "object"}.""" + schema = {"$ref": "#/$defs/SomeModel"} + + result = _compact_schema(schema) + + assert result == {"type": "object"} + + +def test_compact_schema_preserves_ref_description(): + """$ref replacement preserves sibling description if present.""" + schema = {"$ref": "#/$defs/SomeModel", "description": "A model"} + + result = _compact_schema(schema) + + assert result == {"type": "object", "description": "A model"} + + +def test_compact_schema_simplifies_optional_ref(): + """anyOf with $ref and null is collapsed to the non-null variant.""" + schema = { + "anyOf": [ + {"$ref": "#/$defs/SomeModel"}, + {"type": "null"}, + ], + "description": "Optional model", + } + + result = _compact_schema(schema) + + assert "anyOf" not in result + assert result["type"] == "object" + assert result["description"] == "Optional model" + + +def test_compact_schema_simplifies_optional_primitive(): + """anyOf with primitive type and null is collapsed.""" + schema = { + "anyOf": [ + {"type": "integer"}, + {"type": "null"}, + ], + "default": None, + } + + result = _compact_schema(schema) + + assert "anyOf" not in result + assert result["type"] == "integer" + assert result["default"] is None + + +def test_compact_schema_preserves_multi_variant_anyof(): + """anyOf with >2 variants or no null is left unchanged.""" + schema = { + "anyOf": [ + {"type": "integer"}, + {"type": "string"}, + ] + } + + result = _compact_schema(schema) + + assert "anyOf" in result + assert len(result["anyOf"]) == 2 + + +def test_compact_schema_nested_in_items(): + """$ref nested inside items/properties is also replaced.""" + schema = { + "type": "object", + "properties": { + "filters": { + "type": "array", + "items": {"$ref": "#/$defs/Filter"}, + }, + "name": {"type": "string"}, + }, + } + + result = _compact_schema(schema) + + assert result["properties"]["filters"]["items"] == {"type": "object"} + assert result["properties"]["name"] == {"type": "string"} + + +def test_compact_schema_passthrough_simple(): + """Simple schema without $defs/$ref passes through unchanged.""" + schema = { + "type": "object", + "properties": { + "page": {"type": "integer", "default": 1}, + "search": {"type": "string"}, + }, + } + + result = _compact_schema(schema) + + assert result == schema + + +def test_compact_schema_handles_non_dict(): + """Non-dict inputs pass through unchanged.""" + assert _compact_schema("hello") == "hello" + assert _compact_schema(42) == 42 + assert _compact_schema(None) is None + assert _compact_schema([1, 2]) == [1, 2] + + +# -- _truncate_description tests -- + + +def test_truncate_description_short_text(): + """Short text is returned as-is.""" + assert _truncate_description("Hello world", 300) == "Hello world" + + +def test_truncate_description_cuts_at_sentence(): + """Long text is cut at the last sentence boundary.""" + text = "First sentence. Second sentence. Third sentence that is quite long." + result = _truncate_description(text, 40) + assert result == "First sentence. Second sentence." + + +def test_truncate_description_ellipsis_fallback(): + """When no sentence boundary, truncates with ellipsis.""" + text = "A very long single sentence without periods that goes on and on" + result = _truncate_description(text, 30) + assert result.endswith("...") + assert len(result) <= 33 # 30 + "..." + + +def test_truncate_description_empty(): + """Empty string returns empty.""" + assert _truncate_description("", 300) == "" + + +def test_truncate_description_zero_max(): + """Zero max_length produces ellipsis; the serializer skips calling this.""" + text = "Some text" + # _truncate_description(text, 0) truncates to 0 chars and appends "...". + # The caller (_create_search_result_serializer) skips calling it when + # max_desc=0 so this edge case only matters for direct callers. + result = _truncate_description(text, 0) + assert result == "..." + + +# -- _create_search_result_serializer tests -- + + +def _make_mock_tool(name, description, input_schema): + """Helper to create a mock tool for serializer tests.""" + mock_tool = MagicMock() + mock_mcp_tool = MagicMock() + mock_mcp_tool.model_dump.return_value = { + "name": name, + "description": description, + "inputSchema": input_schema, + } + mock_tool.to_mcp_tool.return_value = mock_mcp_tool + return mock_tool + + +def test_create_serializer_compacts_schemas(): + """Compact serializer strips $defs and replaces $ref.""" + tool = _make_mock_tool( + "list_charts", + "List charts with filtering.", + { + "type": "object", + "properties": { + "filters": { + "type": "array", + "items": {"$ref": "#/$defs/ChartFilter"}, + } + }, + "$defs": { + "ChartFilter": { + "type": "object", + "properties": {"col": {"type": "string"}}, + } + }, + }, + ) + + serializer = _create_search_result_serializer({"compact_schemas": True}) + result = serializer([tool]) + + assert len(result) == 1 + schema = result[0]["inputSchema"] + assert "$defs" not in schema + assert schema["properties"]["filters"]["items"] == {"type": "object"} + + +def test_create_serializer_truncates_descriptions(): + """Compact serializer truncates long tool descriptions.""" + long_desc = "Short intro. " + "x" * 500 + tool = _make_mock_tool( + "generate_chart", + long_desc, + {"type": "object"}, + ) + + serializer = _create_search_result_serializer({"max_description_length": 50}) + result = serializer([tool]) + + assert len(result[0]["description"]) <= 53 # 50 + potential "..." + + +def test_create_serializer_disabled(): + """When compact_schemas=False and max_description_length=0, no compaction.""" + tool = _make_mock_tool( + "test_tool", + "A long description " * 20, + { + "type": "object", + "$defs": {"Model": {"type": "object"}}, + }, + ) + + serializer = _create_search_result_serializer( + {"compact_schemas": False, "max_description_length": 0} + ) + result = serializer([tool]) + + # $defs should still be present (compaction disabled) + assert "$defs" in result[0]["inputSchema"] + # Description should not be truncated + assert result[0]["description"] == "A long description " * 20 + + +def test_create_serializer_compact_false_disables_truncation(): + """compact_schemas=False also disables description truncation by default.""" + long_desc = "A very long description. " * 30 + tool = _make_mock_tool( + "test_tool", + long_desc, + {"type": "object", "$defs": {"Model": {"type": "object"}}}, + ) + + serializer = _create_search_result_serializer({"compact_schemas": False}) + result = serializer([tool]) + + # $defs should still be present (compaction disabled) + assert "$defs" in result[0]["inputSchema"] + # Description should NOT be truncated (max_desc defaults to 0 when compact=False) + assert result[0]["description"] == long_desc + + +def test_create_serializer_compact_false_explicit_truncation(): + """compact_schemas=False with explicit max_description_length still truncates.""" + long_desc = "First sentence. " + "x" * 500 + tool = _make_mock_tool( + "test_tool", + long_desc, + {"type": "object", "$defs": {"Model": {"type": "object"}}}, + ) + + serializer = _create_search_result_serializer( + {"compact_schemas": False, "max_description_length": 200} + ) + result = serializer([tool]) + + # $defs should still be present (compaction disabled) + assert "$defs" in result[0]["inputSchema"] + # Description SHOULD be truncated (explicitly requested) + assert len(result[0]["description"]) <= 203 + + +def test_create_serializer_uses_config_defaults(): + """Empty config uses defaults (compact=True, max_desc=300).""" + long_desc = "First sentence. " + "x" * 500 + tool = _make_mock_tool( + "test_tool", + long_desc, + { + "type": "object", + "$defs": {"Model": {"type": "object"}}, + "properties": {"x": {"$ref": "#/$defs/Model"}}, + }, + ) + + serializer = _create_search_result_serializer({}) + result = serializer([tool]) + + assert "$defs" not in result[0]["inputSchema"] + assert len(result[0]["description"]) <= 303 + + +def test_apply_transform_uses_compact_serializer(): + """_apply_tool_search_transform wires _create_search_result_serializer.""" + mock_mcp = MagicMock() + config = { + "strategy": "bm25", + "max_results": 5, + "always_visible": ["health_check"], + "search_tool_name": "search_tools", + "call_tool_name": "call_tool", + "compact_schemas": True, + "max_description_length": 200, + } + + _apply_tool_search_transform(mock_mcp, config) + + mock_mcp.add_transform.assert_called_once() + transform = mock_mcp.add_transform.call_args[0][0] + # The serializer should NOT be the plain _serialize_tools_without_output_schema + assert ( + transform._search_result_serializer + is not _serialize_tools_without_output_schema + )