fix(mcp): wire up compact schema serialization for search_tools results (#39229)

This commit is contained in:
Amin Ghadersohi
2026-04-09 17:25:46 -04:00
committed by GitHub
parent f49310b8ff
commit e17cf3c808
3 changed files with 472 additions and 2 deletions

View File

@@ -248,9 +248,19 @@ MCP_RESPONSE_SIZE_CONFIG: Dict[str, Any] = {
# - "bm25": Natural language search using BM25 ranking (recommended)
# - "regex": Pattern-based search using regular expressions
#
# Schema Compaction:
# ------------------
# When compact_schemas=True, search results strip $defs sections and replace
# $ref pointers with {"type": "object"}, and truncate tool descriptions.
# This reduces per-search token cost by ~40-60%. Full schemas remain
# available when the tool is actually invoked via call_tool.
#
# Rollback:
# ---------
# Set enabled=False in superset_config.py for instant rollback.
# - Set enabled=False to disable tool search entirely (full catalog exposed).
# - Set compact_schemas=False to disable schema compaction only (full $defs
# and descriptions in search results, tool search still active).
# - Set max_description_length=0 to disable description truncation only.
# =============================================================================
MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = {
"enabled": True, # Enabled by default — reduces initial context by ~70%
@@ -262,6 +272,8 @@ MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = {
],
"search_tool_name": "search_tools", # Name of the search tool
"call_tool_name": "call_tool", # Name of the call proxy tool
"compact_schemas": True, # Strip $defs and simplify $ref in search results
"max_description_length": 300, # Truncate tool descriptions (0 = no truncation)
}

View File

@@ -175,6 +175,84 @@ def _strip_titles(obj: Any, in_properties_map: bool = False) -> Any:
return obj
def _simplify_optional_union(result: dict[str, Any]) -> dict[str, Any]:
"""Collapse ``anyOf``/``oneOf`` with exactly one non-null variant.
Pydantic encodes ``Optional[X]`` as ``{"anyOf": [<X>, {"type": "null"}]}``.
This replaces the union with the non-null variant while preserving any
``description`` or ``default`` from the parent node.
"""
for union_key in ("anyOf", "oneOf"):
variants = result.get(union_key)
if not isinstance(variants, list) or len(variants) != 2:
continue
non_null = [v for v in variants if v.get("type") != "null"]
if len(non_null) != 1:
continue
simplified = dict(non_null[0])
for keep in ("description", "default"):
if keep in result and keep not in simplified:
simplified[keep] = result[keep]
result.pop(union_key)
result.pop("description", None)
result.pop("default", None)
result.update(simplified)
return result
def _compact_schema(obj: Any) -> Any:
"""Collapse ``$defs`` and ``$ref`` pointers in a JSON Schema.
Search results only need enough schema detail for the LLM to identify
which tool to call and construct a basic invocation. Full schemas
(with all nested model definitions) are still available when the tool
is actually invoked via ``call_tool``.
Transformations applied:
* ``$defs`` sections are removed entirely.
* ``{"$ref": "..."}`` is replaced with ``{"type": "object"}``.
* ``anyOf``/``oneOf`` lists containing only a ``$ref`` and
``{"type": "null"}`` (Pydantic's Optional encoding) are collapsed
to the simplified non-null variant.
"""
if isinstance(obj, list):
return [_compact_schema(item) for item in obj]
if not isinstance(obj, dict):
return obj
# Direct $ref → generic object type
if "$ref" in obj:
replacement: dict[str, Any] = {"type": "object"}
if desc := obj.get("description"):
replacement["description"] = desc
return replacement
result: dict[str, Any] = {}
for key, value in obj.items():
if key == "$defs":
continue
result[key] = _compact_schema(value)
return _simplify_optional_union(result)
def _truncate_description(text: str, max_length: int) -> str:
"""Truncate a tool description for search results.
Cuts at the last sentence boundary before *max_length*, or at
*max_length* with an ellipsis if no sentence boundary is found.
"""
if not text or len(text) <= max_length:
return text
# Try to cut at the last sentence boundary
truncated = text[:max_length]
last_period = truncated.rfind(". ")
if last_period > max_length // 2:
return truncated[: last_period + 1]
return truncated.rstrip() + "..."
def _serialize_tools_without_output_schema(
tools: Sequence[Any],
) -> list[dict[str, Any]]:
@@ -194,6 +272,46 @@ def _serialize_tools_without_output_schema(
return results
def _create_search_result_serializer(
config: dict[str, Any],
) -> Any:
"""Build a search-result serializer from the tool-search config.
When ``compact_schemas`` is enabled (default), the serializer applies
additional compaction on top of the base serialization:
* ``$defs`` sections and ``$ref`` pointers are collapsed
(see :func:`_compact_schema`).
* Tool descriptions are truncated to ``max_description_length`` chars.
This reduces per-search-call token cost by ~40-60 % while keeping
enough detail for the LLM to identify the right tool and construct
a basic invocation.
"""
compact = config.get("compact_schemas", True)
# Description truncation defaults to 300 when compact_schemas is on,
# but is disabled when compact_schemas is off (unless explicitly set).
max_desc_default = 300 if compact else 0
max_desc = config.get("max_description_length", max_desc_default)
if not compact and not max_desc:
return _serialize_tools_without_output_schema
def _serializer(tools: Sequence[Any]) -> list[dict[str, Any]]:
results = _serialize_tools_without_output_schema(tools)
for data in results:
if compact:
if input_schema := data.get("inputSchema"):
data["inputSchema"] = _compact_schema(input_schema)
if max_desc and "description" in data:
data["description"] = _truncate_description(
data["description"], max_desc
)
return results
return _serializer
def _fix_call_tool_arguments(tool: Any) -> Any:
"""Fix anyOf schema in call_tool ``arguments`` for MCP bridge compatibility.
@@ -270,7 +388,7 @@ def _apply_tool_search_transform(mcp_instance: Any, config: dict[str, Any]) -> N
"always_visible": config.get("always_visible", []),
"search_tool_name": config.get("search_tool_name", "search_tools"),
"call_tool_name": config.get("call_tool_name", "call_tool"),
"search_result_serializer": _serialize_tools_without_output_schema,
"search_result_serializer": _create_search_result_serializer(config),
}
def _make_normalizing_call_tool(transform: Any) -> Tool:

View File

@@ -25,9 +25,12 @@ from fastmcp.server.transforms.search import BM25SearchTransform, RegexSearchTra
from superset.mcp_service.mcp_config import MCP_TOOL_SEARCH_CONFIG
from superset.mcp_service.server import (
_apply_tool_search_transform,
_compact_schema,
_create_search_result_serializer,
_fix_call_tool_arguments,
_normalize_call_tool_arguments,
_serialize_tools_without_output_schema,
_truncate_description,
)
from superset.utils import json
@@ -300,3 +303,340 @@ def test_normalize_ignores_keys_not_in_schema():
result = _normalize_call_tool_arguments(arguments, schema)
assert isinstance(result["unknown_key"], dict)
# -- _compact_schema tests --
def test_compact_schema_removes_defs():
"""$defs section is stripped from the schema."""
schema = {
"type": "object",
"properties": {
"filters": {"items": {"$ref": "#/$defs/MyFilter"}, "type": "array"},
},
"$defs": {
"MyFilter": {
"type": "object",
"properties": {"col": {"type": "string"}},
}
},
}
result = _compact_schema(schema)
assert "$defs" not in result
assert result["properties"]["filters"]["items"] == {"type": "object"}
def test_compact_schema_replaces_ref_with_object():
"""Direct $ref is replaced with {"type": "object"}."""
schema = {"$ref": "#/$defs/SomeModel"}
result = _compact_schema(schema)
assert result == {"type": "object"}
def test_compact_schema_preserves_ref_description():
"""$ref replacement preserves sibling description if present."""
schema = {"$ref": "#/$defs/SomeModel", "description": "A model"}
result = _compact_schema(schema)
assert result == {"type": "object", "description": "A model"}
def test_compact_schema_simplifies_optional_ref():
"""anyOf with $ref and null is collapsed to the non-null variant."""
schema = {
"anyOf": [
{"$ref": "#/$defs/SomeModel"},
{"type": "null"},
],
"description": "Optional model",
}
result = _compact_schema(schema)
assert "anyOf" not in result
assert result["type"] == "object"
assert result["description"] == "Optional model"
def test_compact_schema_simplifies_optional_primitive():
"""anyOf with primitive type and null is collapsed."""
schema = {
"anyOf": [
{"type": "integer"},
{"type": "null"},
],
"default": None,
}
result = _compact_schema(schema)
assert "anyOf" not in result
assert result["type"] == "integer"
assert result["default"] is None
def test_compact_schema_preserves_multi_variant_anyof():
"""anyOf with >2 variants or no null is left unchanged."""
schema = {
"anyOf": [
{"type": "integer"},
{"type": "string"},
]
}
result = _compact_schema(schema)
assert "anyOf" in result
assert len(result["anyOf"]) == 2
def test_compact_schema_nested_in_items():
"""$ref nested inside items/properties is also replaced."""
schema = {
"type": "object",
"properties": {
"filters": {
"type": "array",
"items": {"$ref": "#/$defs/Filter"},
},
"name": {"type": "string"},
},
}
result = _compact_schema(schema)
assert result["properties"]["filters"]["items"] == {"type": "object"}
assert result["properties"]["name"] == {"type": "string"}
def test_compact_schema_passthrough_simple():
"""Simple schema without $defs/$ref passes through unchanged."""
schema = {
"type": "object",
"properties": {
"page": {"type": "integer", "default": 1},
"search": {"type": "string"},
},
}
result = _compact_schema(schema)
assert result == schema
def test_compact_schema_handles_non_dict():
"""Non-dict inputs pass through unchanged."""
assert _compact_schema("hello") == "hello"
assert _compact_schema(42) == 42
assert _compact_schema(None) is None
assert _compact_schema([1, 2]) == [1, 2]
# -- _truncate_description tests --
def test_truncate_description_short_text():
"""Short text is returned as-is."""
assert _truncate_description("Hello world", 300) == "Hello world"
def test_truncate_description_cuts_at_sentence():
"""Long text is cut at the last sentence boundary."""
text = "First sentence. Second sentence. Third sentence that is quite long."
result = _truncate_description(text, 40)
assert result == "First sentence. Second sentence."
def test_truncate_description_ellipsis_fallback():
"""When no sentence boundary, truncates with ellipsis."""
text = "A very long single sentence without periods that goes on and on"
result = _truncate_description(text, 30)
assert result.endswith("...")
assert len(result) <= 33 # 30 + "..."
def test_truncate_description_empty():
"""Empty string returns empty."""
assert _truncate_description("", 300) == ""
def test_truncate_description_zero_max():
"""Zero max_length produces ellipsis; the serializer skips calling this."""
text = "Some text"
# _truncate_description(text, 0) truncates to 0 chars and appends "...".
# The caller (_create_search_result_serializer) skips calling it when
# max_desc=0 so this edge case only matters for direct callers.
result = _truncate_description(text, 0)
assert result == "..."
# -- _create_search_result_serializer tests --
def _make_mock_tool(name, description, input_schema):
"""Helper to create a mock tool for serializer tests."""
mock_tool = MagicMock()
mock_mcp_tool = MagicMock()
mock_mcp_tool.model_dump.return_value = {
"name": name,
"description": description,
"inputSchema": input_schema,
}
mock_tool.to_mcp_tool.return_value = mock_mcp_tool
return mock_tool
def test_create_serializer_compacts_schemas():
"""Compact serializer strips $defs and replaces $ref."""
tool = _make_mock_tool(
"list_charts",
"List charts with filtering.",
{
"type": "object",
"properties": {
"filters": {
"type": "array",
"items": {"$ref": "#/$defs/ChartFilter"},
}
},
"$defs": {
"ChartFilter": {
"type": "object",
"properties": {"col": {"type": "string"}},
}
},
},
)
serializer = _create_search_result_serializer({"compact_schemas": True})
result = serializer([tool])
assert len(result) == 1
schema = result[0]["inputSchema"]
assert "$defs" not in schema
assert schema["properties"]["filters"]["items"] == {"type": "object"}
def test_create_serializer_truncates_descriptions():
"""Compact serializer truncates long tool descriptions."""
long_desc = "Short intro. " + "x" * 500
tool = _make_mock_tool(
"generate_chart",
long_desc,
{"type": "object"},
)
serializer = _create_search_result_serializer({"max_description_length": 50})
result = serializer([tool])
assert len(result[0]["description"]) <= 53 # 50 + potential "..."
def test_create_serializer_disabled():
"""When compact_schemas=False and max_description_length=0, no compaction."""
tool = _make_mock_tool(
"test_tool",
"A long description " * 20,
{
"type": "object",
"$defs": {"Model": {"type": "object"}},
},
)
serializer = _create_search_result_serializer(
{"compact_schemas": False, "max_description_length": 0}
)
result = serializer([tool])
# $defs should still be present (compaction disabled)
assert "$defs" in result[0]["inputSchema"]
# Description should not be truncated
assert result[0]["description"] == "A long description " * 20
def test_create_serializer_compact_false_disables_truncation():
"""compact_schemas=False also disables description truncation by default."""
long_desc = "A very long description. " * 30
tool = _make_mock_tool(
"test_tool",
long_desc,
{"type": "object", "$defs": {"Model": {"type": "object"}}},
)
serializer = _create_search_result_serializer({"compact_schemas": False})
result = serializer([tool])
# $defs should still be present (compaction disabled)
assert "$defs" in result[0]["inputSchema"]
# Description should NOT be truncated (max_desc defaults to 0 when compact=False)
assert result[0]["description"] == long_desc
def test_create_serializer_compact_false_explicit_truncation():
"""compact_schemas=False with explicit max_description_length still truncates."""
long_desc = "First sentence. " + "x" * 500
tool = _make_mock_tool(
"test_tool",
long_desc,
{"type": "object", "$defs": {"Model": {"type": "object"}}},
)
serializer = _create_search_result_serializer(
{"compact_schemas": False, "max_description_length": 200}
)
result = serializer([tool])
# $defs should still be present (compaction disabled)
assert "$defs" in result[0]["inputSchema"]
# Description SHOULD be truncated (explicitly requested)
assert len(result[0]["description"]) <= 203
def test_create_serializer_uses_config_defaults():
"""Empty config uses defaults (compact=True, max_desc=300)."""
long_desc = "First sentence. " + "x" * 500
tool = _make_mock_tool(
"test_tool",
long_desc,
{
"type": "object",
"$defs": {"Model": {"type": "object"}},
"properties": {"x": {"$ref": "#/$defs/Model"}},
},
)
serializer = _create_search_result_serializer({})
result = serializer([tool])
assert "$defs" not in result[0]["inputSchema"]
assert len(result[0]["description"]) <= 303
def test_apply_transform_uses_compact_serializer():
"""_apply_tool_search_transform wires _create_search_result_serializer."""
mock_mcp = MagicMock()
config = {
"strategy": "bm25",
"max_results": 5,
"always_visible": ["health_check"],
"search_tool_name": "search_tools",
"call_tool_name": "call_tool",
"compact_schemas": True,
"max_description_length": 200,
}
_apply_tool_search_transform(mock_mcp, config)
mock_mcp.add_transform.assert_called_once()
transform = mock_mcp.add_transform.call_args[0][0]
# The serializer should NOT be the plain _serialize_tools_without_output_schema
assert (
transform._search_result_serializer
is not _serialize_tools_without_output_schema
)