mirror of
https://github.com/apache/superset.git
synced 2026-06-06 08:09:14 +00:00
fix(mcp): wire up compact schema serialization for search_tools results (#39229)
This commit is contained in:
@@ -248,9 +248,19 @@ MCP_RESPONSE_SIZE_CONFIG: Dict[str, Any] = {
|
||||
# - "bm25": Natural language search using BM25 ranking (recommended)
|
||||
# - "regex": Pattern-based search using regular expressions
|
||||
#
|
||||
# Schema Compaction:
|
||||
# ------------------
|
||||
# When compact_schemas=True, search results strip $defs sections and replace
|
||||
# $ref pointers with {"type": "object"}, and truncate tool descriptions.
|
||||
# This reduces per-search token cost by ~40-60%. Full schemas remain
|
||||
# available when the tool is actually invoked via call_tool.
|
||||
#
|
||||
# Rollback:
|
||||
# ---------
|
||||
# Set enabled=False in superset_config.py for instant rollback.
|
||||
# - Set enabled=False to disable tool search entirely (full catalog exposed).
|
||||
# - Set compact_schemas=False to disable schema compaction only (full $defs
|
||||
# and descriptions in search results, tool search still active).
|
||||
# - Set max_description_length=0 to disable description truncation only.
|
||||
# =============================================================================
|
||||
MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = {
|
||||
"enabled": True, # Enabled by default — reduces initial context by ~70%
|
||||
@@ -262,6 +272,8 @@ MCP_TOOL_SEARCH_CONFIG: Dict[str, Any] = {
|
||||
],
|
||||
"search_tool_name": "search_tools", # Name of the search tool
|
||||
"call_tool_name": "call_tool", # Name of the call proxy tool
|
||||
"compact_schemas": True, # Strip $defs and simplify $ref in search results
|
||||
"max_description_length": 300, # Truncate tool descriptions (0 = no truncation)
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -175,6 +175,84 @@ def _strip_titles(obj: Any, in_properties_map: bool = False) -> Any:
|
||||
return obj
|
||||
|
||||
|
||||
def _simplify_optional_union(result: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Collapse ``anyOf``/``oneOf`` with exactly one non-null variant.
|
||||
|
||||
Pydantic encodes ``Optional[X]`` as ``{"anyOf": [<X>, {"type": "null"}]}``.
|
||||
This replaces the union with the non-null variant while preserving any
|
||||
``description`` or ``default`` from the parent node.
|
||||
"""
|
||||
for union_key in ("anyOf", "oneOf"):
|
||||
variants = result.get(union_key)
|
||||
if not isinstance(variants, list) or len(variants) != 2:
|
||||
continue
|
||||
non_null = [v for v in variants if v.get("type") != "null"]
|
||||
if len(non_null) != 1:
|
||||
continue
|
||||
simplified = dict(non_null[0])
|
||||
for keep in ("description", "default"):
|
||||
if keep in result and keep not in simplified:
|
||||
simplified[keep] = result[keep]
|
||||
result.pop(union_key)
|
||||
result.pop("description", None)
|
||||
result.pop("default", None)
|
||||
result.update(simplified)
|
||||
return result
|
||||
|
||||
|
||||
def _compact_schema(obj: Any) -> Any:
|
||||
"""Collapse ``$defs`` and ``$ref`` pointers in a JSON Schema.
|
||||
|
||||
Search results only need enough schema detail for the LLM to identify
|
||||
which tool to call and construct a basic invocation. Full schemas
|
||||
(with all nested model definitions) are still available when the tool
|
||||
is actually invoked via ``call_tool``.
|
||||
|
||||
Transformations applied:
|
||||
|
||||
* ``$defs`` sections are removed entirely.
|
||||
* ``{"$ref": "..."}`` is replaced with ``{"type": "object"}``.
|
||||
* ``anyOf``/``oneOf`` lists containing only a ``$ref`` and
|
||||
``{"type": "null"}`` (Pydantic's Optional encoding) are collapsed
|
||||
to the simplified non-null variant.
|
||||
"""
|
||||
if isinstance(obj, list):
|
||||
return [_compact_schema(item) for item in obj]
|
||||
if not isinstance(obj, dict):
|
||||
return obj
|
||||
|
||||
# Direct $ref → generic object type
|
||||
if "$ref" in obj:
|
||||
replacement: dict[str, Any] = {"type": "object"}
|
||||
if desc := obj.get("description"):
|
||||
replacement["description"] = desc
|
||||
return replacement
|
||||
|
||||
result: dict[str, Any] = {}
|
||||
for key, value in obj.items():
|
||||
if key == "$defs":
|
||||
continue
|
||||
result[key] = _compact_schema(value)
|
||||
|
||||
return _simplify_optional_union(result)
|
||||
|
||||
|
||||
def _truncate_description(text: str, max_length: int) -> str:
|
||||
"""Truncate a tool description for search results.
|
||||
|
||||
Cuts at the last sentence boundary before *max_length*, or at
|
||||
*max_length* with an ellipsis if no sentence boundary is found.
|
||||
"""
|
||||
if not text or len(text) <= max_length:
|
||||
return text
|
||||
# Try to cut at the last sentence boundary
|
||||
truncated = text[:max_length]
|
||||
last_period = truncated.rfind(". ")
|
||||
if last_period > max_length // 2:
|
||||
return truncated[: last_period + 1]
|
||||
return truncated.rstrip() + "..."
|
||||
|
||||
|
||||
def _serialize_tools_without_output_schema(
|
||||
tools: Sequence[Any],
|
||||
) -> list[dict[str, Any]]:
|
||||
@@ -194,6 +272,46 @@ def _serialize_tools_without_output_schema(
|
||||
return results
|
||||
|
||||
|
||||
def _create_search_result_serializer(
|
||||
config: dict[str, Any],
|
||||
) -> Any:
|
||||
"""Build a search-result serializer from the tool-search config.
|
||||
|
||||
When ``compact_schemas`` is enabled (default), the serializer applies
|
||||
additional compaction on top of the base serialization:
|
||||
|
||||
* ``$defs`` sections and ``$ref`` pointers are collapsed
|
||||
(see :func:`_compact_schema`).
|
||||
* Tool descriptions are truncated to ``max_description_length`` chars.
|
||||
|
||||
This reduces per-search-call token cost by ~40-60 % while keeping
|
||||
enough detail for the LLM to identify the right tool and construct
|
||||
a basic invocation.
|
||||
"""
|
||||
compact = config.get("compact_schemas", True)
|
||||
# Description truncation defaults to 300 when compact_schemas is on,
|
||||
# but is disabled when compact_schemas is off (unless explicitly set).
|
||||
max_desc_default = 300 if compact else 0
|
||||
max_desc = config.get("max_description_length", max_desc_default)
|
||||
|
||||
if not compact and not max_desc:
|
||||
return _serialize_tools_without_output_schema
|
||||
|
||||
def _serializer(tools: Sequence[Any]) -> list[dict[str, Any]]:
|
||||
results = _serialize_tools_without_output_schema(tools)
|
||||
for data in results:
|
||||
if compact:
|
||||
if input_schema := data.get("inputSchema"):
|
||||
data["inputSchema"] = _compact_schema(input_schema)
|
||||
if max_desc and "description" in data:
|
||||
data["description"] = _truncate_description(
|
||||
data["description"], max_desc
|
||||
)
|
||||
return results
|
||||
|
||||
return _serializer
|
||||
|
||||
|
||||
def _fix_call_tool_arguments(tool: Any) -> Any:
|
||||
"""Fix anyOf schema in call_tool ``arguments`` for MCP bridge compatibility.
|
||||
|
||||
@@ -270,7 +388,7 @@ def _apply_tool_search_transform(mcp_instance: Any, config: dict[str, Any]) -> N
|
||||
"always_visible": config.get("always_visible", []),
|
||||
"search_tool_name": config.get("search_tool_name", "search_tools"),
|
||||
"call_tool_name": config.get("call_tool_name", "call_tool"),
|
||||
"search_result_serializer": _serialize_tools_without_output_schema,
|
||||
"search_result_serializer": _create_search_result_serializer(config),
|
||||
}
|
||||
|
||||
def _make_normalizing_call_tool(transform: Any) -> Tool:
|
||||
|
||||
@@ -25,9 +25,12 @@ from fastmcp.server.transforms.search import BM25SearchTransform, RegexSearchTra
|
||||
from superset.mcp_service.mcp_config import MCP_TOOL_SEARCH_CONFIG
|
||||
from superset.mcp_service.server import (
|
||||
_apply_tool_search_transform,
|
||||
_compact_schema,
|
||||
_create_search_result_serializer,
|
||||
_fix_call_tool_arguments,
|
||||
_normalize_call_tool_arguments,
|
||||
_serialize_tools_without_output_schema,
|
||||
_truncate_description,
|
||||
)
|
||||
from superset.utils import json
|
||||
|
||||
@@ -300,3 +303,340 @@ def test_normalize_ignores_keys_not_in_schema():
|
||||
result = _normalize_call_tool_arguments(arguments, schema)
|
||||
|
||||
assert isinstance(result["unknown_key"], dict)
|
||||
|
||||
|
||||
# -- _compact_schema tests --
|
||||
|
||||
|
||||
def test_compact_schema_removes_defs():
|
||||
"""$defs section is stripped from the schema."""
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filters": {"items": {"$ref": "#/$defs/MyFilter"}, "type": "array"},
|
||||
},
|
||||
"$defs": {
|
||||
"MyFilter": {
|
||||
"type": "object",
|
||||
"properties": {"col": {"type": "string"}},
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert "$defs" not in result
|
||||
assert result["properties"]["filters"]["items"] == {"type": "object"}
|
||||
|
||||
|
||||
def test_compact_schema_replaces_ref_with_object():
|
||||
"""Direct $ref is replaced with {"type": "object"}."""
|
||||
schema = {"$ref": "#/$defs/SomeModel"}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert result == {"type": "object"}
|
||||
|
||||
|
||||
def test_compact_schema_preserves_ref_description():
|
||||
"""$ref replacement preserves sibling description if present."""
|
||||
schema = {"$ref": "#/$defs/SomeModel", "description": "A model"}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert result == {"type": "object", "description": "A model"}
|
||||
|
||||
|
||||
def test_compact_schema_simplifies_optional_ref():
|
||||
"""anyOf with $ref and null is collapsed to the non-null variant."""
|
||||
schema = {
|
||||
"anyOf": [
|
||||
{"$ref": "#/$defs/SomeModel"},
|
||||
{"type": "null"},
|
||||
],
|
||||
"description": "Optional model",
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert "anyOf" not in result
|
||||
assert result["type"] == "object"
|
||||
assert result["description"] == "Optional model"
|
||||
|
||||
|
||||
def test_compact_schema_simplifies_optional_primitive():
|
||||
"""anyOf with primitive type and null is collapsed."""
|
||||
schema = {
|
||||
"anyOf": [
|
||||
{"type": "integer"},
|
||||
{"type": "null"},
|
||||
],
|
||||
"default": None,
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert "anyOf" not in result
|
||||
assert result["type"] == "integer"
|
||||
assert result["default"] is None
|
||||
|
||||
|
||||
def test_compact_schema_preserves_multi_variant_anyof():
|
||||
"""anyOf with >2 variants or no null is left unchanged."""
|
||||
schema = {
|
||||
"anyOf": [
|
||||
{"type": "integer"},
|
||||
{"type": "string"},
|
||||
]
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert "anyOf" in result
|
||||
assert len(result["anyOf"]) == 2
|
||||
|
||||
|
||||
def test_compact_schema_nested_in_items():
|
||||
"""$ref nested inside items/properties is also replaced."""
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filters": {
|
||||
"type": "array",
|
||||
"items": {"$ref": "#/$defs/Filter"},
|
||||
},
|
||||
"name": {"type": "string"},
|
||||
},
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert result["properties"]["filters"]["items"] == {"type": "object"}
|
||||
assert result["properties"]["name"] == {"type": "string"}
|
||||
|
||||
|
||||
def test_compact_schema_passthrough_simple():
|
||||
"""Simple schema without $defs/$ref passes through unchanged."""
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"page": {"type": "integer", "default": 1},
|
||||
"search": {"type": "string"},
|
||||
},
|
||||
}
|
||||
|
||||
result = _compact_schema(schema)
|
||||
|
||||
assert result == schema
|
||||
|
||||
|
||||
def test_compact_schema_handles_non_dict():
|
||||
"""Non-dict inputs pass through unchanged."""
|
||||
assert _compact_schema("hello") == "hello"
|
||||
assert _compact_schema(42) == 42
|
||||
assert _compact_schema(None) is None
|
||||
assert _compact_schema([1, 2]) == [1, 2]
|
||||
|
||||
|
||||
# -- _truncate_description tests --
|
||||
|
||||
|
||||
def test_truncate_description_short_text():
|
||||
"""Short text is returned as-is."""
|
||||
assert _truncate_description("Hello world", 300) == "Hello world"
|
||||
|
||||
|
||||
def test_truncate_description_cuts_at_sentence():
|
||||
"""Long text is cut at the last sentence boundary."""
|
||||
text = "First sentence. Second sentence. Third sentence that is quite long."
|
||||
result = _truncate_description(text, 40)
|
||||
assert result == "First sentence. Second sentence."
|
||||
|
||||
|
||||
def test_truncate_description_ellipsis_fallback():
|
||||
"""When no sentence boundary, truncates with ellipsis."""
|
||||
text = "A very long single sentence without periods that goes on and on"
|
||||
result = _truncate_description(text, 30)
|
||||
assert result.endswith("...")
|
||||
assert len(result) <= 33 # 30 + "..."
|
||||
|
||||
|
||||
def test_truncate_description_empty():
|
||||
"""Empty string returns empty."""
|
||||
assert _truncate_description("", 300) == ""
|
||||
|
||||
|
||||
def test_truncate_description_zero_max():
|
||||
"""Zero max_length produces ellipsis; the serializer skips calling this."""
|
||||
text = "Some text"
|
||||
# _truncate_description(text, 0) truncates to 0 chars and appends "...".
|
||||
# The caller (_create_search_result_serializer) skips calling it when
|
||||
# max_desc=0 so this edge case only matters for direct callers.
|
||||
result = _truncate_description(text, 0)
|
||||
assert result == "..."
|
||||
|
||||
|
||||
# -- _create_search_result_serializer tests --
|
||||
|
||||
|
||||
def _make_mock_tool(name, description, input_schema):
|
||||
"""Helper to create a mock tool for serializer tests."""
|
||||
mock_tool = MagicMock()
|
||||
mock_mcp_tool = MagicMock()
|
||||
mock_mcp_tool.model_dump.return_value = {
|
||||
"name": name,
|
||||
"description": description,
|
||||
"inputSchema": input_schema,
|
||||
}
|
||||
mock_tool.to_mcp_tool.return_value = mock_mcp_tool
|
||||
return mock_tool
|
||||
|
||||
|
||||
def test_create_serializer_compacts_schemas():
|
||||
"""Compact serializer strips $defs and replaces $ref."""
|
||||
tool = _make_mock_tool(
|
||||
"list_charts",
|
||||
"List charts with filtering.",
|
||||
{
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"filters": {
|
||||
"type": "array",
|
||||
"items": {"$ref": "#/$defs/ChartFilter"},
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
"ChartFilter": {
|
||||
"type": "object",
|
||||
"properties": {"col": {"type": "string"}},
|
||||
}
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer({"compact_schemas": True})
|
||||
result = serializer([tool])
|
||||
|
||||
assert len(result) == 1
|
||||
schema = result[0]["inputSchema"]
|
||||
assert "$defs" not in schema
|
||||
assert schema["properties"]["filters"]["items"] == {"type": "object"}
|
||||
|
||||
|
||||
def test_create_serializer_truncates_descriptions():
|
||||
"""Compact serializer truncates long tool descriptions."""
|
||||
long_desc = "Short intro. " + "x" * 500
|
||||
tool = _make_mock_tool(
|
||||
"generate_chart",
|
||||
long_desc,
|
||||
{"type": "object"},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer({"max_description_length": 50})
|
||||
result = serializer([tool])
|
||||
|
||||
assert len(result[0]["description"]) <= 53 # 50 + potential "..."
|
||||
|
||||
|
||||
def test_create_serializer_disabled():
|
||||
"""When compact_schemas=False and max_description_length=0, no compaction."""
|
||||
tool = _make_mock_tool(
|
||||
"test_tool",
|
||||
"A long description " * 20,
|
||||
{
|
||||
"type": "object",
|
||||
"$defs": {"Model": {"type": "object"}},
|
||||
},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer(
|
||||
{"compact_schemas": False, "max_description_length": 0}
|
||||
)
|
||||
result = serializer([tool])
|
||||
|
||||
# $defs should still be present (compaction disabled)
|
||||
assert "$defs" in result[0]["inputSchema"]
|
||||
# Description should not be truncated
|
||||
assert result[0]["description"] == "A long description " * 20
|
||||
|
||||
|
||||
def test_create_serializer_compact_false_disables_truncation():
|
||||
"""compact_schemas=False also disables description truncation by default."""
|
||||
long_desc = "A very long description. " * 30
|
||||
tool = _make_mock_tool(
|
||||
"test_tool",
|
||||
long_desc,
|
||||
{"type": "object", "$defs": {"Model": {"type": "object"}}},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer({"compact_schemas": False})
|
||||
result = serializer([tool])
|
||||
|
||||
# $defs should still be present (compaction disabled)
|
||||
assert "$defs" in result[0]["inputSchema"]
|
||||
# Description should NOT be truncated (max_desc defaults to 0 when compact=False)
|
||||
assert result[0]["description"] == long_desc
|
||||
|
||||
|
||||
def test_create_serializer_compact_false_explicit_truncation():
|
||||
"""compact_schemas=False with explicit max_description_length still truncates."""
|
||||
long_desc = "First sentence. " + "x" * 500
|
||||
tool = _make_mock_tool(
|
||||
"test_tool",
|
||||
long_desc,
|
||||
{"type": "object", "$defs": {"Model": {"type": "object"}}},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer(
|
||||
{"compact_schemas": False, "max_description_length": 200}
|
||||
)
|
||||
result = serializer([tool])
|
||||
|
||||
# $defs should still be present (compaction disabled)
|
||||
assert "$defs" in result[0]["inputSchema"]
|
||||
# Description SHOULD be truncated (explicitly requested)
|
||||
assert len(result[0]["description"]) <= 203
|
||||
|
||||
|
||||
def test_create_serializer_uses_config_defaults():
|
||||
"""Empty config uses defaults (compact=True, max_desc=300)."""
|
||||
long_desc = "First sentence. " + "x" * 500
|
||||
tool = _make_mock_tool(
|
||||
"test_tool",
|
||||
long_desc,
|
||||
{
|
||||
"type": "object",
|
||||
"$defs": {"Model": {"type": "object"}},
|
||||
"properties": {"x": {"$ref": "#/$defs/Model"}},
|
||||
},
|
||||
)
|
||||
|
||||
serializer = _create_search_result_serializer({})
|
||||
result = serializer([tool])
|
||||
|
||||
assert "$defs" not in result[0]["inputSchema"]
|
||||
assert len(result[0]["description"]) <= 303
|
||||
|
||||
|
||||
def test_apply_transform_uses_compact_serializer():
|
||||
"""_apply_tool_search_transform wires _create_search_result_serializer."""
|
||||
mock_mcp = MagicMock()
|
||||
config = {
|
||||
"strategy": "bm25",
|
||||
"max_results": 5,
|
||||
"always_visible": ["health_check"],
|
||||
"search_tool_name": "search_tools",
|
||||
"call_tool_name": "call_tool",
|
||||
"compact_schemas": True,
|
||||
"max_description_length": 200,
|
||||
}
|
||||
|
||||
_apply_tool_search_transform(mock_mcp, config)
|
||||
|
||||
mock_mcp.add_transform.assert_called_once()
|
||||
transform = mock_mcp.add_transform.call_args[0][0]
|
||||
# The serializer should NOT be the plain _serialize_tools_without_output_schema
|
||||
assert (
|
||||
transform._search_result_serializer
|
||||
is not _serialize_tools_without_output_schema
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user