# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
MCP tool: get_chart_data
"""

import logging
from typing import Any, Dict, List, TYPE_CHECKING

from fastmcp import Context
from superset_core.mcp import tool

if TYPE_CHECKING:
    from superset.models.slice import Slice

from superset.mcp_service.chart.schemas import (
    ChartData,
    ChartError,
    DataColumn,
    GetChartDataRequest,
    PerformanceMetadata,
)
from superset.mcp_service.utils.cache_utils import get_cache_status_from_result

logger = logging.getLogger(__name__)


@tool(tags=["data"])
async def get_chart_data(  # noqa: C901
    request: GetChartDataRequest, ctx: Context
) -> ChartData | ChartError:
    """Get chart data by ID or UUID.

    Returns the actual data behind a chart for LLM analysis without image rendering.

    Supports:
    - Numeric ID or UUID lookup
    - Multiple formats: json, csv, excel
    - Cache control: use_cache, force_refresh, cache_timeout

    Returns underlying data in requested format with cache status.
    """
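    # Illustrative call (a sketch; the exact request fields are defined by
    # GetChartDataRequest in superset.mcp_service.chart.schemas):
    #     request = GetChartDataRequest(identifier=42, format="csv", limit=500)
    #     result = await get_chart_data(request, ctx)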
    await ctx.info(
        "Starting chart data retrieval: identifier=%s, format=%s, limit=%s"
        % (
            request.identifier,
            request.format,
            request.limit,
        )
    )
    await ctx.debug(
        "Cache settings: use_cache=%s, force_refresh=%s, cache_timeout=%s"
        % (
            request.use_cache,
            request.force_refresh,
            request.cache_timeout,
        )
    )

    try:
        await ctx.report_progress(1, 4, "Looking up chart")
        from superset.daos.chart import ChartDAO
        from superset.utils import json as utils_json

        # Find the chart
        chart = None
        if isinstance(request.identifier, int) or (
            isinstance(request.identifier, str) and request.identifier.isdigit()
        ):
            chart_id = (
                int(request.identifier)
                if isinstance(request.identifier, str)
                else request.identifier
            )
            await ctx.debug(
                "Performing ID-based chart lookup: chart_id=%s" % (chart_id,)
            )
            chart = ChartDAO.find_by_id(chart_id)
        else:
            await ctx.debug(
                "Performing UUID-based chart lookup: uuid=%s" % (request.identifier,)
            )
            # Try UUID lookup using DAO flexible method
            chart = ChartDAO.find_by_id(request.identifier, id_column="uuid")

        if not chart:
            await ctx.error("Chart not found: identifier=%s" % (request.identifier,))
            return ChartError(
                error=f"No chart found with identifier: {request.identifier}",
                error_type="NotFound",
            )

        await ctx.info(
            "Chart found successfully: chart_id=%s, chart_name=%s, viz_type=%s"
            % (
                chart.id,
                chart.slice_name,
                chart.viz_type,
            )
        )
logger.info("Getting data for chart %s: %s", chart.id, chart.slice_name)
|
|
|
|
import time
|
|
|
|
start_time = time.time()
|
|
|
|
try:
|
|
await ctx.report_progress(2, 4, "Preparing data query")
|
|
# Get chart data using the existing API
|
|
from superset.commands.chart.data.get_data_command import ChartDataCommand
|
|
from superset.common.query_context_factory import QueryContextFactory
|
|
|
|
# Parse the form_data to get query context
|
|
form_data = utils_json.loads(chart.params) if chart.params else {}
|
|
await ctx.debug(
|
|
"Chart form data parsed: has_filters=%s, has_groupby=%s, has_metrics=%s"
|
|
% (
|
|
bool(form_data.get("filters")),
|
|
bool(form_data.get("groupby")),
|
|
bool(form_data.get("metrics")),
|
|
)
|
|
)
|
|
|
|
# Create a proper QueryContext using the factory with cache control
|
|
factory = QueryContextFactory()
|
|
query_context = factory.create(
|
|
datasource={"id": chart.datasource_id, "type": chart.datasource_type},
|
|
queries=[
|
|
{
|
|
"filters": form_data.get("filters", []),
|
|
"columns": form_data.get("groupby", []),
|
|
"metrics": form_data.get("metrics", []),
|
|
"row_limit": request.limit or 100,
|
|
"order_desc": True,
|
|
# Apply cache control from request
|
|
"cache_timeout": request.cache_timeout,
|
|
}
|
|
],
|
|
form_data=form_data,
|
|
# Use cache unless force_refresh is True
|
|
force=request.force_refresh,
|
|
)
|
|
|
|
await ctx.report_progress(3, 4, "Executing data query")
|
|
await ctx.debug(
|
|
"Query execution parameters: datasource_id=%s, datasource_type=%s, "
|
|
"row_limit=%s, force_refresh=%s"
|
|
% (
|
|
chart.datasource_id,
|
|
chart.datasource_type,
|
|
request.limit or 100,
|
|
request.force_refresh,
|
|
)
|
|
)
|
|
|
|
# Execute the query
|
|
command = ChartDataCommand(query_context)
|
|
            result = command.run()

            # Handle empty query results for certain chart types
            if not result or ("queries" not in result) or len(result["queries"]) == 0:
                await ctx.warning(
                    "Empty query results: chart_id=%s, chart_type=%s"
                    % (chart.id, chart.viz_type)
                )
                return ChartError(
                    error=f"No query results returned for chart {chart.id}. "
                    "This may occur with chart types like big_number.",
                    error_type="EmptyQuery",
                )

            # Extract data from result (we've already validated it exists above)
            query_result = result["queries"][0]
            data = query_result.get("data", [])
            raw_columns = query_result.get("colnames", [])

            await ctx.debug(
                "Query results received: row_count=%s, column_count=%s, "
                "has_cache_key=%s"
                % (
                    len(data),
                    len(raw_columns),
                    bool(query_result.get("cache_key")),
                )
            )

            # Check if we have data to work with
            if not data:
                await ctx.warning("No data in query results: chart_id=%s" % (chart.id,))
                return ChartError(
                    error=f"No data available for chart {chart.id}", error_type="NoData"
                )

            # Create rich column metadata
            columns = []
            for col_name in raw_columns:
                # Sample some values for metadata
                sample_values = [
                    row.get(col_name)
                    for row in data[:3]
                    if row.get(col_name) is not None
                ]

                # Infer data type (check bool before numeric: bool is a
                # subclass of int in Python, so the numeric check would
                # otherwise mislabel boolean columns as numeric)
                data_type = "string"
                if sample_values:
                    if all(isinstance(v, bool) for v in sample_values):
                        data_type = "boolean"
                    elif all(isinstance(v, (int, float)) for v in sample_values):
                        data_type = "numeric"

                columns.append(
                    DataColumn(
                        name=col_name,
                        display_name=col_name.replace("_", " ").title(),
                        data_type=data_type,
                        sample_values=sample_values[:3],
                        null_count=sum(1 for row in data if row.get(col_name) is None),
                        unique_count=len({str(row.get(col_name)) for row in data}),
                    )
                )

            # Cache status information using utility function
            cache_status = get_cache_status_from_result(
                query_result, force_refresh=request.force_refresh
            )

            # Generate insights and recommendations
            insights = []
            if len(data) > 100:
                insights.append(
                    "Large dataset - consider filtering for better performance"
                )
            if len(raw_columns) > 10:
                insights.append("Many columns available - focus on key metrics")

            # Add cache-specific insights
            if cache_status.cache_hit:
                if (
                    cache_status.cache_age_seconds
                    and cache_status.cache_age_seconds > 3600
                ):
                    hours_old = cache_status.cache_age_seconds // 3600
                    insights.append(
                        f"Data is from cache ({hours_old}h old) - "
                        "consider refreshing for latest data"
                    )
                else:
                    insights.append("Data served from cache for fast response")
            else:
                insights.append("Fresh data retrieved from database")

            recommended_visualizations = []
            if any(
                "time" in col.lower() or "date" in col.lower() for col in raw_columns
            ):
                recommended_visualizations.extend(["line chart", "time series"])
            if len(raw_columns) <= 3:
                recommended_visualizations.extend(["bar chart", "scatter plot"])

            # Performance metadata with cache awareness
            execution_time = int((time.time() - start_time) * 1000)
            performance_status = (
                "cache_hit" if cache_status.cache_hit else "fresh_query"
            )
            optimization_suggestions = []

            if not cache_status.cache_hit and execution_time > 5000:
                optimization_suggestions.append(
                    "Consider using cache for this slow query"
                )
            elif (
                cache_status.cache_hit
                and cache_status.cache_age_seconds
                and cache_status.cache_age_seconds > 86400
            ):
                optimization_suggestions.append("Cache is old - consider refreshing")

            performance = PerformanceMetadata(
                query_duration_ms=execution_time,
                cache_status=performance_status,
                optimization_suggestions=optimization_suggestions,
            )

            # Generate comprehensive summary with cache info
            cache_info = ""
            if cache_status.cache_hit:
                age_info = (
                    f" (cached {cache_status.cache_age_seconds // 60}m ago)"
                    if cache_status.cache_age_seconds
                    else " (cached)"
                )
                cache_info = age_info

            summary_parts = [
                f"Chart '{chart.slice_name}' ({chart.viz_type})",
                f"Contains {len(data)} rows across {len(raw_columns)} columns"
                f"{cache_info}",
            ]

            if data:
                summary_parts.append(
                    f"Sample data includes: {', '.join(raw_columns[:3])}"
                )

            summary = ". ".join(summary_parts)

            # Handle different export formats
            if request.format == "csv":
                return _export_data_as_csv(
                    chart,
                    data[: request.limit] if request.limit else data,
                    raw_columns,
                    cache_status,
                    performance,
                )
            elif request.format == "excel":
                return _export_data_as_excel(
                    chart,
                    data[: request.limit] if request.limit else data,
                    raw_columns,
                    cache_status,
                    performance,
                )

            await ctx.report_progress(4, 4, "Building response")

            # Calculate data quality metrics
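            # Worked example (illustrative): 100 rows x 4 columns with 10 null
            # cells overall gives completeness = 1.0 - 10 / 400 = 0.975.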
            data_completeness = 1.0 - (
                sum(col.null_count for col in columns)
                / max(len(data) * len(columns), 1)
            )

            await ctx.info(
                "Chart data retrieval completed successfully: chart_id=%s, "
                "rows_returned=%s, columns_returned=%s, execution_time_ms=%s, "
                "cache_hit=%s, data_completeness=%s"
                % (
                    chart.id,
                    len(data),
                    len(raw_columns),
                    execution_time,
                    cache_status.cache_hit,
                    round(data_completeness, 3),
                )
            )

            # Default JSON format
            return ChartData(
                chart_id=chart.id,
                chart_name=chart.slice_name or f"Chart {chart.id}",
                chart_type=chart.viz_type or "unknown",
                columns=columns,
                data=data[: request.limit] if request.limit else data,
                row_count=len(data),
                total_rows=query_result.get("rowcount"),
                summary=summary,
                insights=insights,
                data_quality={"completeness": data_completeness},
                recommended_visualizations=recommended_visualizations,
                data_freshness=None,  # Not populated by this tool
                performance=performance,
                cache_status=cache_status,
            )

        except Exception as data_error:
            await ctx.error(
                "Data retrieval failed: chart_id=%s, error=%s, error_type=%s"
                % (
                    chart.id,
                    str(data_error),
                    type(data_error).__name__,
                )
            )
            logger.error("Data retrieval error for chart %s: %s", chart.id, data_error)
            return ChartError(
                error=f"Error retrieving chart data: {str(data_error)}",
                error_type="DataError",
            )

    except Exception as e:
        await ctx.error(
            "Chart data retrieval failed: identifier=%s, error=%s, error_type=%s"
            % (
                request.identifier,
                str(e),
                type(e).__name__,
            )
        )
        logger.error("Error in get_chart_data: %s", e)
        return ChartError(
            error=f"Failed to get chart data: {str(e)}", error_type="InternalError"
        )

def _export_data_as_csv(
    chart: "Slice",
    data: List[Dict[str, Any]],
    columns: List[str],
    cache_status: Any,
    performance: Any,
) -> "ChartData":
    """Export chart data as CSV format."""
    import csv
    import io

    # Create CSV content
    output = io.StringIO()

    if data and columns:
        writer = csv.DictWriter(output, fieldnames=columns)
        writer.writeheader()

        # Write data rows
        for row in data:
            # Ensure all values are properly formatted for CSV
            csv_row = {}
            for col in columns:
                value = row.get(col, "")
                # Handle None values and convert lists/dicts to strings
                if value is None:
                    csv_row[col] = ""
                elif isinstance(value, (list, dict)):
                    csv_row[col] = str(value)
                else:
                    csv_row[col] = value
            writer.writerow(csv_row)

    csv_content = output.getvalue()

    # Return as ChartData with the CSV content in a dedicated field
    from superset.mcp_service.chart.schemas import ChartData

    return ChartData(
        chart_id=chart.id,
        chart_name=chart.slice_name or f"Chart {chart.id}",
        chart_type=chart.viz_type or "unknown",
        columns=[],  # Not needed for CSV export
        data=[],  # CSV content is in csv_data field
        row_count=len(data),
        total_rows=len(data),
        summary=f"CSV export of chart '{chart.slice_name}' with {len(data)} rows",
        insights=[f"Data exported as CSV format ({len(csv_content)} characters)"],
        data_quality={},
        recommended_visualizations=[],
        data_freshness=None,
        performance=performance,
        cache_status=cache_status,
        # Store the CSV content in csv_data for the response
        csv_data=csv_content,
        format="csv",
    )


def _export_data_as_excel(
    chart: "Slice",
    data: List[Dict[str, Any]],
    columns: List[str],
    cache_status: Any,
    performance: Any,
) -> "ChartData | ChartError":
    """Export chart data as Excel format."""
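    # Prefer openpyxl; if it is not installed, fall back to xlsxwriter via
    # _try_xlsxwriter_fallback below.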
    try:
        excel_b64 = _create_excel_with_openpyxl(chart, data, columns)
        return _create_excel_chart_data(
            chart, data, excel_b64, performance, cache_status
        )
    except ImportError:
        return _try_xlsxwriter_fallback(chart, data, columns, cache_status, performance)


def _create_excel_with_openpyxl(
    chart: "Slice", data: List[Dict[str, Any]], columns: List[str]
) -> str:
    """Create Excel file using openpyxl."""
    import base64
    import io

    from openpyxl import Workbook

    wb = Workbook()
    ws = wb.active
    ws.title = chart.slice_name[:31] if chart.slice_name else "Chart Data"

    if data and columns:
        _write_excel_headers(ws, columns)
        _write_excel_data(ws, data, columns)

    output = io.BytesIO()
    wb.save(output)
    output.seek(0)
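    # The workbook bytes are base64-encoded so they can be carried in a JSON
    # response; a client can recover the file with base64.b64decode().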
    return base64.b64encode(output.read()).decode()


def _write_excel_headers(ws: Any, columns: List[str]) -> None:
    """Write headers to Excel worksheet."""
    for idx, col in enumerate(columns, 1):
        ws.cell(row=1, column=idx, value=col)


def _write_excel_data(ws: Any, data: List[Dict[str, Any]], columns: List[str]) -> None:
    """Write data to Excel worksheet."""
    for row_idx, row in enumerate(data, 2):
        for col_idx, col in enumerate(columns, 1):
            value = row.get(col, "")
            if value is None:
                value = ""
            elif isinstance(value, (list, dict)):
                value = str(value)
            ws.cell(row=row_idx, column=col_idx, value=value)


def _try_xlsxwriter_fallback(
    chart: "Slice",
    data: List[Dict[str, Any]],
    columns: List[str],
    cache_status: Any,
    performance: Any,
) -> "ChartData | ChartError":
    """Try xlsxwriter as fallback for Excel export."""
    try:
        excel_b64 = _create_excel_with_xlsxwriter(chart, data, columns)
        return _create_excel_chart_data_xlsxwriter(
            chart, data, excel_b64, performance, cache_status
        )
    except ImportError:
        from superset.mcp_service.chart.schemas import ChartError

        return ChartError(
            error="Excel export requires openpyxl or xlsxwriter package",
            error_type="ExportError",
        )


def _create_excel_with_xlsxwriter(
    chart: "Slice", data: List[Dict[str, Any]], columns: List[str]
) -> str:
    """Create Excel file using xlsxwriter."""
    import base64
    import io

    import xlsxwriter

    output = io.BytesIO()
    workbook = xlsxwriter.Workbook(output, {"in_memory": True})
    sheet_name = chart.slice_name[:31] if chart.slice_name else "Chart Data"
    worksheet = workbook.add_worksheet(sheet_name)

    if data and columns:
        _write_xlsxwriter_data(worksheet, data, columns)

    workbook.close()
    output.seek(0)
    return base64.b64encode(output.read()).decode()


def _write_xlsxwriter_data(
    worksheet: Any, data: List[Dict[str, Any]], columns: List[str]
) -> None:
    """Write data to xlsxwriter worksheet."""
    # Write headers
    for idx, col in enumerate(columns):
        worksheet.write(0, idx, col)

    # Write data
    for row_idx, row in enumerate(data):
        for col_idx, col in enumerate(columns):
            value = row.get(col, "")
            if value is None:
                value = ""
            elif isinstance(value, (list, dict)):
                value = str(value)
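            # Data rows start at spreadsheet row 1; row 0 holds the headers
            # written above, hence the +1 offset.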
            worksheet.write(row_idx + 1, col_idx, value)


def _create_excel_chart_data(
    chart: "Slice",
    data: List[Dict[str, Any]],
    excel_b64: str,
    performance: Any,
    cache_status: Any,
) -> "ChartData":
    """Create ChartData response for Excel export (openpyxl)."""
    from superset.mcp_service.chart.schemas import ChartData

    chart_name = chart.slice_name or f"Chart {chart.id}"
    summary = f"Excel export of chart '{chart.slice_name}' with {len(data)} rows"

    return ChartData(
        chart_id=chart.id,
        chart_name=chart_name,
        chart_type=chart.viz_type or "unknown",
        columns=[],
        data=[],
        row_count=len(data),
        total_rows=len(data),
        summary=summary,
        insights=["Data exported as Excel format (base64 encoded)"],
        data_quality={},
        recommended_visualizations=[],
        data_freshness=None,
        performance=performance,
        cache_status=cache_status,
        excel_data=excel_b64,
        format="excel",
    )


def _create_excel_chart_data_xlsxwriter(
    chart: "Slice",
    data: List[Dict[str, Any]],
    excel_b64: str,
    performance: Any,
    cache_status: Any,
) -> "ChartData":
    """Create ChartData response for Excel export (xlsxwriter)."""
    from superset.mcp_service.chart.schemas import ChartData

    chart_name = chart.slice_name or f"Chart {chart.id}"
    summary = f"Excel export of chart '{chart.slice_name}' with {len(data)} rows"

    return ChartData(
        chart_id=chart.id,
        chart_name=chart_name,
        chart_type=chart.viz_type or "unknown",
        columns=[],
        data=[],
        row_count=len(data),
        total_rows=len(data),
        summary=summary,
        insights=["Data exported as Excel format (base64 encoded, xlsxwriter)"],
        data_quality={},
        recommended_visualizations=[],
        data_freshness=None,
        performance=performance,
        cache_status=cache_status,
        excel_data=excel_b64,
        format="excel",
    )