Skip to content

Commit 2a6780d

Browse files
committed
fix: forward MCP tool images to LLM context
MCP tools can return image content blocks, but these were only passed to the UI for display and never forwarded to the LLM in the follow-up turn. Since OpenAI's `role: "tool"` messages only accept text, inject a separate `role: "user"` message containing the image parts when the model supports multimodal input.
1 parent c2e6c6d commit 2a6780d

File tree

2 files changed

+38
-5
lines changed

2 files changed

+38
-5
lines changed

src/lib/server/textGeneration/mcp/runMcpFlow.ts

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -696,15 +696,23 @@ export async function* runMcpFlow({
696696
if (event.type === "update") {
697697
yield event.update;
698698
} else {
699-
messagesOpenAI = [
700-
...messagesOpenAI,
699+
const followupMessages: ChatCompletionMessageParam[] = [
701700
assistantToolMessage,
702701
...(event.summary.toolMessages ?? []),
703702
];
703+
// Inject tool-returned images as a user message so the LLM can see them
704+
const toolImageCount = event.summary.toolImages?.length ?? 0;
705+
if (mmEnabled && toolImageCount > 0) {
706+
followupMessages.push({
707+
role: "user",
708+
content: event.summary.toolImages,
709+
});
710+
}
711+
messagesOpenAI = [...messagesOpenAI, ...followupMessages];
704712
toolMsgCount = event.summary.toolMessages?.length ?? 0;
705713
toolRunCount = event.summary.toolRuns?.length ?? 0;
706714
logger.info(
707-
{ toolMsgCount, toolRunCount },
715+
{ toolMsgCount, toolRunCount, toolImageCount },
708716
"[mcp] tools executed; continuing loop for follow-up completion"
709717
);
710718
}

src/lib/server/textGeneration/mcp/toolInvocation.ts

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ export type ToolRun = {
2323
output: string;
2424
};
2525

26+
/**
 * An image returned by a tool, expressed as an `image_url` content part.
 * `toToolImagePart` builds these from tool result blocks, filling `url` with a
 * base64 `data:` URL assembled from the block's `mimeType` and `data`.
 */
export type ToolImagePart = {
27+
// Literal tag marking this part as an image reference.
type: "image_url";
28+
// `detail` admits only the literal "auto".
image_url: { url: string; detail: "auto" };
29+
};
30+
2631
export interface NormalizedToolCall {
2732
id: string;
2833
name: string;
@@ -47,6 +52,7 @@ export interface ExecuteToolCallsParams {
4752
/**
 * Summary of one round of tool execution. `executeToolCalls` yields it as the
 * `summary` of its final `{ type: "complete" }` event.
 */
export interface ToolCallExecutionResult {
4853
// `role: "tool"` messages, one per call that finished without an error,
// carrying the call id and the textual output.
toolMessages: ChatCompletionMessageParam[];
4954
// Name, cleaned parameters and output of each call that finished without an error.
toolRuns: ToolRun[];
55+
// Image parts extracted from the result blocks of calls that finished without an error.
toolImages: ToolImagePart[];
5056
// NOTE(review): not set by the `complete` event visible in this diff —
// confirm where (or whether) it is populated.
finalAnswer?: { text: string; interrupted: boolean };
5157
}
5258

@@ -64,6 +70,17 @@ const serverMap = (servers: McpServerConfig[]): Map<string, McpServerConfig> =>
6470
return map;
6571
};
6672

73+
/**
 * Converts one tool result content block into an `image_url` part whose URL is
 * a base64 `data:` URL.
 *
 * @param block - An untyped content block from a tool result.
 * @returns The image part, or `undefined` when `block` is not an object, is not
 *   tagged `type: "image"`, lacks string `data` / `mimeType`, has a blank
 *   payload, or declares a media type outside `image/*`.
 */
function toToolImagePart(block: unknown): ToolImagePart | undefined {
	if (!block || typeof block !== "object") return undefined;

	const { type, data, mimeType } = block as Record<string, unknown>;
	if (type !== "image" || typeof data !== "string" || typeof mimeType !== "string") {
		return undefined;
	}

	// A blank payload or a non-image media type would yield a URL such as
	// `data:;base64,` that cannot be decoded as an image; drop the block here
	// rather than hand a malformed part to the follow-up request.
	const mediaType = mimeType.trim();
	const isImageMediaType = /^image\/[a-z0-9][a-z0-9!#$&^_.+-]*$/i.test(mediaType);
	if (data.trim().length === 0 || !isImageMediaType) return undefined;

	return {
		type: "image_url",
		image_url: { url: `data:${mediaType};base64,${data}`, detail: "auto" },
	};
}
83+
6784
export async function* executeToolCalls({
6885
calls,
6986
mapping,
@@ -78,6 +95,7 @@ export async function* executeToolCalls({
7895
const effectiveTimeoutMs = toolTimeoutMs ?? getMcpToolTimeoutMs();
7996
const toolMessages: ChatCompletionMessageParam[] = [];
8097
const toolRuns: ToolRun[] = [];
98+
const toolImages: ToolImagePart[] = [];
8199
const serverLookup = serverMap(servers);
82100
// Pre-emit call + ETA updates and prepare tasks
83101
type TaskResult = {
@@ -335,7 +353,14 @@ export async function* executeToolCalls({
335353
const name = prepared[r.index].call.name;
336354
const id = prepared[r.index].call.id;
337355
if (!r.error) {
338-
const output = r.output ?? "";
356+
let output = r.output ?? "";
357+
// Extract any image content blocks returned by the MCP tool
358+
const imageParts = (r.blocks ?? []).map(toToolImagePart).filter(Boolean) as ToolImagePart[];
359+
toolImages.push(...imageParts);
360+
// If output is empty but images were returned, provide placeholder text
361+
if (output === "" && imageParts.length > 0) {
362+
output = "Tool returned image(s).";
363+
}
339364
toolRuns.push({ name, parameters: r.paramsClean, output });
340365
// The tool message carries only the textual output; images are collected in toolImages and forwarded separately
341366
toolMessages.push({ role: "tool", tool_call_id: id, content: output });
@@ -345,5 +370,5 @@ export async function* executeToolCalls({
345370
}
346371
}
347372

348-
yield { type: "complete", summary: { toolMessages, toolRuns } };
373+
yield { type: "complete", summary: { toolMessages, toolRuns, toolImages } };
349374
}

0 commit comments

Comments
 (0)