theopenco · rcogal · Apr 8, 2026 · Apr 8, 2026 · Apr 10, 2026 · coderabbitai
diff --git a/apps/gateway/src/anthropic/anthropic.ts b/apps/gateway/src/anthropic/anthropic.ts
@@ -143,6 +143,12 @@ const anthropicResponseSchema = z.object({
 	usage: z.object({
 		input_tokens: z.number(),
 		output_tokens: z.number(),
+		// Anthropic emits these on caching-supported models, but we keep them
+		// optional with a 0 default so the schema doesn't fail validation if an
+		// older Claude model, a beta endpoint, or a future API change ever omits
+		// them. The downstream conversion code already handles 0 correctly.
+		cache_creation_input_tokens: z.number().optional().default(0),
+		cache_read_input_tokens: z.number().optional().default(0),
 	}),
 });
 
@@ -207,21 +213,40 @@ anthropic.openapi(messages, async (c) => {
 	// Transform Anthropic request to OpenAI format
 	const openaiMessages: Array<Record<string, unknown>> = [];
 
-	// Add system message if provided
+	// Add system message if provided.
+	// When the caller supplies cache_control on any text block, preserve the
+	// per-block array form so the inner /v1/chat/completions path can forward
+	// cache_control markers verbatim to Anthropic. Otherwise, join with " " to
+	// preserve the legacy behavior (and matching token counts) for callers
+	// that pass array-form system without caching opt-in.
 	if (anthropicRequest.system) {
-		let systemContent: string;
 		if (typeof anthropicRequest.system === "string") {
-			systemContent = anthropicRequest.system;
+			openaiMessages.push({
+				role: "system",
+				content: anthropicRequest.system,
+			});
 		} else {
-			// Handle array format - concatenate all text blocks
-			systemContent = anthropicRequest.system
-				.map((block) => block.text)
-				.join(" ");
+			const hasAnyCacheControl = anthropicRequest.system.some(
+				(block) => block.cache_control,
+			);
+			if (hasAnyCacheControl) {
+				openaiMessages.push({
+					role: "system",
+					content: anthropicRequest.system.map((block) => ({
+						type: "text",
+						text: block.text,
+						...(block.cache_control && {
+							cache_control: block.cache_control,
+						}),
+					})),
+				});
+			} else {
+				openaiMessages.push({
+					role: "system",
+					content: anthropicRequest.system.map((block) => block.text).join(" "),
+				});
+			}
 		}
-		openaiMessages.push({
-			role: "system",
-			content: systemContent,
-		});
 	}
 
 	// Transform messages using the approach from claude-code-proxy
@@ -372,9 +397,13 @@ anthropic.openapi(messages, async (c) => {
 			const hasOnlyText = message.content.every(
 				(block) => block.type === "text",
 			);
+			const hasAnyCacheControl = message.content.some(
+				(block) => block.type === "text" && block.cache_control,
+			);
 
-			if (hasOnlyText) {
-				// For text-only content, flatten to a simple string to avoid content type issues
+			if (hasOnlyText && !hasAnyCacheControl) {
+				// For text-only content with no cache markers, flatten to a simple
+				// string to avoid content type issues.
 				const textContent = message.content
 					.filter((block) => block.type === "text")
 					.map((block) => block.text)
@@ -385,10 +414,18 @@ anthropic.openapi(messages, async (c) => {
 					content: textContent,
 				});
 			} else {
-				// For true multi-modal content, transform blocks
+				// For multi-modal content, or text content with cache_control markers,
+				// transform blocks while preserving cache_control so the inner
+				// completions path can forward it to Anthropic.
 				const content = message.content.map((block) => {
 					if (block.type === "text" && block.text) {
-						return { type: "text", text: block.text };
+						return {
+							type: "text",
+							text: block.text,
+							...(block.cache_control && {
+								cache_control: block.cache_control,
+							}),
+						};
 					}
 					if (block.type === "image" && block.source) {
 						return {
@@ -498,7 +535,17 @@ anthropic.openapi(messages, async (c) => {
 					name?: string;
 					input?: string;
 				}> = [];
-				let usage = { input_tokens: 0, output_tokens: 0 };
+				let usage: {
+					input_tokens: number;
+					output_tokens: number;
+					cache_creation_input_tokens: number;
+					cache_read_input_tokens: number;
+				} = {
+					input_tokens: 0,
+					output_tokens: 0,
+					cache_creation_input_tokens: 0,
+					cache_read_input_tokens: 0,
+				};
 				let currentTextBlockIndex: number | null = null;
 				const toolCallBlockIndex = new Map<number, number>();
 
@@ -694,9 +741,23 @@ anthropic.openapi(messages, async (c) => {
 
 									// Update usage if available
 									if (chunk.usage) {
+										const promptDetails =
+											chunk.usage.prompt_tokens_details ?? {};
+										const cacheRead: number = promptDetails.cached_tokens ?? 0;
+										const cacheCreation: number =
+											promptDetails.cache_creation_tokens ?? 0;
+										const totalPrompt: number = chunk.usage.prompt_tokens ?? 0;
+										const nonCachedInput = Math.max(
+											0,
+											totalPrompt - cacheRead - cacheCreation,
+										);
 										usage = {
-											input_tokens: chunk.usage.prompt_tokens ?? 0,
+											input_tokens: nonCachedInput,
 											output_tokens: chunk.usage.completion_tokens ?? 0,
+											// Match Anthropic's API and always emit both fields
+											// (set to 0 when inapplicable).
+											cache_creation_input_tokens: cacheCreation,
+											cache_read_input_tokens: cacheRead,
 										};
 									}
 
@@ -787,6 +848,15 @@ anthropic.openapi(messages, async (c) => {
 		}
 	}
 
+	const usageDetails = openaiResponse.usage?.prompt_tokens_details ?? {};
+	const cachedTokens: number = usageDetails.cached_tokens ?? 0;
+	const cacheCreationTokens: number = usageDetails.cache_creation_tokens ?? 0;
+	const totalPromptTokens: number = openaiResponse.usage?.prompt_tokens ?? 0;
+	const nonCachedInputTokens = Math.max(
+		0,
+		totalPromptTokens - cachedTokens - cacheCreationTokens,
+	);
+
 	const anthropicResponse = {
 		id: openaiResponse.id,
 		type: "message" as const,
@@ -798,8 +868,13 @@ anthropic.openapi(messages, async (c) => {
 		),
 		stop_sequence: null,
 		usage: {
-			input_tokens: openaiResponse.usage?.prompt_tokens ?? 0,
+			input_tokens: nonCachedInputTokens,
 			output_tokens: openaiResponse.usage?.completion_tokens ?? 0,
+			// Match Anthropic's actual API: always emit both fields (set to 0
+			// when inapplicable) so SDK clients with strict typing can read them
+			// without optionality checks.
+			cache_creation_input_tokens: cacheCreationTokens,
+			cache_read_input_tokens: cachedTokens,
 		},
 	};
 

diff --git a/apps/gateway/src/chat-prompt-caching.e2e.ts b/apps/gateway/src/chat-prompt-caching.e2e.ts
@@ -167,32 +167,59 @@ describe("e2e prompt caching", getConcurrentTestOptions(), () => {
 					});
 				}
 
-				// Second request - should read from cache
-				const secondRequestId = generateTestRequestId();
-				const secondRes = await app.request("/v1/chat/completions", {
-					method: "POST",
-					headers: {
-						"Content-Type": "application/json",
-						"x-request-id": secondRequestId,
-						Authorization: `Bearer real-token`,
-					},
-					body: JSON.stringify({
-						model: model,
-						messages: [
-							{
-								role: "system",
-								content: longSystemPrompt,
-							},
-							{
-								role: "user",
-								content:
-									"Just reply with 'OK' to confirm you received the context.",
-							},
-						],
-					}),
-				});
-
-				const secondJson = await secondRes.json();
+				// Second request - should read from cache.
+				// Anthropic prompt cache writes are eventually consistent, so a
+				// back-to-back request can occasionally miss. Retry with linear
+				// backoff until a cache read, a non-200 response, or the last attempt.
+				const sendCacheRequest = async () => {
+					const secondRequestId = generateTestRequestId();
+					const res = await app.request("/v1/chat/completions", {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							"x-request-id": secondRequestId,
+							Authorization: `Bearer real-token`,
+						},
+						body: JSON.stringify({
+							model: model,
+							messages: [
+								{
+									role: "system",
+									content: longSystemPrompt,
+								},
+								{
+									role: "user",
+									content:
+										"Just reply with 'OK' to confirm you received the context.",
+								},
+							],
+						}),
+					});
+					const json = await res.json();
+					return { res, json, secondRequestId };
+				};
+
+				let attempt = 0;
+				const maxAttempts = 4;
+				let secondRes: Response;
+				let secondJson: any;
+				let secondRequestId: string;
+				do {
+					attempt++;
+					({
+						res: secondRes,
+						json: secondJson,
+						secondRequestId,
+					} = await sendCacheRequest());
+					const cached =
+						secondJson?.usage?.prompt_tokens_details?.cached_tokens ?? 0;
+					if (secondRes.status !== 200 || cached > 0) {
+						break;
+					}
+					if (attempt < maxAttempts) {
+						await new Promise((r) => setTimeout(r, 750 * attempt));
+					}
+				} while (attempt < maxAttempts);
 				if (logMode) {
 					console.log("Second response:", JSON.stringify(secondJson, null, 2));
 				}

diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts
@@ -5403,6 +5403,7 @@ chat.openapi(completions, async (c) => {
 				let totalTokens = null;
 				let reasoningTokens = null;
 				let cachedTokens = null;
+				let cacheCreationTokens: number | null = null;
 				let streamingToolCalls = null;
 				let imageByteSize = 0; // Track total image data size for token estimation
 				let outputImageCount = 0; // Track number of output images for cost calculation
@@ -6570,6 +6571,9 @@ chat.openapi(completions, async (c) => {
 								if (usage.cachedTokens !== null) {
 									cachedTokens = usage.cachedTokens;
 								}
+								if (usage.cacheCreationTokens !== null) {
+									cacheCreationTokens = usage.cacheCreationTokens;
+								}
 
 								// Estimate tokens if not provided and we have a finish reason
 								if (finishReason && (!promptTokens || !completionTokens)) {
@@ -7103,9 +7107,15 @@ chat.openapi(completions, async (c) => {
 											1,
 											Math.round(adjPrompt + adjCompletion),
 										),
-										...(cachedTokens !== null && {
+										...((cachedTokens !== null ||
+											(cacheCreationTokens !== null &&
+												cacheCreationTokens > 0)) && {
 											prompt_tokens_details: {
-												cached_tokens: cachedTokens,
+												cached_tokens: cachedTokens ?? 0,
+												...(cacheCreationTokens !== null &&
+													cacheCreationTokens > 0 && {
+														cache_creation_tokens: cacheCreationTokens,
+													}),
 											},
 										}),
 										cost_usd_total: streamingCostsEarly.totalCost,
@@ -8720,6 +8730,7 @@ chat.openapi(completions, async (c) => {
 		completionTokens,
 		reasoningTokens,
 		cachedTokens,
+		cacheCreationTokens,
 		toolResults,
 		images,
 		annotations,
@@ -8897,6 +8908,7 @@ chat.openapi(completions, async (c) => {
 		routingAttempts.length > 0 ? routingAttempts : null,
 		requestId,
 		usedRegion,
+		cacheCreationTokens,
 	);
 
 	// Extract plugin IDs for logging

diff --git a/apps/gateway/src/chat/schemas/completions.ts b/apps/gateway/src/chat/schemas/completions.ts
@@ -19,6 +19,11 @@ export const completionsRequestSchema = z.object({
 							z.object({
 								type: z.literal("text"),
 								text: z.string(),
+								cache_control: z
+									.object({
+										type: z.literal("ephemeral"),
+									})
+									.optional(),
 							}),
 							z.object({
 								type: z.literal("image_url"),

diff --git a/apps/gateway/src/chat/tools/extract-token-usage.ts b/apps/gateway/src/chat/tools/extract-token-usage.ts
@@ -45,6 +45,7 @@ export function extractTokenUsage(
 	let totalTokens = null;
 	let reasoningTokens = null;
 	let cachedTokens = null;
+	let cacheCreationTokens = null;
 
 	switch (provider) {
 		case "google-ai-studio":
@@ -110,6 +111,7 @@ export function extractTokenUsage(
 				completionTokens = data.usage.outputTokens ?? null;
 				// Cached tokens are the tokens read from cache (discount applies to these)
 				cachedTokens = cacheReadTokens;
+				cacheCreationTokens = cacheWriteTokens;
 				totalTokens = data.usage.totalTokens ?? null;
 			}
 			break;
@@ -118,15 +120,16 @@ export function extractTokenUsage(
 				// For Anthropic: input_tokens are the non-cached tokens
 				// We need to add cache_creation_input_tokens to get total input tokens
 				const inputTokens = data.usage.input_tokens ?? 0;
-				const cacheCreationTokens = data.usage.cache_creation_input_tokens ?? 0;
+				const cacheCreation = data.usage.cache_creation_input_tokens ?? 0;
 				const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;
 
 				// Total prompt tokens = non-cached + cache creation + cache read
-				promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+				promptTokens = inputTokens + cacheCreation + cacheReadTokens;
 				completionTokens = data.usage.output_tokens ?? null;
 				reasoningTokens = data.usage.reasoning_output_tokens ?? null;
 				// Cached tokens are the tokens read from cache (discount applies to these)
 				cachedTokens = cacheReadTokens;
+				cacheCreationTokens = cacheCreation;
 				totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
 			}
 			break;
@@ -157,5 +160,6 @@ export function extractTokenUsage(
 		totalTokens,
 		reasoningTokens,
 		cachedTokens,
+		cacheCreationTokens,
 	};
 }