Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 93 additions & 18 deletions apps/gateway/src/anthropic/anthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@ const anthropicResponseSchema = z.object({
usage: z.object({
input_tokens: z.number(),
output_tokens: z.number(),
// Anthropic emits these on caching-supported models, but we keep them
// optional with a 0 default so the schema doesn't fail validation if an
// older Claude model, a beta endpoint, or a future API change ever omits
// them. The downstream conversion code already handles 0 correctly.
cache_creation_input_tokens: z.number().optional().default(0),
cache_read_input_tokens: z.number().optional().default(0),
}),
});

Expand Down Expand Up @@ -207,21 +213,40 @@ anthropic.openapi(messages, async (c) => {
// Transform Anthropic request to OpenAI format
const openaiMessages: Array<Record<string, unknown>> = [];

// Add system message if provided
// Add system message if provided.
// When the caller supplies cache_control on any text block, preserve the
// per-block array form so the inner /v1/chat/completions path can forward
// cache_control markers verbatim to Anthropic. Otherwise, join with " " to
// preserve the legacy behavior (and matching token counts) for callers
// that pass array-form system without caching opt-in.
if (anthropicRequest.system) {
let systemContent: string;
if (typeof anthropicRequest.system === "string") {
systemContent = anthropicRequest.system;
openaiMessages.push({
role: "system",
content: anthropicRequest.system,
});
} else {
// Handle array format - concatenate all text blocks
systemContent = anthropicRequest.system
.map((block) => block.text)
.join(" ");
const hasAnyCacheControl = anthropicRequest.system.some(
(block) => block.cache_control,
);
if (hasAnyCacheControl) {
openaiMessages.push({
role: "system",
content: anthropicRequest.system.map((block) => ({
type: "text",
text: block.text,
...(block.cache_control && {
cache_control: block.cache_control,
}),
})),
});
} else {
openaiMessages.push({
role: "system",
content: anthropicRequest.system.map((block) => block.text).join(" "),
});
}
}
openaiMessages.push({
role: "system",
content: systemContent,
});
}

// Transform messages using the approach from claude-code-proxy
Expand Down Expand Up @@ -372,9 +397,13 @@ anthropic.openapi(messages, async (c) => {
const hasOnlyText = message.content.every(
(block) => block.type === "text",
);
const hasAnyCacheControl = message.content.some(
(block) => block.type === "text" && block.cache_control,
);

if (hasOnlyText) {
// For text-only content, flatten to a simple string to avoid content type issues
if (hasOnlyText && !hasAnyCacheControl) {
// For text-only content with no cache markers, flatten to a simple
// string to avoid content type issues.
const textContent = message.content
.filter((block) => block.type === "text")
.map((block) => block.text)
Expand All @@ -385,10 +414,18 @@ anthropic.openapi(messages, async (c) => {
content: textContent,
});
} else {
// For true multi-modal content, transform blocks
// For multi-modal content, or text content with cache_control markers,
// transform blocks while preserving cache_control so the inner
// completions path can forward it to Anthropic.
const content = message.content.map((block) => {
if (block.type === "text" && block.text) {
return { type: "text", text: block.text };
return {
type: "text",
text: block.text,
...(block.cache_control && {
cache_control: block.cache_control,
}),
};
}
if (block.type === "image" && block.source) {
return {
Expand Down Expand Up @@ -498,7 +535,17 @@ anthropic.openapi(messages, async (c) => {
name?: string;
input?: string;
}> = [];
let usage = { input_tokens: 0, output_tokens: 0 };
let usage: {
input_tokens: number;
output_tokens: number;
cache_creation_input_tokens: number;
cache_read_input_tokens: number;
} = {
input_tokens: 0,
output_tokens: 0,
cache_creation_input_tokens: 0,
cache_read_input_tokens: 0,
};
Comment on lines +538 to +548
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Add the cache usage fields to message_start too.

This path now treats cache_creation_input_tokens and cache_read_input_tokens as always-present, but the message_start payload at Line 603 still emits usage: { input_tokens, output_tokens } only. Native streaming clients that inspect message_start.message.usage will still see undefined for the new fields.

Possible fix
  usage: {
  	input_tokens: 0,
  	output_tokens: 0,
+ 	cache_creation_input_tokens: 0,
+ 	cache_read_input_tokens: 0,
  },

Also applies to: 592-604, 739-758

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/gateway/src/anthropic/anthropic.ts` around lines 538 - 548, The
message_start payload is only emitting usage: { input_tokens, output_tokens }
while the code elsewhere (the local usage object in anthropic.ts) now includes
cache_creation_input_tokens and cache_read_input_tokens, causing native
streaming clients to see those fields as undefined; update all places that
construct or emit message_start.message.usage (including the blocks around the
existing usage declaration and the message_start emission sites referenced) to
include cache_creation_input_tokens and cache_read_input_tokens (populated from
the same usage object or initialized to 0) so the emitted usage object
consistently has { input_tokens, output_tokens, cache_creation_input_tokens,
cache_read_input_tokens } across the codepaths.

let currentTextBlockIndex: number | null = null;
const toolCallBlockIndex = new Map<number, number>();

Expand Down Expand Up @@ -694,9 +741,23 @@ anthropic.openapi(messages, async (c) => {

// Update usage if available
if (chunk.usage) {
const promptDetails =
chunk.usage.prompt_tokens_details ?? {};
const cacheRead: number = promptDetails.cached_tokens ?? 0;
const cacheCreation: number =
promptDetails.cache_creation_tokens ?? 0;
const totalPrompt: number = chunk.usage.prompt_tokens ?? 0;
const nonCachedInput = Math.max(
0,
totalPrompt - cacheRead - cacheCreation,
);
usage = {
input_tokens: chunk.usage.prompt_tokens ?? 0,
input_tokens: nonCachedInput,
output_tokens: chunk.usage.completion_tokens ?? 0,
// Match Anthropic's API and always emit both fields
// (set to 0 when inapplicable).
cache_creation_input_tokens: cacheCreation,
cache_read_input_tokens: cacheRead,
};
}

Expand Down Expand Up @@ -787,6 +848,15 @@ anthropic.openapi(messages, async (c) => {
}
}

const usageDetails = openaiResponse.usage?.prompt_tokens_details ?? {};
const cachedTokens: number = usageDetails.cached_tokens ?? 0;
const cacheCreationTokens: number = usageDetails.cache_creation_tokens ?? 0;
const totalPromptTokens: number = openaiResponse.usage?.prompt_tokens ?? 0;
const nonCachedInputTokens = Math.max(
0,
totalPromptTokens - cachedTokens - cacheCreationTokens,
);

const anthropicResponse = {
id: openaiResponse.id,
type: "message" as const,
Expand All @@ -798,8 +868,13 @@ anthropic.openapi(messages, async (c) => {
),
stop_sequence: null,
usage: {
input_tokens: openaiResponse.usage?.prompt_tokens ?? 0,
input_tokens: nonCachedInputTokens,
output_tokens: openaiResponse.usage?.completion_tokens ?? 0,
// Match Anthropic's actual API: always emit both fields (set to 0
// when inapplicable) so SDK clients with strict typing can read them
// without optionality checks.
cache_creation_input_tokens: cacheCreationTokens,
cache_read_input_tokens: cachedTokens,
},
};

Expand Down
79 changes: 53 additions & 26 deletions apps/gateway/src/chat-prompt-caching.e2e.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,32 +167,59 @@ describe("e2e prompt caching", getConcurrentTestOptions(), () => {
});
}

// Second request - should read from cache
const secondRequestId = generateTestRequestId();
const secondRes = await app.request("/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
"x-request-id": secondRequestId,
Authorization: `Bearer real-token`,
},
body: JSON.stringify({
model: model,
messages: [
{
role: "system",
content: longSystemPrompt,
},
{
role: "user",
content:
"Just reply with 'OK' to confirm you received the context.",
},
],
}),
});

const secondJson = await secondRes.json();
// Second request - should read from cache.
// Anthropic prompt cache writes are eventually consistent, so a
// back-to-back request can occasionally miss. Retry with backoff
// until we observe a cache read or run out of attempts.
// Sends a single POST to /v1/chat/completions with the long system prompt and
// a fixed short user message, tagged with a freshly generated request id.
// Resolves with the raw response, its parsed JSON body, and the request id
// that was used, so the caller can inspect any of the three.
const sendCacheRequest = async () => {
// A new id per call keeps each retry distinguishable by its x-request-id.
const secondRequestId = generateTestRequestId();
const res = await app.request("/v1/chat/completions", {
method: "POST",
headers: {
"Content-Type": "application/json",
"x-request-id": secondRequestId,
Authorization: `Bearer real-token`,
},
body: JSON.stringify({
model: model,
messages: [
{
role: "system",
content: longSystemPrompt,
},
{
role: "user",
content:
"Just reply with 'OK' to confirm you received the context.",
},
],
}),
});
// The body is parsed here regardless of the HTTP status.
const json = await res.json();
return { res, json, secondRequestId };
};

// Retry loop: issues the request at least once and at most maxAttempts times,
// keeping the response, body and request id of the most recent attempt.
let attempt = 0;
const maxAttempts = 4;
let secondRes: Response;
let secondJson: any;
let secondRequestId: string;
do {
attempt++;
({
res: secondRes,
json: secondJson,
secondRequestId,
} = await sendCacheRequest());
// A missing usage / prompt_tokens_details / cached_tokens counts as 0.
const cached =
secondJson?.usage?.prompt_tokens_details?.cached_tokens ?? 0;
// Stop on any non-200 response, or as soon as a cache read is reported.
if (secondRes.status !== 200 || cached > 0) {
break;
}
// Linear backoff (750 ms x attempt number); no wait after the last attempt.
if (attempt < maxAttempts) {
await new Promise((r) => setTimeout(r, 750 * attempt));
}
} while (attempt < maxAttempts);
if (logMode) {
console.log("Second response:", JSON.stringify(secondJson, null, 2));
}
Expand Down
16 changes: 14 additions & 2 deletions apps/gateway/src/chat/chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5403,6 +5403,7 @@ chat.openapi(completions, async (c) => {
let totalTokens = null;
let reasoningTokens = null;
let cachedTokens = null;
let cacheCreationTokens: number | null = null;
let streamingToolCalls = null;
let imageByteSize = 0; // Track total image data size for token estimation
let outputImageCount = 0; // Track number of output images for cost calculation
Expand Down Expand Up @@ -6570,6 +6571,9 @@ chat.openapi(completions, async (c) => {
if (usage.cachedTokens !== null) {
cachedTokens = usage.cachedTokens;
}
if (usage.cacheCreationTokens !== null) {
cacheCreationTokens = usage.cacheCreationTokens;
}

// Estimate tokens if not provided and we have a finish reason
if (finishReason && (!promptTokens || !completionTokens)) {
Expand Down Expand Up @@ -7103,9 +7107,15 @@ chat.openapi(completions, async (c) => {
1,
Math.round(adjPrompt + adjCompletion),
),
...(cachedTokens !== null && {
...((cachedTokens !== null ||
(cacheCreationTokens !== null &&
cacheCreationTokens > 0)) && {
prompt_tokens_details: {
cached_tokens: cachedTokens,
cached_tokens: cachedTokens ?? 0,
...(cacheCreationTokens !== null &&
cacheCreationTokens > 0 && {
cache_creation_tokens: cacheCreationTokens,
}),
Comment on lines +7110 to +7118
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

The normal [DONE] path still drops cache token details.

This addition only affects the late !doneSent usage chunk. In the normal unbuffered flow, Lines 5829-5870 already emit the final usage payload and Lines 5912-5918 set doneSent = true, so Anthropic/Bedrock streams still finish without prompt_tokens_details in the common case. That also leaks into the forceStream JSON adapter, since it copies usage from the streamed chunks.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/gateway/src/chat/chat.ts` around lines 7110 - 7118, The final (normal
unbuffered) usage emission path still omits prompt_tokens_details, so update the
code that emits the final usage payload (the block that sets doneSent = true) to
include the same conditional spread used elsewhere: include
prompt_tokens_details when (cachedTokens !== null || (cacheCreationTokens !==
null && cacheCreationTokens > 0)) with cached_tokens: cachedTokens ?? 0 and
cache_creation_tokens when applicable; also ensure the forceStream JSON adapter
(which copies usage from streamed chunks) will receive/merge that
prompt_tokens_details by copying usage including prompt_tokens_details rather
than overwriting it. Reference prompt_tokens_details, cachedTokens,
cacheCreationTokens, doneSent and forceStream when making the changes.

},
}),
cost_usd_total: streamingCostsEarly.totalCost,
Expand Down Expand Up @@ -8720,6 +8730,7 @@ chat.openapi(completions, async (c) => {
completionTokens,
reasoningTokens,
cachedTokens,
cacheCreationTokens,
toolResults,
images,
annotations,
Expand Down Expand Up @@ -8897,6 +8908,7 @@ chat.openapi(completions, async (c) => {
routingAttempts.length > 0 ? routingAttempts : null,
requestId,
usedRegion,
cacheCreationTokens,
);

// Extract plugin IDs for logging
Expand Down
5 changes: 5 additions & 0 deletions apps/gateway/src/chat/schemas/completions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ export const completionsRequestSchema = z.object({
z.object({
type: z.literal("text"),
text: z.string(),
cache_control: z
.object({
type: z.literal("ephemeral"),
})
.optional(),
}),
z.object({
type: z.literal("image_url"),
Expand Down
8 changes: 6 additions & 2 deletions apps/gateway/src/chat/tools/extract-token-usage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ export function extractTokenUsage(
let totalTokens = null;
let reasoningTokens = null;
let cachedTokens = null;
let cacheCreationTokens = null;

switch (provider) {
case "google-ai-studio":
Expand Down Expand Up @@ -110,6 +111,7 @@ export function extractTokenUsage(
completionTokens = data.usage.outputTokens ?? null;
// Cached tokens are the tokens read from cache (discount applies to these)
cachedTokens = cacheReadTokens;
cacheCreationTokens = cacheWriteTokens;
totalTokens = data.usage.totalTokens ?? null;
}
break;
Expand All @@ -118,15 +120,16 @@ export function extractTokenUsage(
// For Anthropic: input_tokens are the non-cached tokens
// We need to add cache_creation_input_tokens to get total input tokens
const inputTokens = data.usage.input_tokens ?? 0;
const cacheCreationTokens = data.usage.cache_creation_input_tokens ?? 0;
const cacheCreation = data.usage.cache_creation_input_tokens ?? 0;
const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;

// Total prompt tokens = non-cached + cache creation + cache read
promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
promptTokens = inputTokens + cacheCreation + cacheReadTokens;
completionTokens = data.usage.output_tokens ?? null;
reasoningTokens = data.usage.reasoning_output_tokens ?? null;
// Cached tokens are the tokens read from cache (discount applies to these)
cachedTokens = cacheReadTokens;
cacheCreationTokens = cacheCreation;
totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
}
break;
Expand Down Expand Up @@ -157,5 +160,6 @@ export function extractTokenUsage(
totalTokens,
reasoningTokens,
cachedTokens,
cacheCreationTokens,
};
}
Loading
Loading