Skip to content

Commit 1c01445

Browse files
committed
server/llm: revamp counting tokens for google genai
1 parent 27bdd94 commit 1c01445

File tree

2 files changed

+66
-21
lines changed

2 files changed

+66
-21
lines changed

src/packages/server/llm/google-genai-client.ts

Lines changed: 61 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@
55
*/
66

77
import { GenerativeModel, GoogleGenerativeAI } from "@google/generative-ai";
8+
import { AIMessageChunk } from "@langchain/core/messages";
89
import {
910
ChatPromptTemplate,
1011
MessagesPlaceholder,
1112
} from "@langchain/core/prompts";
1213
import { RunnableWithMessageHistory } from "@langchain/core/runnables";
14+
import { concat } from "@langchain/core/utils/stream";
1315
import { ChatGoogleGenerativeAI } from "@langchain/google-genai";
1416
import getLogger from "@cocalc/backend/logger";
1517
import { getServerSettings } from "@cocalc/database/settings";
@@ -117,10 +119,7 @@ export class GoogleGenAIClient {
117119
streaming: true,
118120
});
119121

120-
// However, we also count tokens, and for that we use "gemini-1.5-pro" only
121-
const geminiPro: GenerativeModel = this.genAI.getGenerativeModel({
122-
model: "gemini-1.5-pro",
123-
});
122+
// Token counting will be done using either usage_metadata or the actual model
124123

125124
const prompt = ChatPromptTemplate.fromMessages([
126125
["system", system ?? ""],
@@ -143,33 +142,75 @@ export class GoogleGenAIClient {
143142

144143
const chunks = await chainWithHistory.stream({ input });
145144

145+
let finalResult: AIMessageChunk | undefined;
146146
let output = "";
147147
for await (const chunk of chunks) {
148148
const { content } = chunk;
149149
if (typeof content !== "string") continue;
150150
output += content;
151151
stream?.(content);
152+
153+
// Collect the final result to check for usage metadata
154+
if (finalResult) {
155+
finalResult = concat(finalResult, chunk);
156+
} else {
157+
finalResult = chunk;
158+
}
152159
}
153160

154161
stream?.(null);
155162

156-
const { totalTokens: prompt_tokens } = await geminiPro.countTokens([
157-
input,
158-
system ?? "",
159-
...history.map(({ content }) => content),
160-
]);
161-
162-
const { totalTokens: completion_tokens } = await geminiPro.countTokens(
163-
output,
164-
);
163+
// Check for usage metadata from LangChain first (more accurate, includes thinking tokens)
164+
const usage_metadata = finalResult?.usage_metadata;
165+
log.debug("usage_metadata", usage_metadata);
166+
console.log("usage_metadata", usage_metadata);
167+
168+
if (usage_metadata) {
169+
const { input_tokens, output_tokens, total_tokens } = usage_metadata;
170+
log.debug("chatGemini successful (using usage_metadata)", {
171+
input_tokens,
172+
output_tokens,
173+
total_tokens,
174+
usage_metadata, // Log full metadata to see what other fields might be available
175+
});
165176

166-
log.debug("chatGemini successful", { prompt_tokens, completion_tokens });
177+
// For now, return the standard ChatOutput format
178+
// TODO: Consider extending ChatOutput interface to include thinking_tokens if available
179+
return {
180+
output,
181+
total_tokens,
182+
completion_tokens: output_tokens,
183+
prompt_tokens: input_tokens,
184+
};
185+
} else {
186+
// Fallback to manual token counting using the actual model (not hardcoded)
187+
const tokenCountingModel: GenerativeModel = this.genAI.getGenerativeModel(
188+
{
189+
model: modelName,
190+
},
191+
);
192+
193+
const { totalTokens: prompt_tokens } =
194+
await tokenCountingModel.countTokens([
195+
input,
196+
system ?? "",
197+
...history.map(({ content }) => content),
198+
]);
199+
200+
const { totalTokens: completion_tokens } =
201+
await tokenCountingModel.countTokens(output);
202+
203+
log.debug("chatGemini successful (using manual counting)", {
204+
prompt_tokens,
205+
completion_tokens,
206+
});
167207

168-
return {
169-
output,
170-
total_tokens: prompt_tokens + completion_tokens,
171-
completion_tokens,
172-
prompt_tokens,
173-
};
208+
return {
209+
output,
210+
total_tokens: prompt_tokens + completion_tokens,
211+
completion_tokens,
212+
prompt_tokens,
213+
};
214+
}
174215
}
175216
}

src/packages/server/llm/test/models.test.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,11 @@ const QUERY = {
3838
function checkAnswer(answer) {
3939
const { output, total_tokens, completion_tokens, prompt_tokens } = answer;
4040
expect(output).toContain("100");
41-
expect(total_tokens).toEqual(prompt_tokens + completion_tokens);
41+
// total tokens is more than that sum for "thinking" models like gemini 2.5
42+
// because thinking tokens are not included in prompt_tokens + completion_tokens
43+
expect(total_tokens).toBeGreaterThanOrEqual(
44+
prompt_tokens + completion_tokens,
45+
);
4246
expect(prompt_tokens).toBeGreaterThan(5);
4347
expect(completion_tokens).toBeGreaterThan(0);
4448
}

0 commit comments

Comments
 (0)