diff --git a/src/.claude/settings.local.json b/src/.claude/settings.local.json index d69e67b58f2..11bf37a4f99 100644 --- a/src/.claude/settings.local.json +++ b/src/.claude/settings.local.json @@ -4,7 +4,18 @@ "Bash(pnpm tsc:*)", "Bash(pnpm build:*)", "Bash(git add:*)", - "Bash(git commit:*)" + "Bash(git commit:*)", + "Bash(node:*)", + "Bash(grep:*)", + "Bash(find:*)", + "WebFetch(domain:github.com)", + "WebFetch(domain:cocalc.com)", + "WebFetch(domain:doc.cocalc.com)", + "Bash(npm show:*)", + "Bash(prettier -w:*)", + "Bash(npx tsc:*)", + "Bash(gh pr view:*)", + "Bash(gh:*)" ], "deny": [] } diff --git a/src/packages/frontend/admin/_style.sass b/src/packages/frontend/admin/_style.sass index 5af8b6f26a6..80e020a52c2 100644 --- a/src/packages/frontend/admin/_style.sass +++ b/src/packages/frontend/admin/_style.sass @@ -13,3 +13,6 @@ td:first-child font-family: monospace font-weight: bold + +.admin-llm-test-running-row + background-color: #f0f0f0 !important diff --git a/src/packages/frontend/admin/llm/admin-llm-test.tsx b/src/packages/frontend/admin/llm/admin-llm-test.tsx new file mode 100644 index 00000000000..66718e15418 --- /dev/null +++ b/src/packages/frontend/admin/llm/admin-llm-test.tsx @@ -0,0 +1,400 @@ +import { Alert, Button, Input, Select, Space, Table } from "antd"; + +import { + redux, + useAsyncEffect, + useState, + useTypedRedux, +} from "@cocalc/frontend/app-framework"; +import { Icon, Loading, Paragraph, Title } from "@cocalc/frontend/components"; +import { LLMModelName } from "@cocalc/frontend/components/llm-name"; +import { Markdown } from "@cocalc/frontend/markdown"; +import { webapp_client } from "@cocalc/frontend/webapp-client"; +import { + USER_SELECTABLE_LLMS_BY_VENDOR, + isCoreLanguageModel, + toCustomOpenAIModel, + toOllamaModel, +} from "@cocalc/util/db-schema/llm-utils"; +import { trunc_middle } from "@cocalc/util/misc"; +import { COLORS } from "@cocalc/util/theme"; +import { PROMPTS } from "./tests"; +import { Value } from "./value"; + +interface TestResult { + model: string; + status: "pending" | "running" | "passed" | "failed"; + output: string; + error?: string; +} + +export function TestLLMAdmin() { + const customize = redux.getStore("customize"); + const globallyEnabledLLMs = customize.getEnabledLLMs(); + const selectableLLMs = useTypedRedux("customize", "selectable_llms"); + const ollama = useTypedRedux("customize", "ollama"); + const custom_openai = useTypedRedux("customize", "custom_openai"); + const [test, setTest] = useState(0); + const [querying, setQuerying] = useState(false); + const [testResults, setTestResults] = useState([]); + const [currentTestIndex, setCurrentTestIndex] = useState(0); + + // Initialize test results on component mount or when test changes + useAsyncEffect(() => { + if (test !== null) { + const allModels = getAllModels(); + const initialResults: TestResult[] = allModels.map((model) => ({ + model, + status: "pending", + output: "", + })); + setTestResults(initialResults); + } else { + setTestResults([]); + } + }, [test, custom_openai, ollama, selectableLLMs]); + + function getAllModels(): string[] { + const models: string[] = []; + + // Get core models + Object.entries(USER_SELECTABLE_LLMS_BY_VENDOR).forEach(([vendor, llms]) => { + if (vendor !== "ollama" && vendor !== "custom_openai") { + llms.filter(isCoreLanguageModel).forEach((llm) => { + models.push(llm); + }); + } + }); + + // Get custom OpenAI models + Object.entries(custom_openai?.toJS() ?? 
{}).forEach(([key, _val]) => { + const model = toCustomOpenAIModel(key); + models.push(model); + }); + + // Get Ollama models + Object.entries(ollama?.toJS() ?? {}).forEach(([key, _val]) => { + const model = toOllamaModel(key); + models.push(model); + }); + + return models; + } + + function getEnabledModels(): string[] { + return getAllModels().filter((model) => { + // Check if model is enabled in selectable LLMs + if (isCoreLanguageModel(model)) { + return selectableLLMs.includes(model); + } + // Custom OpenAI and Ollama models are always considered enabled if configured + return true; + }); + } + + async function runTestForModel( + model: string, + testConfig: any, + ): Promise { + const { prompt, expected, system, history } = testConfig; + const expectedRegex = new RegExp(expected, "g"); + + return new Promise((resolve) => { + try { + const llmStream = webapp_client.openai_client.queryStream({ + input: prompt, + project_id: null, + tag: "admin-llm-test", + model, + system, + history, + maxTokens: 20, + }); + + let reply = ""; + + llmStream.on("token", (token) => { + console.log({ model, system, token }); + if (token != null) { + reply += token; + // Update the result in real-time + setTestResults((prev) => + prev.map((r) => + r.model === model ? { ...r, output: reply } : r, + ), + ); + } else { + // Stream is complete (token is null) + const passed = expectedRegex.test(reply); + resolve({ + model, + status: passed ? "passed" : "failed", + output: reply, + }); + } + }); + + llmStream.on("error", (err) => { + console.error(`Error in LLM stream for model ${model}:`, err); + resolve({ + model, + status: "failed", + output: reply, + error: err?.toString(), + }); + }); + + // Start the stream + llmStream.emit("start"); + } catch (err) { + console.error(`Error running test for model ${model}:`, err); + resolve({ + model, + status: "failed", + output: "", + error: err?.toString(), + }); + } + }); + } + + async function runSingleTest(model: string) { + if (test === null) return; + + const testConfig = PROMPTS[test]; + + // Find the model in the results and update its status + const modelIndex = testResults.findIndex((r) => r.model === model); + if (modelIndex === -1) return; + + setCurrentTestIndex(modelIndex); + + // Update status to running + setTestResults((prev) => + prev.map((r, idx) => + idx === modelIndex + ? { ...r, status: "running", output: "", error: undefined } + : r, + ), + ); + + const result = await runTestForModel(model, testConfig); + + // Update with final result + setTestResults((prev) => + prev.map((r, idx) => (idx === modelIndex ? result : r)), + ); + } + + async function runSequentialTests() { + if (test === null) return; + + const models = getEnabledModels(); + const testConfig = PROMPTS[test]; + + // Initialize results + const initialResults: TestResult[] = models.map((model) => ({ + model, + status: "pending", + output: "", + })); + + setTestResults(initialResults); + setQuerying(true); + setCurrentTestIndex(0); + + // Run tests sequentially + for (let i = 0; i < models.length; i++) { + setCurrentTestIndex(i); + + // Update status to running + setTestResults((prev) => + prev.map((r, idx) => (idx === i ? { ...r, status: "running" } : r)), + ); + + const result = await runTestForModel(models[i], testConfig); + + // Update with final result + setTestResults((prev) => prev.map((r, idx) => (idx === i ? 
result : r))); + + // Add delay between tests to avoid rate limiting + if (i < models.length - 1) { + await new Promise((resolve) => setTimeout(resolve, 100)); + } + } + + setQuerying(false); + } + + function renderTestResultIcon(status: TestResult["status"]) { + switch (status) { + case "pending": + return ; + case "running": + return ; + case "passed": + return ; + case "failed": + return ; + default: + return ; + } + } + + function renderTestResults() { + if (testResults.length === 0) { + return ( + + Click "Run Tests" to execute the selected test on all enabled models. + + ); + } + + const columns = [ + { + title: "Status", + dataIndex: "status", + key: "status", + width: 80, + render: (status: TestResult["status"]) => renderTestResultIcon(status), + }, + { + title: "Model", + dataIndex: "model", + key: "model", + width: 180, + render: (model: string /*, record: TestResult*/) => ( + + + {/* {record.status === "running" && (Running...)} */} + + ), + }, + { + title: "Output", + dataIndex: "output", + key: "output", + render: (output: string) => + output ? ( + + ) : ( + - + ), + }, + { + title: "Error", + dataIndex: "error", + key: "error", + render: (error: string) => + error ? ( + + ) : ( + - + ), + }, + { + title: "Test", + key: "test", + width: 80, + render: (_, record: TestResult) => { + const isEnabled = getEnabledModels().includes(record.model); + const isRunning = record.status === "running"; + const isQuerying = querying && record.status === "running"; + + return ( + + ); + }, + }, + ]; + + const dataSource = testResults.map((result, index) => ({ + ...result, + key: result.model, + // Add row styling for currently running test + className: + index === currentTestIndex && querying ? "running-row" : undefined, + })); + + return ( +
+ Test Results + + index === currentTestIndex && querying + ? "admin-llm-test-running-row" + : "" + } + style={{ marginTop: "10px" }} + /> + + ); + } + + return ( +
+ + Globally enabled LLMs (Admin Settings): + . + + + + setTest(parseInt(e.target.value))} + placeholder="Enter a query..." + addonAfter={ + + } + /> + + + + + + {renderTestResults()} + + Ollama configuration + + Custom OpenAI API + +
+ ); +} diff --git a/src/packages/frontend/admin/llm/index.tsx b/src/packages/frontend/admin/llm/index.tsx deleted file mode 100644 index 94523e20440..00000000000 --- a/src/packages/frontend/admin/llm/index.tsx +++ /dev/null @@ -1,197 +0,0 @@ -import { Button, Col, Input, Row, Select, Space, Switch } from "antd"; - -import { - CSS, - redux, - useState, - useTypedRedux, -} from "@cocalc/frontend/app-framework"; -import { Paragraph, Title } from "@cocalc/frontend/components"; -import { LLMModelName } from "@cocalc/frontend/components/llm-name"; -import { - LLMServiceName, - LLM_PROVIDER, - LanguageModelCore, - USER_SELECTABLE_LLMS_BY_VENDOR, - isCoreLanguageModel, - toCustomOpenAIModel, - toOllamaModel, -} from "@cocalc/util/db-schema/llm-utils"; -import { getRandomColor, trunc_middle } from "@cocalc/util/misc"; -import { TestLLM } from "./test-component"; -import { PROMPTS } from "./tests"; -import { Value } from "./value"; - -export function TestLLMAdmin() { - const customize = redux.getStore("customize"); - const globallyEnabledLLMs = customize.getEnabledLLMs(); - const selectableLLMs = useTypedRedux("customize", "selectable_llms"); - const ollama = useTypedRedux("customize", "ollama"); - const custom_openai = useTypedRedux("customize", "custom_openai"); - const [test, setTest] = useState(0); - // TODO: this is used to trigger sending queries – makes no sense that all of them disable it. fix this. - const [querying, setQuerying] = useState(); - const [all, setAll] = useState(false); - - function llmStyle(llm: string): CSS { - return { - marginLeft: "5px", - marginBottom: "5px", - borderLeft: `5px solid ${getRandomColor(llm, { - min: 0, - max: 255, - diff: 100, - })}`, - }; - } - - function renderStatus(llm: LanguageModelCore, vendor: LLMServiceName) { - const enabled = all || selectableLLMs.includes(llm); - - return ( - - - - - - - - {enabled ? ( - - ) : undefined} - - - ); - } - - function renderCustomOpenAI() { - return ( - - Custom OpenAI - {Object.entries(custom_openai?.toJS() ?? {}).map(([key, _val]) => { - const model = toCustomOpenAIModel(key); - - return ( - - - - - - - - - - - ); - })} - - ); - } - - function renderOllama() { - return ( - - Ollama - {Object.entries(ollama?.toJS() ?? {}).map(([key, _val]) => { - const model = toOllamaModel(key); - - return ( - - - - - - - - - - - ); - })} - - ); - } - - return ( -
- - Globally enabled LLMs (Admin Settings): - . - - - - setTest(parseInt(e.target.value))} - placeholder="Enter a query..." - addonAfter={ - - } - /> - - - setAll(e)} /> All - - - - - {Object.entries(USER_SELECTABLE_LLMS_BY_VENDOR).map( - ([vendor, llms]) => - vendor !== "ollama" && vendor !== "custom_openai" ? ( -
- {LLM_PROVIDER[vendor].name} - {llms - .filter(isCoreLanguageModel) - .map((llm) => renderStatus(llm, vendor as LLMServiceName))} - - ) : undefined, - )} - {renderOllama()} - {renderCustomOpenAI()} - - - - Ollama configuration - - Custom OpenAI API - - - ); -} diff --git a/src/packages/frontend/admin/llm/test-component.tsx b/src/packages/frontend/admin/llm/test-component.tsx deleted file mode 100644 index cab5496bc40..00000000000 --- a/src/packages/frontend/admin/llm/test-component.tsx +++ /dev/null @@ -1,122 +0,0 @@ -import { Alert, Space } from "antd"; -import { throttle } from "lodash"; - -import { - useAsyncEffect, - useEffect, - useState, -} from "@cocalc/frontend/app-framework"; -import { Icon, Loading } from "@cocalc/frontend/components"; -import { Markdown } from "@cocalc/frontend/markdown"; -import { webapp_client } from "@cocalc/frontend/webapp-client"; -import { LanguageModelCore } from "@cocalc/util/db-schema/llm-utils"; -import { PROMPTS } from "./tests"; -import { Value } from "./value"; - -interface TestLLMProps { - model: LanguageModelCore | string; - test: number | null; - queryState: [boolean | undefined, (val: boolean) => void]; -} - -export function TestLLM({ model, test, queryState }: TestLLMProps) { - const [querying, setQuerying] = queryState; - const [output, setOutput] = useState(""); - const [error, setError] = useState(""); - const [passed, setPassed] = useState(); - - const { - prompt, - expected, - system = undefined, - history = undefined, - } = typeof test === "number" ? PROMPTS[test] : { prompt: "", expected: "" }; - const expectedRegex = new RegExp(expected, "g"); - - const check = throttle( - () => { - if (passed != null && output.trim() === "") { - setPassed(undefined); - } else if (expectedRegex.test(output) && !passed) { - setPassed(true); - } - }, - 250, - { - leading: false, - trailing: true, - }, - ); - - useEffect(() => { - if (prompt.trim() === "") { - setOutput(""); - setError(""); - setPassed(undefined); - } - }, [prompt, test]); - - useEffect(() => { - check(); - }, [output]); - - useAsyncEffect(async () => { - if (!querying || prompt.trim() === "") { - querying && setQuerying(false); - setError(""); - return; - } - - try { - setPassed(undefined); - const llmStream = webapp_client.openai_client.queryStream({ - input: prompt, - project_id: null, - tag: "admin-llm-test", - model, - system, - history, - maxTokens: 20, - }); - - let reply = ""; - llmStream.on("token", (token) => { - if (token) { - reply += token; - setOutput(reply); - } - }); - - llmStream.on("error", (err) => { - setPassed(false); - setError(err?.toString()); - setQuerying(false); - }); - } catch (err) { - setError(err?.toString()); - } finally { - setQuerying(false); - } - }, [querying]); - - function renderPassed() { - if (typeof passed === "boolean") { - return ; - } else { - return ; - } - } - - if (querying) { - return ; - } - - return ( - <> - - {renderPassed()} - - {error ? 
: undefined} - - ); -} diff --git a/src/packages/frontend/admin/page.tsx b/src/packages/frontend/admin/page.tsx index fcdeac3cdd0..99bba3301bc 100644 --- a/src/packages/frontend/admin/page.tsx +++ b/src/packages/frontend/admin/page.tsx @@ -14,7 +14,7 @@ import { UsageStatistics } from "./stats/page"; import { SystemNotifications } from "./system-notifications"; import { UserSearch } from "./users/user-search"; import AIAvatar from "@cocalc/frontend/components/ai-avatar"; -import { TestLLMAdmin } from "./llm"; +import { TestLLMAdmin } from "./llm/admin-llm-test"; const headerStyle = { fontSize: "12pt" } as const; diff --git a/src/packages/frontend/jupyter/insert-cell/ai-cell-generator.tsx b/src/packages/frontend/jupyter/insert-cell/ai-cell-generator.tsx index 2e50f5f015b..4b135f3f477 100644 --- a/src/packages/frontend/jupyter/insert-cell/ai-cell-generator.tsx +++ b/src/packages/frontend/jupyter/insert-cell/ai-cell-generator.tsx @@ -330,7 +330,7 @@ export function AIGenerateCodeCell({ if (cancel.current) { // we abort this stream.removeAllListeners(); - // singal "finalization" + // single "finalization" updateCells(answer); return; } diff --git a/src/packages/frontend/misc/llm.ts b/src/packages/frontend/misc/llm.ts index 4f1d28ef01f..15e10d08287 100644 --- a/src/packages/frontend/misc/llm.ts +++ b/src/packages/frontend/misc/llm.ts @@ -12,7 +12,7 @@ export { getMaxTokens }; // about 5 characters long on average, and there is a space character between // each word. So, for every 6 characters, there is approximately one token." // Using this, our 250,000 character text gets truncated down to 6*4096 ~ 25,000 -// and then runnin the tokenizer is fast: it takes 62ms instead of nearly 6 seconds! +// and then running the tokenizer is fast: it takes 62ms instead of nearly 6 seconds! // if 6 is about right, 8 should be a good upper bound. 
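// A minimal sketch of the truncation heuristic described in the comment above; it is
// an illustration only and not part of this diff. The helper name `truncateForTokenizer`
// and the 4096-token budget are hypothetical, chosen just to show the arithmetic with
// the 8-characters-per-token upper bound used below:
//
//   function truncateForTokenizer(text: string, maxTokens = 4096): string {
//     // 8 chars/token is a generous upper bound, so we never cut below the real token budget
//     const maxChars = 8 * maxTokens; // 32,768 characters for a 4096-token budget
//     return text.length > maxChars ? text.slice(0, maxChars) : text;
//   }
//
//   // e.g. a 250,000-character input is first cut to at most 32,768 characters,
//   // and only the truncated text is handed to the (slow) tokenizer.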
const APPROX_CHARACTERS_PER_TOKEN = 8; diff --git a/src/packages/pnpm-lock.yaml b/src/packages/pnpm-lock.yaml index 1fe4499fe01..5a163627777 100644 --- a/src/packages/pnpm-lock.yaml +++ b/src/packages/pnpm-lock.yaml @@ -1205,23 +1205,23 @@ importers: specifier: ^1.4.1 version: 1.4.1 '@langchain/anthropic': - specifier: ^0.3.18 - version: 0.3.24(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) + specifier: ^0.3.24 + version: 0.3.24(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) '@langchain/core': specifier: ^0.3.46 - version: 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + version: 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) '@langchain/google-genai': - specifier: ^0.2.4 - version: 0.2.14(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) + specifier: ^0.2.15 + version: 0.2.15(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) '@langchain/mistralai': - specifier: ^0.2.0 - version: 0.2.1(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(zod@3.25.76) + specifier: ^0.2.1 + version: 0.2.1(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(zod@3.25.76) '@langchain/ollama': - specifier: ^0.2.0 - version: 0.2.3(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) + specifier: ^0.2.3 + version: 0.2.3(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))) '@langchain/openai': - specifier: ^0.5.5 - version: 0.5.18(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(ws@8.18.3) + specifier: ^0.6.1 + version: 0.6.1(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(ws@8.18.3) '@node-saml/passport-saml': specifier: ^5.0.1 version: 5.0.1 @@ -1331,7 +1331,7 @@ importers: specifier: ^6.9.16 version: 6.10.1 openai: - specifier: ^4.95.1 + specifier: ^4.104.0 version: 4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76) parse-domain: specifier: ^5.0.0 @@ -3023,12 +3023,12 @@ packages: peerDependencies: '@langchain/core': ^0.3.46 - '@langchain/core@0.3.62': - resolution: {integrity: sha512-GqRTcoUPnozGRMUcA6QkP7LHL/OvanGdB51Jgb0w7IIPDI3wFugxMHZ4gphnGDtxsD1tQY5ykyEpYNxFK8kl1w==} + '@langchain/core@0.3.64': + resolution: {integrity: sha512-KOHTnmycOPfuffLAm3wwv1rThQ47iG5a3HuWMt2qYhwwImFi6HLeYqKgmxVS5qcJjc6t0IPwR7jOvv9IKxfrAw==} engines: {node: '>=18'} - '@langchain/google-genai@0.2.14': - resolution: {integrity: sha512-gKe/T2LNh8wSSMJOaFmYd8cwQnDSXKtVtC6a7CFoq5nWuh0bKzhItM/7bue1aMN8mlKfB2G1HCwxhaZoSpS/DA==} + '@langchain/google-genai@0.2.15': + resolution: {integrity: sha512-fAD3xjzd5TxWQCKlttNeEc+b5tUX43hBqKH3rk3g+wbl1ToLqe3ocWawKRmGotEuI5jhDVmoHjDxoNMifFDgmg==} engines: {node: '>=18'} peerDependencies: '@langchain/core': ^0.3.46 @@ -3045,8 +3045,8 @@ packages: peerDependencies: '@langchain/core': ^0.3.46 - '@langchain/openai@0.5.18': - resolution: {integrity: sha512-CX1kOTbT5xVFNdtLjnM0GIYNf+P7oMSu+dGCFxxWRa3dZwWiuyuBXCm+dToUGxDLnsHuV1bKBtIzrY1mLq/A1Q==} + '@langchain/openai@0.6.1': + resolution: {integrity: 
sha512-jm8MzMEjAKPReYma4Lewb9vGnocKbhoClqPuRTxtKPDgqQ5yJWSisNy4iZO/a1d6ag/7MnxwKMjVsJdy1cBsxw==} engines: {node: '>=18'} peerDependencies: '@langchain/core': ^0.3.46 @@ -4656,8 +4656,8 @@ packages: access-control@1.0.1: resolution: {integrity: sha512-H5aqjkogmFxfaOrfn/e42vyspHVXuJ8er63KuljJXpOyJ1ZO/U5CrHfO8BLKIy2w7mBM02L5quL0vbfQqrGQbA==} - acorn-import-phases@1.0.3: - resolution: {integrity: sha512-jtKLnfoOzm28PazuQ4dVBcE9Jeo6ha1GAJvq3N0LlNOszmTfx+wSycBehn+FN0RnyeR77IBxN/qVYMw0Rlj0Xw==} + acorn-import-phases@1.0.4: + resolution: {integrity: sha512-wKmbr/DDiIXzEOiWrTTUcDm24kQ2vGfZQvM2fwg2vXqR5uW6aapr7ObPtj1th32b9u90/Pf4AItvdTh42fBmVQ==} engines: {node: '>=10.13.0'} peerDependencies: acorn: ^8.14.0 @@ -8194,8 +8194,8 @@ packages: langs@2.0.0: resolution: {integrity: sha512-v4pxOBEQVN1WBTfB1crhTtxzNLZU9HPWgadlwzWKISJtt6Ku/CnpBrwVy+jFv8StjxsPfwPFzO0CMwdZLJ0/BA==} - langsmith@0.3.44: - resolution: {integrity: sha512-LMCZ7ULSzIpDmsrxGZKzCpp8exuempvCFCX1N0m+u517ZhikPDEtAtgnREObMjIISzB7eXkODkFq0Klxc9FODg==} + langsmith@0.3.46: + resolution: {integrity: sha512-Hhi4/cMjhWIGpu0DW5eQrXBbeeKQWPYYQyJCYzhFjod+xinMry4i8QR0gxrrgjGOgfMuU6nyK79YqjGTEPVbDA==} peerDependencies: '@opentelemetry/api': '*' '@opentelemetry/exporter-trace-otlp-proto': '*' @@ -8773,6 +8773,7 @@ packages: node-domexception@1.0.0: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} + deprecated: Use your platform's native DOMException instead node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} @@ -8944,8 +8945,8 @@ packages: zod: optional: true - openai@5.9.0: - resolution: {integrity: sha512-cmLC0pfqLLhBGxE4aZPyRPjydgYCncppV2ClQkKmW79hNjCvmzkfhz8rN5/YVDmjVQlFV+UsF1JIuNjNgeagyQ==} + openai@5.10.1: + resolution: {integrity: sha512-fq6xVfv1/gpLbsj8fArEt3b6B9jBxdhAK+VJ+bDvbUvNd+KTLlA3bnDeYZaBsGH9LUhJ1M1yXfp9sEyBLMx6eA==} hasBin: true peerDependencies: ws: ^8.18.0 @@ -13095,20 +13096,20 @@ snapshots: '@lumino/properties': 2.0.3 '@lumino/signaling': 2.1.4 - '@langchain/anthropic@0.3.24(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': + '@langchain/anthropic@0.3.24(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': dependencies: '@anthropic-ai/sdk': 0.56.0 - '@langchain/core': 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + '@langchain/core': 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) fast-xml-parser: 4.5.3 - '@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))': + '@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76))': dependencies: '@cfworker/json-schema': 4.1.1 ansi-styles: 5.2.0 camelcase: 6.3.0 decamelize: 1.2.0 js-tiktoken: 1.0.20 - langsmith: 0.3.44(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + langsmith: 0.3.46(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) mustache: 4.2.0 p-queue: 6.6.2 p-retry: 4.6.2 @@ -13121,31 +13122,31 @@ snapshots: - '@opentelemetry/sdk-trace-base' - openai - '@langchain/google-genai@0.2.14(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': + 
'@langchain/google-genai@0.2.15(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': dependencies: '@google/generative-ai': 0.24.1 - '@langchain/core': 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + '@langchain/core': 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) uuid: 11.1.0 - '@langchain/mistralai@0.2.1(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(zod@3.25.76)': + '@langchain/mistralai@0.2.1(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(zod@3.25.76)': dependencies: - '@langchain/core': 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + '@langchain/core': 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) '@mistralai/mistralai': 1.7.4(zod@3.25.76) uuid: 10.0.0 transitivePeerDependencies: - zod - '@langchain/ollama@0.2.3(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': + '@langchain/ollama@0.2.3(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))': dependencies: - '@langchain/core': 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + '@langchain/core': 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) ollama: 0.5.16 uuid: 10.0.0 - '@langchain/openai@0.5.18(@langchain/core@0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(ws@8.18.3)': + '@langchain/openai@0.6.1(@langchain/core@0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)))(ws@8.18.3)': dependencies: - '@langchain/core': 0.3.62(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) + '@langchain/core': 0.3.64(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)) js-tiktoken: 1.0.20 - openai: 5.9.0(ws@8.18.3)(zod@3.25.76) + openai: 5.10.1(ws@8.18.3)(zod@3.25.76) zod: 3.25.76 transitivePeerDependencies: - ws @@ -14997,7 +14998,7 @@ snapshots: setheader: 1.0.2 vary: 1.1.2 - acorn-import-phases@1.0.3(acorn@8.15.0): + acorn-import-phases@1.0.4(acorn@8.15.0): dependencies: acorn: 8.15.0 @@ -15321,7 +15322,7 @@ snapshots: axios@1.10.0: dependencies: - follow-redirects: 1.15.9 + follow-redirects: 1.15.9(debug@4.4.1) form-data: 4.0.3 proxy-from-env: 1.1.0 transitivePeerDependencies: @@ -17457,8 +17458,6 @@ snapshots: dependencies: dtype: 2.0.0 - follow-redirects@1.15.9: {} - follow-redirects@1.15.9(debug@4.4.1): optionalDependencies: debug: 4.4.1 @@ -19317,7 +19316,7 @@ snapshots: langs@2.0.0: {} - langsmith@0.3.44(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)): + langsmith@0.3.46(@opentelemetry/api@1.9.0)(openai@4.104.0(encoding@0.1.13)(ws@8.18.3)(zod@3.25.76)): dependencies: '@types/uuid': 10.0.0 chalk: 4.1.2 @@ -20156,7 +20155,7 @@ snapshots: transitivePeerDependencies: - encoding - openai@5.9.0(ws@8.18.3)(zod@3.25.76): + openai@5.10.1(ws@8.18.3)(zod@3.25.76): optionalDependencies: ws: 8.18.3 zod: 3.25.76 @@ -23123,7 +23122,7 @@ snapshots: '@webassemblyjs/wasm-edit': 1.14.1 '@webassemblyjs/wasm-parser': 1.14.1 acorn: 8.15.0 - acorn-import-phases: 1.0.3(acorn@8.15.0) + acorn-import-phases: 1.0.4(acorn@8.15.0) browserslist: 4.25.1 
chrome-trace-event: 1.0.4 enhanced-resolve: 5.18.2 @@ -23155,7 +23154,7 @@ snapshots: '@webassemblyjs/wasm-edit': 1.14.1 '@webassemblyjs/wasm-parser': 1.14.1 acorn: 8.15.0 - acorn-import-phases: 1.0.3(acorn@8.15.0) + acorn-import-phases: 1.0.4(acorn@8.15.0) browserslist: 4.25.1 chrome-trace-event: 1.0.4 enhanced-resolve: 5.18.2 diff --git a/src/packages/server/llm/anthropic.ts b/src/packages/server/llm/anthropic.ts index 52078757443..b8902431eb7 100644 --- a/src/packages/server/llm/anthropic.ts +++ b/src/packages/server/llm/anthropic.ts @@ -1,9 +1,11 @@ import { ChatAnthropic } from "@langchain/anthropic"; +import { AIMessageChunk } from "@langchain/core/messages"; import { ChatPromptTemplate, MessagesPlaceholder, } from "@langchain/core/prompts"; import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; import getLogger from "@cocalc/backend/logger"; import { getServerSettings } from "@cocalc/database/settings"; @@ -19,18 +21,11 @@ import { numTokens } from "./chatgpt-numtokens"; const log = getLogger("llm:anthropic"); function getModelName(model: AnthropicModel): string { - // The -4k and -8k variants have a limited context window (by us here) while offered for free - if (model === "claude-3-sonnet-4k") { - model = "claude-3-sonnet"; - } else if (model === "claude-3-haiku-8k") { - model = "claude-3-haiku"; - } else if (model === "claude-3-opus-8k") { - model = "claude-3-opus"; - } else if (model === "claude-3-5-sonnet-4k") { - model = "claude-3-5-sonnet"; + const id = ANTHROPIC_VERSION[model]; + if (id == null) { + throw new Error(`Anthropic model ${model} is no longer supported`); } - // now we have a valid name, and we have to append their static version number - return `${model}-${ANTHROPIC_VERSION[model]}`; + return id; } interface AnthropicOpts { @@ -105,9 +100,8 @@ export async function evaluateAnthropic( inputMessagesKey: "input", historyMessagesKey: "history", getMessageHistory: async () => { - const { messageHistory, tokens } = await transformHistoryToMessages( - history, - ); + const { messageHistory, tokens } = + await transformHistoryToMessages(history); historyTokens = tokens; return messageHistory; }, @@ -115,24 +109,57 @@ export async function evaluateAnthropic( const chunks = await chainWithHistory.stream({ input }); + let finalResult: AIMessageChunk | undefined; let output = ""; for await (const chunk of chunks) { const { content } = chunk; if (typeof content !== "string") continue; output += content; opts.stream?.(content); + + // Collect the final result to check for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunk); + } else { + finalResult = chunk; + } } opts.stream?.(null); - // we use that GPT3 tokenizer to get an approximate number of tokens - const prompt_tokens = numTokens(input) + historyTokens; - const completion_tokens = numTokens(output); - - return { - output, - total_tokens: prompt_tokens + completion_tokens, - completion_tokens, - prompt_tokens, - }; + // Check for usage metadata from LangChain first (more accurate) + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug("evaluateAnthropic successful (using usage_metadata)", { + input_tokens, + output_tokens, + total_tokens, + }); + + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // 
Fallback to manual token counting (approximation using GPT-3 tokenizer) + const prompt_tokens = numTokens(input) + historyTokens; + const completion_tokens = numTokens(output); + + log.debug("evaluateAnthropic successful (using manual counting)", { + prompt_tokens, + completion_tokens, + }); + + return { + output, + total_tokens: prompt_tokens + completion_tokens, + completion_tokens, + prompt_tokens, + }; + } } diff --git a/src/packages/server/llm/call-llm.ts b/src/packages/server/llm/call-llm.ts index 5eeebcfc601..7ea0df60c8a 100644 --- a/src/packages/server/llm/call-llm.ts +++ b/src/packages/server/llm/call-llm.ts @@ -2,11 +2,13 @@ import { delay } from "awaiting"; import type OpenAI from "openai"; import getLogger from "@cocalc/backend/logger"; import { OpenAIMessages, OpenAIModel } from "@cocalc/util/db-schema/llm-utils"; -import type { ChatOutput, Stream as StreamFunction } from "@cocalc/util/types/llm"; +import type { + ChatOutput, + Stream as StreamFunction, +} from "@cocalc/util/types/llm"; import { totalNumTokens } from "./chatgpt-numtokens"; import type { Stream } from "openai/streaming"; - const log = getLogger("llm:call-llm"); interface CallChatGPTOpts { diff --git a/src/packages/server/llm/client.ts b/src/packages/server/llm/client.ts index d792ff447cd..c738dd3e2bf 100644 --- a/src/packages/server/llm/client.ts +++ b/src/packages/server/llm/client.ts @@ -151,8 +151,8 @@ export async function getCustomOpenAI(model: string) { ); } - const settings = await getServerSettings(); - const config = settings.custom_openai_configuration?.[model]; + const { custom_openai_configuration } = await getServerSettings(); + const config = custom_openai_configuration?.[model]; if (!config) { throw new Error( `Custom OpenAI model ${model} not configured – you have to create an entry {${model}: {baseUrl: "https://...", ...}} in the "Custom OpenAI Configuration" entry of the server settings!`, @@ -173,12 +173,22 @@ export async function getCustomOpenAI(model: string) { // extract all other properties from the config, except the url, model, keepAlive field and the "cocalc" field const other = omit(config, ["baseUrl", "model", "keepAlive", "cocalc"]); - const customOpenAIConfig = { + + // Handle legacy API key field names for backward compatibility + const customOpenAIConfig: any = { configuration: { baseURL }, // https://js.langchain.com/docs/integrations/chat/openai/#custom-urls model: config.model ?? 
model, ...other, }; + // Convert legacy API key field names to the expected "apiKey" field + if (config.openAIApiKey && !customOpenAIConfig.apiKey) { + customOpenAIConfig.apiKey = config.openAIApiKey; + } + if (config.azureOpenAIApiKey && !customOpenAIConfig.apiKey) { + customOpenAIConfig.apiKey = config.azureOpenAIApiKey; + } + log.debug( "Instantiating Custom OpenAI client with config (omitting api keys)", omit(customOpenAIConfig, ["apiKey", "openAIApiKey", "azureOpenAIApiKey"]), diff --git a/src/packages/server/llm/custom-openai.ts b/src/packages/server/llm/custom-openai.ts index a6dd5c7b627..cb925c392e7 100644 --- a/src/packages/server/llm/custom-openai.ts +++ b/src/packages/server/llm/custom-openai.ts @@ -4,11 +4,13 @@ import { isCustomOpenAI, } from "@cocalc/util/db-schema/llm-utils"; import type { ChatOutput, History, Stream } from "@cocalc/util/types/llm"; +import { AIMessageChunk } from "@langchain/core/messages"; import { ChatPromptTemplate, MessagesPlaceholder, } from "@langchain/core/prompts"; import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; import { ChatOpenAI as ChatOpenAILC, OpenAICallOptions, @@ -51,7 +53,7 @@ export async function evaluateCustomOpenAI( const prompt = ChatPromptTemplate.fromMessages([ ["system", system ?? ""], - new MessagesPlaceholder("chat_history"), + new MessagesPlaceholder("history"), ["human", "{input}"], ]); @@ -63,11 +65,10 @@ export async function evaluateCustomOpenAI( runnable: chain, config: { configurable: { sessionId: "ignored" } }, inputMessagesKey: "input", - historyMessagesKey: "chat_history", + historyMessagesKey: "history", getMessageHistory: async () => { - const { messageHistory, tokens } = await transformHistoryToMessages( - history, - ); + const { messageHistory, tokens } = + await transformHistoryToMessages(history); historyTokens = tokens; return messageHistory; }, @@ -75,6 +76,7 @@ export async function evaluateCustomOpenAI( const chunks = await chainWithHistory.stream({ input }); + let finalResult: AIMessageChunk | undefined; let output = ""; for await (const chunk of chunks) { const { content } = chunk; @@ -83,19 +85,51 @@ export async function evaluateCustomOpenAI( } output += content; opts.stream?.(content); + + // Collect the final result to check for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunk); + } else { + finalResult = chunk; + } } // and an empty call when done opts.stream?.(null); - // we use that GPT3 tokenizer to get an approximate number of tokens - const prompt_tokens = numTokens(input) + historyTokens; - const completion_tokens = numTokens(output); + // Check for usage metadata from LangChain first (more accurate) + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug("evaluateCustomOpenAI successful (using usage_metadata)", { + input_tokens, + output_tokens, + total_tokens, + }); - return { - output, - total_tokens: prompt_tokens + completion_tokens, - completion_tokens, - prompt_tokens, - }; + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // Fallback to manual token counting (approximation using GPT-3 tokenizer) + const prompt_tokens = numTokens(input) + historyTokens; + const completion_tokens = numTokens(output); + + log.debug("evaluateCustomOpenAI successful (using manual 
counting)", { + prompt_tokens, + completion_tokens, + }); + + return { + output, + total_tokens: prompt_tokens + completion_tokens, + completion_tokens, + prompt_tokens, + }; + } } diff --git a/src/packages/server/llm/evaluate-lc.ts b/src/packages/server/llm/evaluate-lc.ts new file mode 100644 index 00000000000..8f986205cfb --- /dev/null +++ b/src/packages/server/llm/evaluate-lc.ts @@ -0,0 +1,406 @@ +/** + * Unified LangChain evaluation implementation + * + * This file provides a unified interface for all LangChain-based LLM providers, + * eliminating code duplication while preserving all provider-specific functionality. + */ + +import getLogger from "@cocalc/backend/logger"; +import { getServerSettings } from "@cocalc/database/settings"; +import { ServerSettings } from "@cocalc/database/settings/server-settings"; +import { + ANTHROPIC_VERSION, + AnthropicModel, + fromCustomOpenAIModel, + GOOGLE_MODEL_TO_ID, + GoogleModel, + isAnthropicModel, + isCustomOpenAI, + isGoogleModel, + isMistralModel, + isOpenAIModel, +} from "@cocalc/util/db-schema/llm-utils"; +import type { ChatOutput, History, Stream } from "@cocalc/util/types/llm"; +import { ChatAnthropic } from "@langchain/anthropic"; +import { AIMessageChunk } from "@langchain/core/messages"; +import { + ChatPromptTemplate, + MessagesPlaceholder, +} from "@langchain/core/prompts"; +import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; +import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; +import { ChatMistralAI } from "@langchain/mistralai"; +import { ChatOpenAI } from "@langchain/openai"; +import { transformHistoryToMessages } from "./chat-history"; +import { numTokens } from "./chatgpt-numtokens"; +import { getCustomOpenAI } from "./client"; +import { normalizeOpenAIModel } from "./index"; + +const log = getLogger("llm:evaluate-lc"); + +// Common interface for all LLM evaluation options +export interface LLMEvaluationOptions { + input: string; + system?: string; + history?: History; + model: string; + stream?: Stream; + maxTokens?: number; + apiKey?: string; +} + +// Provider-specific client configuration +export interface LLMProviderConfig { + // Provider identification + name: string; + + // Client creation function + createClient: ( + options: LLMEvaluationOptions, + settings: ServerSettings, + mode: "cocalc" | "user", + ) => Promise; + + // Model processing + canonicalModel?: (model: string) => string; + + // Special handling flags + getSystemRole?: (model: string) => string; + + // Token counting fallback + getTokenCountFallback?: ( + input: string, + output: string, + historyTokens: number, + model: string, + settings: any, + ) => Promise<{ prompt_tokens: number; completion_tokens: number }>; +} + +function isO1Model(normalizedModel) { + return normalizedModel === "o1" || normalizedModel === "o1-mini"; +} + +// Provider configurations +export const PROVIDER_CONFIGS = { + openai: { + name: "OpenAI", + createClient: async (options, settings) => { + const { openai_api_key: apiKey } = settings; + const normalizedModel = normalizeOpenAIModel(options.model); + + log.debug( + `OpenAI createClient: original=${options.model}, normalized=${normalizedModel}`, + ); + + // Check if it's O1 model (doesn't support streaming) + const isO1 = isO1Model(normalizedModel); + return new ChatOpenAI({ + model: normalizedModel, + apiKey: options.apiKey || apiKey, + maxTokens: options.maxTokens, + streaming: options.stream != null && !isO1, + streamUsage: true, + 
...(options.stream != null && !isO1 + ? { streamOptions: { includeUsage: true } } + : {}), + }); + }, + canonicalModel: (model) => normalizeOpenAIModel(model), + getSystemRole: (_model) => "system", + getTokenCountFallback: async (input, output, historyTokens) => ({ + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }), + }, + + google: { + name: "Google GenAI", + createClient: async (options, settings, mode) => { + const apiKey = + mode === "cocalc" ? settings.google_vertexai_key : options.apiKey; + const modelName = + mode === "cocalc" + ? GOOGLE_MODEL_TO_ID[options.model as GoogleModel] ?? options.model + : options.model; + + log.debug( + `Google createClient: original=${options.model}, modelName=${modelName}`, + ); + + return new ChatGoogleGenerativeAI({ + model: modelName, + apiKey: options.apiKey || apiKey, + maxOutputTokens: options.maxTokens, + // Only enable thinking tokens for Gemini 2.5 models + ...(modelName === "gemini-2.5-flash" || modelName === "gemini-2.5-pro" + ? { maxReasoningTokens: 1024 } + : {}), + streaming: true, + }); + }, + canonicalModel: (model) => + GOOGLE_MODEL_TO_ID[model as GoogleModel] ?? model, + getTokenCountFallback: async (input, output, historyTokens) => ({ + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }), + }, + + anthropic: { + name: "Anthropic", + createClient: async (options, settings, mode) => { + const apiKey = + mode === "cocalc" ? settings.anthropic_api_key : options.apiKey; + const modelName = + mode === "cocalc" + ? ANTHROPIC_VERSION[options.model as AnthropicModel] + : options.model; + + if (modelName == null) { + throw new Error( + `Anthropic model ${options.model} is no longer supported`, + ); + } + + log.debug( + `Anthropic createClient: original=${options.model}, modelVersion=${modelName}`, + ); + + return new ChatAnthropic({ + model: modelName, + apiKey, + maxTokens: options.maxTokens, + }); + }, + canonicalModel: (model) => { + const version = ANTHROPIC_VERSION[model as AnthropicModel]; + if (version == null) { + throw new Error(`Anthropic model ${model} is no longer supported`); + } + return version; + }, + getTokenCountFallback: async (input, output, historyTokens) => ({ + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }), + }, + + mistral: { + name: "Mistral", + createClient: async (options, settings, mode) => { + const apiKey = + mode === "cocalc" ? 
settings.mistral_api_key : options.apiKey; + + log.debug(`Mistral createClient: model=${options.model}`); + + return new ChatMistralAI({ + model: options.model, + apiKey, + }); + }, + getTokenCountFallback: async (input, output, historyTokens) => ({ + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }), + }, + + "custom-openai": { + name: "Custom OpenAI", + createClient: async (options, _settings) => { + const transformedModel = fromCustomOpenAIModel(options.model); + log.debug( + `Custom OpenAI createClient: original=${options.model}, transformed=${transformedModel}`, + ); + return await getCustomOpenAI(transformedModel); + }, + canonicalModel: (model) => fromCustomOpenAIModel(model), + getTokenCountFallback: async (input, output, historyTokens) => ({ + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }), + }, +} as const satisfies Record; + +// Get provider config based on model +export function getProviderConfig(model: string): LLMProviderConfig { + if (isOpenAIModel(model)) { + return PROVIDER_CONFIGS.openai; + } else if (isGoogleModel(model)) { + return PROVIDER_CONFIGS.google; + } else if (isAnthropicModel(model)) { + return PROVIDER_CONFIGS.anthropic; + } else if (isMistralModel(model)) { + return PROVIDER_CONFIGS.mistral; + } else if (isCustomOpenAI(model)) { + return PROVIDER_CONFIGS["custom-openai"]; + } else { + throw new Error(`Unknown model provider for: ${model}`); + } +} + +// Content processing helper +function content2string(content: any): string { + if (typeof content === "string") { + return content; + } else if (Array.isArray(content)) { + const output0 = content[0]; + if (output0?.type === "text") { + return output0.text; + } + } + + log.debug("content2string unable to process", content); + return ""; +} + +// Main unified evaluation function +export async function evaluateWithLangChain( + options: LLMEvaluationOptions, + mode: "cocalc" | "user" = "cocalc", +): Promise { + const { input, system, history = [], model, stream, maxTokens } = options; + + log.debug("evaluateWithLangChain", { + input, + history, + system, + model, + stream: stream != null, + maxTokens, + }); + + // Get provider configuration + const config = getProviderConfig(model); + + // Get server settings + const settings = await getServerSettings(); + + // Create LangChain client + const client = await config.createClient(options, settings, mode); + + // Canonical model name + const canonicalModel = config.canonicalModel + ? config.canonicalModel(model) + : model; + + // Determine system role (always use "history" for historyKey) + const systemRole = config.getSystemRole + ? config.getSystemRole(model) + : "system"; + + const historyMessagesKey = "history"; + + // Create prompt template + // For o1 models, omit the system message entirely since they don't support system roles + const isO1 = isO1Model(canonicalModel); + const prompt = isO1 + ? ChatPromptTemplate.fromMessages([ + new MessagesPlaceholder(historyMessagesKey), + ["human", system ? `${system}\n\n{input}` : "{input}"], + ]) + : ChatPromptTemplate.fromMessages([ + [systemRole, system ?? 
""], + new MessagesPlaceholder(historyMessagesKey), + ["human", "{input}"], + ]); + + const chain = prompt.pipe(client); + + let historyTokens = 0; + + // Set up chain with history + const chainWithHistory = new RunnableWithMessageHistory({ + runnable: chain, + config: { configurable: { sessionId: "ignored" } }, + inputMessagesKey: "input", + historyMessagesKey, + getMessageHistory: async () => { + const { messageHistory, tokens } = await transformHistoryToMessages( + history, + ); + historyTokens = tokens; + return messageHistory; + }, + }); + + let finalResult: AIMessageChunk | undefined; + let output = ""; + + if (stream) { + // Streaming mode + const chunks = await chainWithHistory.stream({ input }); + + for await (const chunk of chunks) { + const chunkTyped = chunk as AIMessageChunk; + const { content } = chunkTyped; + const contentStr = content2string(content); + + if (typeof content === "string") { + output += content; + stream(content); + } else if (contentStr) { + output += contentStr; + stream(contentStr); + } + + // Collect final result for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunkTyped); + } else { + finalResult = chunkTyped; + } + } + } else { + // Non-streaming mode + finalResult = (await chainWithHistory.invoke({ input })) as AIMessageChunk; + const { content } = finalResult; + output = content2string(content); + } + + stream?.(null); + + // Token counting - prefer usage_metadata, fallback to provider-specific method + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug(`${config.name} successful (using usage_metadata)`, { + input_tokens, + output_tokens, + total_tokens, + }); + + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // Fallback to provider-specific token counting + const tokenCount = config.getTokenCountFallback + ? 
await config.getTokenCountFallback( + input, + output, + historyTokens, + model, + settings, + ) + : { + prompt_tokens: numTokens(input) + historyTokens, + completion_tokens: numTokens(output), + }; + + log.debug(`${config.name} successful (using manual counting)`, tokenCount); + + return { + output, + total_tokens: tokenCount.prompt_tokens + tokenCount.completion_tokens, + completion_tokens: tokenCount.completion_tokens, + prompt_tokens: tokenCount.prompt_tokens, + }; + } +} diff --git a/src/packages/server/llm/google-genai-client.ts b/src/packages/server/llm/google-genai-client.ts index a83172f97d1..676de4352af 100644 --- a/src/packages/server/llm/google-genai-client.ts +++ b/src/packages/server/llm/google-genai-client.ts @@ -5,11 +5,13 @@ */ import { GenerativeModel, GoogleGenerativeAI } from "@google/generative-ai"; +import { AIMessageChunk } from "@langchain/core/messages"; import { ChatPromptTemplate, MessagesPlaceholder, } from "@langchain/core/prompts"; import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; import { ChatGoogleGenerativeAI } from "@langchain/google-genai"; import getLogger from "@cocalc/backend/logger"; import { getServerSettings } from "@cocalc/database/settings"; @@ -110,13 +112,14 @@ export class GoogleGenAIClient { model: modelName, apiKey: this.apiKey, maxOutputTokens: maxTokens, + // Only enable thinking tokens for Gemini 2.5 models + ...(modelName === "gemini-2.5-flash" || modelName === "gemini-2.5-pro" + ? { maxReasoningTokens: 1024 } + : {}), streaming: true, }); - // However, we also count tokens, and for that we use "gemini-1.5-pro" only - const geminiPro: GenerativeModel = this.genAI.getGenerativeModel({ - model: "gemini-1.5-pro", - }); + // Token counting will be done using either usage_metadata or the actual model const prompt = ChatPromptTemplate.fromMessages([ ["system", system ?? ""], @@ -139,32 +142,74 @@ export class GoogleGenAIClient { const chunks = await chainWithHistory.stream({ input }); + let finalResult: AIMessageChunk | undefined; let output = ""; for await (const chunk of chunks) { const { content } = chunk; if (typeof content !== "string") continue; output += content; stream?.(content); + + // Collect the final result to check for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunk); + } else { + finalResult = chunk; + } } stream?.(null); - const { totalTokens: prompt_tokens } = await geminiPro.countTokens([ - input, - system ?? 
"", - ...history.map(({ content }) => content), - ]); - - const { totalTokens: completion_tokens } = - await geminiPro.countTokens(output); + // Check for usage metadata from LangChain first (more accurate, includes thinking tokens) + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug("chatGemini successful (using usage_metadata)", { + input_tokens, + output_tokens, + total_tokens, + usage_metadata, // Log full metadata to see what other fields might be available + }); - log.debug("chatGemini successful", { prompt_tokens, completion_tokens }); + // For now, return the standard ChatOutput format + // TODO: Consider extending ChatOutput interface to include thinking_tokens if available + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // Fallback to manual token counting using the actual model (not hardcoded) + const tokenCountingModel: GenerativeModel = this.genAI.getGenerativeModel( + { + model: modelName, + }, + ); + + const { totalTokens: prompt_tokens } = + await tokenCountingModel.countTokens([ + input, + system ?? "", + ...history.map(({ content }) => content), + ]); + + const { totalTokens: completion_tokens } = + await tokenCountingModel.countTokens(output); + + log.debug("chatGemini successful (using manual counting)", { + prompt_tokens, + completion_tokens, + }); - return { - output, - total_tokens: prompt_tokens + completion_tokens, - completion_tokens, - prompt_tokens, - }; + return { + output, + total_tokens: prompt_tokens + completion_tokens, + completion_tokens, + prompt_tokens, + }; + } } } diff --git a/src/packages/server/llm/google-lc.ts b/src/packages/server/llm/google-lc.ts index 657dd3cb403..6c0537465b7 100644 --- a/src/packages/server/llm/google-lc.ts +++ b/src/packages/server/llm/google-lc.ts @@ -1,8 +1,10 @@ +import { AIMessageChunk } from "@langchain/core/messages"; import { ChatPromptTemplate, MessagesPlaceholder, } from "@langchain/core/prompts"; import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; import getLogger from "@cocalc/backend/logger"; import { getServerSettings } from "@cocalc/database/settings"; @@ -94,24 +96,57 @@ export async function evaluateGoogleGenAILC( const chunks = await chainWithHistory.stream({ input }); + let finalResult: AIMessageChunk | undefined; let output = ""; for await (const chunk of chunks) { const { content } = chunk; if (typeof content !== "string") continue; output += content; opts.stream?.(content); + + // Collect the final result to check for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunk); + } else { + finalResult = chunk; + } } opts.stream?.(null); - // we use that GPT3 tokenizer to get an approximate number of tokens - const prompt_tokens = numTokens(input) + historyTokens; - const completion_tokens = numTokens(output); - - return { - output, - total_tokens: prompt_tokens + completion_tokens, - completion_tokens, - prompt_tokens, - }; + // Check for usage metadata from LangChain first (more accurate) + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug("evaluateGoogleGenAILC successful (using usage_metadata)", { + input_tokens, + 
output_tokens, + total_tokens, + }); + + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // Fallback to manual token counting (approximation using GPT-3 tokenizer) + const prompt_tokens = numTokens(input) + historyTokens; + const completion_tokens = numTokens(output); + + log.debug("evaluateGoogleGenAILC successful (using manual counting)", { + prompt_tokens, + completion_tokens, + }); + + return { + output, + total_tokens: prompt_tokens + completion_tokens, + completion_tokens, + prompt_tokens, + }; + } } diff --git a/src/packages/server/llm/index.ts b/src/packages/server/llm/index.ts index 8e398f4b585..32fdeb574d7 100644 --- a/src/packages/server/llm/index.ts +++ b/src/packages/server/llm/index.ts @@ -48,6 +48,7 @@ import type { } from "@cocalc/util/types/llm"; import { checkForAbuse } from "./abuse"; import { evaluateAnthropic } from "./anthropic"; +import { evaluateWithLangChain } from "./evaluate-lc"; import { callChatGPTAPI } from "./call-llm"; import { getClient } from "./client"; import { evaluateCustomOpenAI } from "./custom-openai"; @@ -64,6 +65,10 @@ const DEBUG_THROW_LLM_ERROR = process.env.DEBUG_THROW_LLM_ERROR === "true"; const log = getLogger("llm"); +// Feature flag to use the new unified LangChain implementation +export const USE_NEWER_LC_IMPL = + (process.env.COCALC_LLM_USE_NEWER_LC_IMPL ?? "true") === "true"; + async function getDefaultModel(): Promise { return ((await getServerSettings()).default_llm ?? DEFAULT_MODEL) as LanguageModel; @@ -178,31 +183,52 @@ async function evaluateImpl({ const { output, total_tokens, prompt_tokens, completion_tokens } = await (async () => { - if (isUserDefinedModel(model)) { - return await evaluateUserDefinedLLM(params, account_id); - } else if (isOllamaLLM(model)) { - return await evaluateOllama(params); - } else if (isCustomOpenAI(model)) { - return await evaluateCustomOpenAI(params); - } else if (isMistralModel(model)) { - return await evaluateMistral(params); - } else if (isAnthropicModel(model)) { - return await evaluateAnthropic(params); - } else if (isGoogleModel(model)) { - const client = await getClient(model); - if (!(client instanceof GoogleGenAIClient)) { - throw new Error("Wrong client. This should never happen. [GenAI]"); + if (USE_NEWER_LC_IMPL) { + // Use the new unified LangChain implementation + if (isUserDefinedModel(model)) { + return await evaluateUserDefinedLLM(params, account_id); + } else if (isOllamaLLM(model)) { + return await evaluateOllama(params); + } else if ( + isCustomOpenAI(model) || + isMistralModel(model) || + isAnthropicModel(model) || + isGoogleModel(model) || + isOpenAIModel(model) + ) { + // Use unified implementation for LangChain-based providers + return await evaluateWithLangChain(params); + } else { + throw new Error(`Unable to handel model '${model}'.`); } - return await evaluateGoogleGenAI({ ...params, client }); - } else if (isOpenAIModel(model)) { - return await evaluateOpenAILC(params); } else { - throw new Error(`Unable to handel model '${model}'.`); - // const client = await getClient(model); - // if (!(client instanceof OpenAI)) { - // throw new Error("Wrong client. This should never happen. 
[OpenAI]"); - // } - // return await evaluateOpenAI({ ...params, client }); + // Use the original file-by-file implementation + if (isUserDefinedModel(model)) { + return await evaluateUserDefinedLLM(params, account_id); + } else if (isOllamaLLM(model)) { + return await evaluateOllama(params); + } else if (isCustomOpenAI(model)) { + return await evaluateCustomOpenAI(params); + } else if (isMistralModel(model)) { + return await evaluateMistral(params); + } else if (isAnthropicModel(model)) { + return await evaluateAnthropic(params); + } else if (isGoogleModel(model)) { + const client = await getClient(model); + if (!(client instanceof GoogleGenAIClient)) { + throw new Error("Wrong client. This should never happen. [GenAI]"); + } + return await evaluateGoogleGenAI({ ...params, client }); + } else if (isOpenAIModel(model)) { + return await evaluateOpenAILC(params); + } else { + throw new Error(`Unable to handel model '${model}'.`); + // const client = await getClient(model); + // if (!(client instanceof OpenAI)) { + // throw new Error("Wrong client. This should never happen. [OpenAI]"); + // } + // return await evaluateOpenAI({ ...params, client }); + } } })(); diff --git a/src/packages/server/llm/mistral.ts b/src/packages/server/llm/mistral.ts index c50de74afe9..7a6c5ec82fa 100644 --- a/src/packages/server/llm/mistral.ts +++ b/src/packages/server/llm/mistral.ts @@ -1,8 +1,10 @@ +import { AIMessageChunk } from "@langchain/core/messages"; import { ChatPromptTemplate, MessagesPlaceholder, } from "@langchain/core/prompts"; import { RunnableWithMessageHistory } from "@langchain/core/runnables"; +import { concat } from "@langchain/core/utils/stream"; import { ChatMistralAI, ChatMistralAIInput } from "@langchain/mistralai"; import getLogger from "@cocalc/backend/logger"; import { getServerSettings } from "@cocalc/database/settings"; @@ -86,24 +88,57 @@ export async function evaluateMistral( const chunks = await chainWithHistory.stream({ input }); + let finalResult: AIMessageChunk | undefined; let output = ""; for await (const chunk of chunks) { const { content } = chunk; if (typeof content !== "string") continue; output += content; opts.stream?.(content); + + // Collect the final result to check for usage metadata + if (finalResult) { + finalResult = concat(finalResult, chunk); + } else { + finalResult = chunk; + } } opts.stream?.(null); - // we use that GPT3 tokenizer to get an approximate number of tokens - const prompt_tokens = numTokens(input) + historyTokens; - const completion_tokens = numTokens(output); - - return { - output, - total_tokens: prompt_tokens + completion_tokens, - completion_tokens, - prompt_tokens, - }; + // Check for usage metadata from LangChain first (more accurate) + const usage_metadata = finalResult?.usage_metadata; + log.debug("usage_metadata", usage_metadata); + + if (usage_metadata) { + const { input_tokens, output_tokens, total_tokens } = usage_metadata; + log.debug("evaluateMistral successful (using usage_metadata)", { + input_tokens, + output_tokens, + total_tokens, + }); + + return { + output, + total_tokens, + completion_tokens: output_tokens, + prompt_tokens: input_tokens, + }; + } else { + // Fallback to manual token counting (approximation using GPT-3 tokenizer) + const prompt_tokens = numTokens(input) + historyTokens; + const completion_tokens = numTokens(output); + + log.debug("evaluateMistral successful (using manual counting)", { + prompt_tokens, + completion_tokens, + }); + + return { + output, + total_tokens: prompt_tokens + completion_tokens, + 
completion_tokens, + prompt_tokens, + }; + } } diff --git a/src/packages/server/llm/ollama.ts b/src/packages/server/llm/ollama.ts index ffff2850689..50321a038cf 100644 --- a/src/packages/server/llm/ollama.ts +++ b/src/packages/server/llm/ollama.ts @@ -44,9 +44,11 @@ export async function evaluateOllama( const ollama = client ?? (await getOllama(model)); + const historyMessagesKey = "history"; + const prompt = ChatPromptTemplate.fromMessages([ ["system", system ?? ""], - new MessagesPlaceholder("chat_history"), + new MessagesPlaceholder(historyMessagesKey), ["human", "{input}"], ]); @@ -58,11 +60,10 @@ export async function evaluateOllama( runnable: chain, config: { configurable: { sessionId: "ignored" } }, inputMessagesKey: "input", - historyMessagesKey: "chat_history", + historyMessagesKey, getMessageHistory: async () => { - const { messageHistory, tokens } = await transformHistoryToMessages( - history, - ); + const { messageHistory, tokens } = + await transformHistoryToMessages(history); historyTokens = tokens; return messageHistory; }, diff --git a/src/packages/server/llm/openai-lc.ts b/src/packages/server/llm/openai-lc.ts index 85dcd915923..3ad0de0ff72 100644 --- a/src/packages/server/llm/openai-lc.ts +++ b/src/packages/server/llm/openai-lc.ts @@ -55,8 +55,8 @@ export async function evaluateOpenAILC( // As of Jan 2025: reasoning models (o1) do not support streaming // https://platform.openai.com/docs/guides/reasoning/ - const isO1 = model != "o1-mini" && model != "o1"; - const streaming = stream != null && isO1; + const isO1 = model.includes("o1"); + const streaming = stream != null && !isO1; // This is also quite big -- only uncomment when developing and needing this. // log.debug("evaluateOpenAILC", { @@ -75,10 +75,10 @@ export async function evaluateOpenAILC( ...params, maxTokens, streaming, - }).bind(isO1 ? {} : { stream_options: { include_usage: true } }); + }).withConfig(streaming ? { stream_options: { include_usage: true } } : {}); const prompt = ChatPromptTemplate.fromMessages([ - [isO1 ? "developer" : "system", system ?? ""], + ["system", system ?? 
""], new MessagesPlaceholder("history"), ["human", "{input}"], ]); diff --git a/src/packages/server/llm/test/00.test.ts b/src/packages/server/llm/test/00.test.ts deleted file mode 100644 index 5834de3bed6..00000000000 --- a/src/packages/server/llm/test/00.test.ts +++ /dev/null @@ -1,154 +0,0 @@ -// import { log } from "console"; - -import getPool, { initEphemeralDatabase } from "@cocalc/database/pool"; -import { - AnthropicModel, - LanguageModelCore, - // GoogleModel, - MistralModel, - isAnthropicModel, - isGoogleModel, - isMistralModel, - isOpenAIModel, -} from "@cocalc/util/db-schema/llm-utils"; -// import { evaluateMistral } from "../mistral"; -import { evaluateAnthropic } from "../anthropic"; -import { GoogleGenAIClient } from "../google-genai-client"; -import { evaluateMistral } from "../mistral"; -import { evaluateOpenAILC } from "../openai-lc"; -import { enableModels, setupAPIKeys, test_llm } from "./shared"; -import { evaluateGoogleGenAI } from ".."; -import { getClient } from "../client"; - -beforeAll(async () => { - await initEphemeralDatabase(); - await setupAPIKeys(); - await enableModels(); -}, 15000); - -afterAll(async () => { - await getPool().end(); -}); - -const QUERY = { - input: "What's 99 + 1?", - system: "Reply only the value.", -} as const; - -function checkAnswer(answer) { - const { output, total_tokens, completion_tokens, prompt_tokens } = answer; - expect(output).toContain("100"); - expect(total_tokens).toEqual(prompt_tokens + completion_tokens); - expect(prompt_tokens).toBeGreaterThan(5); - expect(completion_tokens).toBeGreaterThan(0); -} - -async function llmOpenAI(model: LanguageModelCore) { - if (!isOpenAIModel(model)) { - throw new Error(`model: ${model} is not an OpenAI model`); - } - - const answer = await evaluateOpenAILC({ - model, - ...QUERY, - }); - - checkAnswer(answer); -} - -async function llmGoogle(model: LanguageModelCore) { - if (!isGoogleModel(model)) { - throw new Error(`model: ${model} is not a Google model`); - } - const client = (await getClient(model)) as GoogleGenAIClient; - const answer = await evaluateGoogleGenAI({ - model, - client, - ...QUERY, - }); - checkAnswer(answer); -} - -// write a test in jest that fails -test_llm("openai")("OpenAI", () => { - test("gpt3.5 works", async () => { - llmOpenAI("gpt-3.5-turbo"); - }); - test("gpt 4 works", async () => { - llmOpenAI("gpt-4"); - }); - test("gpt 4 turbo works", async () => { - llmOpenAI("gpt-4-turbo-8k"); - }); - test("gpt 4 omni works", async () => { - llmOpenAI("gpt-4o-8k"); - }); - test("gpt 4o mini works", async () => { - llmOpenAI("gpt-4o-mini-8k"); - }); - test("gpt 4.1 works", async () => { - llmOpenAI("gpt-4.1"); - }); - test("gpt 4.1 mini works", async () => { - llmOpenAI("gpt-4.1-mini"); - }); - - // test("gpt o1", async () => { - // llmOpenAI("o1-8k"); - // }); - // test("gpt o1 mini works", async () => { - // llmOpenAI("o1-mini-8k"); - // }); -}); - -// ATTN: does not work everywhere around, geolocation matters -test_llm("google")("Google GenAI", () => { - test("gemini 1.5 pro works", async () => { - llmGoogle("gemini-1.5-pro"); - }); - test("gemini 2.0 flash works", async () => { - llmGoogle("gemini-2.0-flash-8k"); - }); - test("gemini 2.0 flash lite works", async () => { - llmGoogle("gemini-2.0-flash-lite-8k"); - }); -}); - -test_llm("mistralai")("Mistral AI", () => { - const model: MistralModel = "mistral-small-latest"; - - test("model", () => { - expect(isMistralModel(model)).toBe(true); - }); - - // segaults – no clue why. 
happens with version 0.2.0 - test.skip("basics", async () => { - const answer = await evaluateMistral({ model, ...QUERY }); - checkAnswer(answer); - }); -}); - -test_llm("anthropic")("Anthropic", () => { - const haiku: AnthropicModel = "claude-3-haiku"; - const sonnet: AnthropicModel = "claude-3-5-sonnet-4k"; - const opus: AnthropicModel = "claude-3-opus-8k"; - - test("model", () => { - expect(isAnthropicModel(haiku)).toBe(true); - }); - - test("haiku", async () => { - const answer = await evaluateAnthropic({ model: haiku, ...QUERY }); - checkAnswer(answer); - }); - - test("sonnet", async () => { - const answer = await evaluateAnthropic({ model: sonnet, ...QUERY }); - checkAnswer(answer); - }); - - test("opus", async () => { - const answer = await evaluateAnthropic({ model: opus, ...QUERY }); - checkAnswer(answer); - }); -}); diff --git a/src/packages/server/llm/test/models.test.ts b/src/packages/server/llm/test/models.test.ts new file mode 100644 index 00000000000..6c692352c23 --- /dev/null +++ b/src/packages/server/llm/test/models.test.ts @@ -0,0 +1,505 @@ +// import { log } from "console"; + +import getPool, { initEphemeralDatabase } from "@cocalc/database/pool"; +import { + AnthropicModel, + LanguageModelCore, + // GoogleModel, + MistralModel, + isAnthropicModel, + isGoogleModel, + isMistralModel, + isOpenAIModel, + UserDefinedLLM, + toUserLLMModelName, +} from "@cocalc/util/db-schema/llm-utils"; +import { evaluateGoogleGenAI } from ".."; +import { evaluateAnthropic } from "../anthropic"; +import { getClient } from "../client"; +import createAccount from "../../accounts/create-account"; +import { db } from "@cocalc/database"; +import { callback2 } from "@cocalc/util/async-utils"; +import { OTHER_SETTINGS_USERDEFINED_LLM } from "@cocalc/util/db-schema/defaults"; +import { uuid } from "@cocalc/util/misc"; +import { evaluateWithLangChain } from "../evaluate-lc"; +import { GoogleGenAIClient } from "../google-genai-client"; +import { USE_NEWER_LC_IMPL } from "../index"; +import { evaluateMistral } from "../mistral"; +import { evaluateOpenAILC } from "../openai-lc"; +import { evaluateUserDefinedLLM } from "../user-defined"; +import { enableModels, setupAPIKeys, test_llm } from "./shared"; + +// sometimes (flaky case) they take more than 10s to even start a response +const LLM_TIMEOUT = 15_000; + +beforeAll(async () => { + await initEphemeralDatabase(); + await setupAPIKeys(); + await enableModels(); +}, 15000); + +afterAll(async () => { + await getPool().end(); +}); + +const QUERY = { + input: "What's 99 + 1?", + system: "Reply only the value.", +} as const; + +function checkAnswer(answer) { + const { output, total_tokens, completion_tokens, prompt_tokens } = answer; + expect(output).toContain("100"); + // total tokens is more than that sume for "thinking" models like gemini 2.5 + // because thinking tokens are not part of this + expect(total_tokens).toBeGreaterThanOrEqual( + prompt_tokens + completion_tokens, + ); + expect(prompt_tokens).toBeGreaterThan(5); + expect(completion_tokens).toBeGreaterThan(0); +} + +async function llmOpenAI(model: LanguageModelCore) { + if (!isOpenAIModel(model)) { + throw new Error(`model: ${model} is not an OpenAI model`); + } + + const answer = USE_NEWER_LC_IMPL + ? 
await evaluateWithLangChain({ + model, + ...QUERY, + }) + : await evaluateOpenAILC({ + model, + ...QUERY, + }); + + checkAnswer(answer); +} + +async function llmGoogle(model: LanguageModelCore) { + if (!isGoogleModel(model)) { + throw new Error(`model: ${model} is not a Google model`); + } + + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ + model, + ...QUERY, + }) + : await (async () => { + const client = (await getClient(model)) as GoogleGenAIClient; + return await evaluateGoogleGenAI({ + model, + client, + ...QUERY, + }); + })(); + + checkAnswer(answer); +} + +// write a test in jest that fails +test_llm("openai")("OpenAI", () => { + test( + "gpt3.5 works", + async () => { + await llmOpenAI("gpt-3.5-turbo"); + }, + LLM_TIMEOUT, + ); + test( + "gpt 4 works", + async () => { + await llmOpenAI("gpt-4"); + }, + LLM_TIMEOUT, + ); + test( + "gpt 4 turbo works", + async () => { + await llmOpenAI("gpt-4-turbo-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gpt 4 omni works", + async () => { + await llmOpenAI("gpt-4o-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gpt 4o mini works", + async () => { + await llmOpenAI("gpt-4o-mini-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gpt 4.1 works", + async () => { + await llmOpenAI("gpt-4.1"); + }, + LLM_TIMEOUT, + ); + test( + "4.1 mini works", + async () => { + await llmOpenAI("gpt-4.1-mini"); + }, + LLM_TIMEOUT, + ); + + test("o1", async () => { + await llmOpenAI("o1-8k"); + }); + + test("o1 mini works", async () => { + await llmOpenAI("o1-mini-8k"); + }); +}); + +// ATTN: does not work everywhere around, geolocation matters +test_llm("google")("Google GenAI", () => { + test( + "gemini 1.5 pro works", + async () => { + await llmGoogle("gemini-1.5-pro"); + }, + LLM_TIMEOUT, + ); + test( + "gemini 2.0 flash works", + async () => { + await llmGoogle("gemini-2.0-flash-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gemini 2.0 flash lite works", + async () => { + await llmGoogle("gemini-2.0-flash-lite-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gemini 2.5 flash works", + async () => { + await llmGoogle("gemini-2.5-flash-8k"); + }, + LLM_TIMEOUT, + ); + test( + "gemini 2.5 pro works", + async () => { + await llmGoogle("gemini-2.5-pro-8k"); + }, + LLM_TIMEOUT, + ); +}); + +test_llm("mistralai")("Mistral AI", () => { + const small: MistralModel = "mistral-small-latest"; + const medium: MistralModel = "mistral-medium-latest"; + const large: MistralModel = "mistral-large-latest"; + + test("model", () => { + expect(isMistralModel(small)).toBe(true); + expect(isMistralModel(medium)).toBe(true); + expect(isMistralModel(large)).toBe(true); + }); + + test( + "small", + async () => { + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ model: small, ...QUERY }) + : await evaluateMistral({ model: small, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + test( + "medium", + async () => { + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ model: medium, ...QUERY }) + : await evaluateMistral({ model: medium, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + test( + "large", + async () => { + const answer = USE_NEWER_LC_IMPL + ? 
await evaluateWithLangChain({ model: large, ...QUERY }) + : await evaluateMistral({ model: large, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); +}); + +test_llm("anthropic")("Anthropic", () => { + const haiku: AnthropicModel = "claude-3-5-haiku-8k"; + const sonnet: AnthropicModel = "claude-4-sonnet-8k"; + const opus: AnthropicModel = "claude-4-opus-8k"; + + test("model", () => { + expect(isAnthropicModel(haiku)).toBe(true); + expect(isAnthropicModel(sonnet)).toBe(true); + expect(isAnthropicModel(opus)).toBe(true); + }); + + test( + "haiku", + async () => { + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ model: haiku, ...QUERY }) + : await evaluateAnthropic({ model: haiku, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + test( + "sonnet", + async () => { + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ model: sonnet, ...QUERY }) + : await evaluateAnthropic({ model: sonnet, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + test( + "opus", + async () => { + const answer = USE_NEWER_LC_IMPL + ? await evaluateWithLangChain({ model: opus, ...QUERY }) + : await evaluateAnthropic({ model: opus, ...QUERY }); + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); +}); + +// User-defined LLM tests +describe("User-defined LLMs", () => { + const account_id = uuid(); + let accountCreated = false; + + beforeAll(async () => { + // Create test account only once for the entire describe block + if (!accountCreated) { + await createAccount({ + email: `test-${account_id}@example.com`, + password: "testpass123", + firstName: "Test", + lastName: "User", + account_id, + }); + accountCreated = true; + } + + // Enable user-defined LLMs server setting + await callback2(db().set_server_setting, { + name: "user_defined_llm", + value: "yes", + readonly: true, + }); + }); + + async function createUserDefinedLLMConfig(configs: UserDefinedLLM[]) { + const userDefinedLLMJson = JSON.stringify(configs); + const pool = getPool(); + await pool.query( + `UPDATE accounts SET other_settings = jsonb_set( + COALESCE(other_settings, '{}'::jsonb), + '{${OTHER_SETTINGS_USERDEFINED_LLM}}', + to_jsonb($1::text) + ) WHERE account_id = $2`, + [userDefinedLLMJson, account_id], + ); + } + + // Test user-defined OpenAI model + test( + "user-defined OpenAI model works", + async () => { + const openaiKey = process.env.COCALC_TEST_OPENAI_KEY; + if (!openaiKey) { + console.log("Skipping user-defined OpenAI test - no API key"); + return; + } + + const config: UserDefinedLLM = { + id: 1, + service: "openai", + display: "Test GPT-4o Mini", + endpoint: "https://api.openai.com/v1", + model: "gpt-4o-mini", + apiKey: openaiKey, + }; + + await createUserDefinedLLMConfig([config]); + + const userModel = toUserLLMModelName(config); + const answer = await evaluateUserDefinedLLM( + { + model: userModel, + ...QUERY, + }, + account_id, + ); + + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + // Test user-defined Google model + test( + "user-defined Google model works", + async () => { + const googleKey = process.env.COCALC_TEST_GOOGLE_GENAI_KEY; + if (!googleKey) { + console.log("Skipping user-defined Google test - no API key"); + return; + } + + const config: UserDefinedLLM = { + id: 2, + service: "google", + display: "Test Gemini Flash", + endpoint: "", + model: "gemini-1.5-flash", + apiKey: googleKey, + }; + + await createUserDefinedLLMConfig([config]); + + const userModel = toUserLLMModelName(config); + const answer = await evaluateUserDefinedLLM( + { + model: userModel, + 
...QUERY, + }, + account_id, + ); + + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + // Test user-defined Anthropic model + test( + "user-defined Anthropic model works", + async () => { + const anthropicKey = process.env.COCALC_TEST_ANTHROPIC_KEY; + if (!anthropicKey) { + console.log("Skipping user-defined Anthropic test - no API key"); + return; + } + + const config: UserDefinedLLM = { + id: 3, + service: "anthropic", + display: "claude-3-5-haiku-latest", + endpoint: "", + model: "claude-3-5-haiku-latest", + apiKey: anthropicKey, + }; + + await createUserDefinedLLMConfig([config]); + + const userModel = toUserLLMModelName(config); + const answer = await evaluateUserDefinedLLM( + { + model: userModel, + ...QUERY, + }, + account_id, + ); + + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + // Test user-defined Mistral model + test( + "user-defined Mistral model works", + async () => { + const mistralKey = process.env.COCALC_TEST_MISTRAL_AI_KEY; + if (!mistralKey) { + console.log("Skipping user-defined Mistral test - no API key"); + return; + } + + const config: UserDefinedLLM = { + id: 4, + service: "mistralai", + display: "Test Mistral Small", + endpoint: "", + model: "mistral-small-latest", + apiKey: mistralKey, + }; + + await createUserDefinedLLMConfig([config]); + + const userModel = toUserLLMModelName(config); + const answer = await evaluateUserDefinedLLM( + { + model: userModel, + ...QUERY, + }, + account_id, + ); + + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); + + // Test user-defined custom OpenAI model + test( + "user-defined custom OpenAI model works", + async () => { + const openaiKey = process.env.COCALC_TEST_OPENAI_KEY; + if (!openaiKey) { + console.log("Skipping user-defined custom OpenAI test - no API key"); + return; + } + + const config: UserDefinedLLM = { + id: 5, + service: "custom_openai", + display: "Test Custom GPT-4o", + endpoint: "https://api.openai.com/v1", + model: "gpt-4o", + apiKey: openaiKey, + }; + + await createUserDefinedLLMConfig([config]); + + const userModel = toUserLLMModelName(config); + const answer = await evaluateUserDefinedLLM( + { + model: userModel, + ...QUERY, + }, + account_id, + ); + + checkAnswer(answer); + }, + LLM_TIMEOUT, + ); +}); diff --git a/src/packages/server/llm/user-defined.ts b/src/packages/server/llm/user-defined.ts index 3e846f0624f..c4670f56a06 100644 --- a/src/packages/server/llm/user-defined.ts +++ b/src/packages/server/llm/user-defined.ts @@ -16,11 +16,12 @@ import { } from "@cocalc/util/db-schema/llm-utils"; import { isValidUUID, unreachable } from "@cocalc/util/misc"; import type { History, Stream } from "@cocalc/util/types/llm"; -import { evaluateAnthropic } from "./anthropic"; import { evaluateCustomOpenAI } from "./custom-openai"; -import { evaluateGoogleGenAILC } from "./google-lc"; -import { evaluateMistral } from "./mistral"; import { evaluateOllama } from "./ollama"; +// import { evaluateWithLangChain } from "./evaluate-lc"; +import { evaluateAnthropic } from "./anthropic"; +import { evaluateMistral } from "./mistral"; +import { evaluateGoogleGenAILC } from "./google-lc"; import { evaluateOpenAILC } from "./openai-lc"; const log = getLogger("llm:userdefined"); @@ -65,6 +66,8 @@ export async function evaluateUserDefinedLLM( // and then construct the corresponding client (maybe with a use provided API key) // and call the appropriate evaluation function. For that, it mimics how the llm framework // usually calls an LLM. + // NOTE: evaluateWithLangChain "could" work after further refactoring. 
In particular, its + // getProviderConfig must be enhanced with a generalized way to configure based on provider, not model name const { service, endpoint, apiKey } = conf; switch (service) { case "custom_openai": { @@ -107,24 +110,56 @@ export async function evaluateUserDefinedLLM( { ...opts, model: um.model, apiKey: conf.apiKey }, "user", ); + // return await evaluateWithLangChain( + // { + // ...opts, + // model: um.model, + // apiKey: conf.apiKey, + // }, + // "user", + // ); case "mistralai": return await evaluateMistral( { ...opts, model: um.model, apiKey: conf.apiKey }, "user", ); + // return await evaluateWithLangChain( + // { + // ...opts, + // model: um.model, + // apiKey: conf.apiKey, + // }, + // "user", + // ); case "google": return await evaluateGoogleGenAILC( { ...opts, model: um.model, apiKey: conf.apiKey }, "user", ); + // return await evaluateWithLangChain( + // { + // ...opts, + // model: um.model, + // apiKey: conf.apiKey, + // }, + // "user", + // ); case "openai": return await evaluateOpenAILC( { ...opts, model: um.model, apiKey: conf.apiKey }, "user", ); + // return await evaluateWithLangChain( + // { + // ...opts, + // model: um.model, + // apiKey: conf.apiKey, + // }, + // "user", + // ); default: unreachable(service); diff --git a/src/packages/server/package.json b/src/packages/server/package.json index fd46d02793b..69809b69c05 100644 --- a/src/packages/server/package.json +++ b/src/packages/server/package.json @@ -55,12 +55,12 @@ "@google-cloud/storage-transfer": "^3.3.0", "@google/generative-ai": "^0.14.0", "@isaacs/ttlcache": "^1.4.1", - "@langchain/anthropic": "^0.3.18", - "@langchain/core": "^0.3.46", - "@langchain/google-genai": "^0.2.4", - "@langchain/mistralai": "^0.2.0", - "@langchain/ollama": "^0.2.0", - "@langchain/openai": "^0.5.5", + "@langchain/anthropic": "^0.3.24", + "@langchain/core": "^0.3.64", + "@langchain/google-genai": "^0.2.15", + "@langchain/mistralai": "^0.2.1", + "@langchain/ollama": "^0.2.3", + "@langchain/openai": "^0.6.1", "@node-saml/passport-saml": "^5.0.1", "@passport-js/passport-twitter": "^1.0.8", "@passport-next/passport-google-oauth2": "^1.0.0", @@ -97,7 +97,7 @@ "nanoid": "^3.3.8", "node-zendesk": "^5.0.13", "nodemailer": "^6.9.16", - "openai": "^4.95.1", + "openai": "^4.104.0", "parse-domain": "^5.0.0", "passport": "^0.6.0", "passport-activedirectory": "^1.0.4", diff --git a/src/packages/util/db-schema/llm-utils.test.ts b/src/packages/util/db-schema/llm-utils.test.ts index dd9c18eb08d..54fb8289ee7 100644 --- a/src/packages/util/db-schema/llm-utils.test.ts +++ b/src/packages/util/db-schema/llm-utils.test.ts @@ -125,9 +125,9 @@ describe("llm", () => { expect(getModel(DEFAULT_MODEL)).toEqual(DEFAULT_MODEL); expect(getModel("mistral-medium-latest")).toEqual(DEFAULT_MODEL); expect(getModel("mistral-large-latest")).toEqual("mistral-large-latest"); - expect(getModel("claude-3-haiku-8k")).toEqual("claude-3-haiku-8k"); + expect(getModel("claude-3-5-haiku-8k")).toEqual("claude-3-5-haiku-8k"); // anthropic service disabled - expect(getModel("claude-3-haiku-8k", "anthropic")).toEqual(DEFAULT_MODEL); + expect(getModel("claude-3-5-haiku-8k", "anthropic")).toEqual(DEFAULT_MODEL); // ollama expect(getModel("ollama-foo")).toEqual(DEFAULT_MODEL); expect(getModel("ollama-phi3")).toEqual("ollama-phi3"); @@ -142,8 +142,8 @@ describe("llm", () => { "user-openai-gpt-3.5-turbo", ); // it's ok to use a model if disabled by the admin, since it's their key - expect(getModel("user-anthropic-claude-3-haiku-8k", "anthropic")).toEqual( - 
"user-anthropic-claude-3-haiku-8k", + expect(getModel("user-anthropic-claude-3-5-haiku-8k", "anthropic")).toEqual( + "user-anthropic-claude-3-5-haiku-8k", ); // meaningless user service expect(getModel("user-baz-delta99")).toEqual(DEFAULT_MODEL); diff --git a/src/packages/util/db-schema/llm-utils.ts b/src/packages/util/db-schema/llm-utils.ts index 33dbfeb8cee..0542461cfbb 100644 --- a/src/packages/util/db-schema/llm-utils.ts +++ b/src/packages/util/db-schema/llm-utils.ts @@ -151,10 +151,13 @@ export function isMistralModel(model: unknown): model is MistralModel { // $ curl -s "https://generativelanguage.googleapis.com/v1beta/models?key=$GOOGLE_GENAI" | jq export const GOOGLE_MODELS = [ "gemini-1.5-flash-8k", // introduced 2024-05-15 + "gemini-1.5-flash", // for user defined models "gemini-pro", // Discontinued Feb'25. Keep it to avoid breaking old references! "gemini-1.0-ultra", // hangs "gemini-1.5-pro-8k", // works now with langchaing "gemini-1.5-pro", // works now with langchaing + "gemini-2.5-flash-8k", + "gemini-2.5-pro-8k", "gemini-2.0-flash-8k", "gemini-2.0-flash-lite-8k", ] as const; @@ -168,33 +171,38 @@ export const GOOGLE_MODEL_TO_ID: Partial<{ [m in GoogleModel]: string }> = { "gemini-1.5-flash-8k": "gemini-1.5-flash-latest", "gemini-2.0-flash-8k": "gemini-2.0-flash", "gemini-2.0-flash-lite-8k": "gemini-2.0-flash-lite", + "gemini-2.5-flash-8k": "gemini-2.5-flash", + "gemini-2.5-pro-8k": "gemini-2.5-pro", } as const; -// https://docs.anthropic.com/claude/docs/models-overview -- stable names for the modesl ... +// https://docs.anthropic.com/en/docs/about-claude/models/overview -- stable names for the modesl ... export const ANTHROPIC_MODELS = [ "claude-3-5-sonnet", "claude-3-5-sonnet-4k", // added 2024-06-24 + "claude-3-5-haiku-8k", "claude-3-haiku", "claude-3-haiku-8k", // limited context window, offered for free "claude-3-sonnet", "claude-3-sonnet-4k", // limited context window, offered for free - "claude-3-opus-8k", // same issue as the large GPT models, limit the context window to limit spending "claude-3-opus", + "claude-3-opus-8k", // same issue as the large GPT models, limit the context window to limit spending + "claude-4-sonnet-8k", + "claude-4-opus-8k", ] as const; -const CLAUDE_SONNET_VERSION = "20240229"; -const CLAUDE_HAIKU_VERSION = "20240307"; -const CLAUDE_OPUS_VERSION = "20240229"; -const CLAUDE_SONNET_3_5_VERSION = "20240620"; -// ... 
and we add a version number (there is no "*-latest") when dispatching on the backend -export const ANTHROPIC_VERSION: { [name in AnthropicModel]: string } = { - "claude-3-sonnet-4k": CLAUDE_SONNET_VERSION, - "claude-3-opus": CLAUDE_OPUS_VERSION, - "claude-3-opus-8k": CLAUDE_OPUS_VERSION, - "claude-3-sonnet": CLAUDE_SONNET_VERSION, - "claude-3-5-sonnet": CLAUDE_SONNET_3_5_VERSION, - "claude-3-5-sonnet-4k": CLAUDE_SONNET_3_5_VERSION, - "claude-3-haiku": CLAUDE_HAIKU_VERSION, - "claude-3-haiku-8k": CLAUDE_HAIKU_VERSION, +// https://docs.anthropic.com/en/docs/about-claude/models/overview#model-aliases +// if it points to null, the model is no longer supported +export const ANTHROPIC_VERSION: { [name in AnthropicModel]: string | null } = { + "claude-3-5-sonnet": "claude-3-5-sonnet-latest", + "claude-3-5-sonnet-4k": "claude-3-5-sonnet-latest", + "claude-3-5-haiku-8k": "claude-3-5-haiku-latest", + "claude-3-haiku": "claude-3-haiku-20240307", + "claude-3-haiku-8k": "claude-3-haiku-20240307", + "claude-4-sonnet-8k": "claude-sonnet-4-0", + "claude-4-opus-8k": "claude-opus-4-0", + "claude-3-sonnet": null, + "claude-3-sonnet-4k": null, + "claude-3-opus": null, + "claude-3-opus-8k": null, } as const; export const ANTHROPIC_PREFIX = "anthropic-"; export type AnthropicModel = (typeof ANTHROPIC_MODELS)[number]; @@ -237,7 +245,9 @@ export const USER_SELECTABLE_LLMS_BY_VENDOR: { m === "gpt-4o-8k" || m === "gpt-4o-mini-8k" || m === "gpt-4.1" || - m === "gpt-4.1-mini", + m === "gpt-4.1-mini" || + m === "o1-mini-8k" || + m === "o1-8k", // ATTN: there is code for o1 and o1-mini, but it does not work yet. // The API changed, there is no support for streaming, and it took @@ -248,18 +258,19 @@ export const USER_SELECTABLE_LLMS_BY_VENDOR: { google: GOOGLE_MODELS.filter( (m) => // we only enable 1.5 pro and 1.5 flash with a limited context window. 
- m === "gemini-1.5-pro-8k" || + //m === "gemini-1.5-pro-8k" || //m === "gemini-1.5-flash-8k" || - m === "gemini-2.0-flash-8k" || - m === "gemini-2.0-flash-lite-8k", + m === "gemini-2.0-flash-lite-8k" || + m === "gemini-2.5-flash-8k" || + m === "gemini-2.5-pro-8k", ), mistralai: MISTRAL_MODELS.filter((m) => m !== "mistral-medium-latest"), anthropic: ANTHROPIC_MODELS.filter((m) => { // we show opus and the context restricted models (to avoid high costs) return ( - m === "claude-3-opus-8k" || - m === "claude-3-5-sonnet-4k" || - m === "claude-3-haiku-8k" + m === "claude-3-5-haiku-8k" || + m === "claude-4-sonnet-8k" || + m === "claude-4-opus-8k" ); }), ollama: [], // this is empty, because these models are not hardcoded @@ -601,7 +612,7 @@ export function service2model_core( } // NOTE: do not use this – instead use server_settings.default_llm -export const DEFAULT_MODEL: LanguageModel = "gemini-2.0-flash-8k"; +export const DEFAULT_MODEL: LanguageModel = "gemini-2.5-flash-8k"; interface LLMVendor { name: LLMServiceName; @@ -737,21 +748,27 @@ export const LLM_USERNAMES: LLM2String = { "chat-bison-001": "PaLM 2", "gemini-pro": "Gemini 1.0 Pro", "gemini-1.0-ultra": "Gemini 1.0 Ultra", + "gemini-1.5-flash": "Gemini 1.5 Flash", "gemini-1.5-pro": "Gemini 1.5 Pro 1m", "gemini-1.5-pro-8k": "Gemini 1.5 Pro", "gemini-1.5-flash-8k": "Gemini 1.5 Flash", "gemini-2.0-flash-8k": "Gemini 2.0 Flash", "gemini-2.0-flash-lite-8k": "Gemini 2.0 Flash Lite", + "gemini-2.5-flash-8k": "Gemini 2.5 Flash", + "gemini-2.5-pro-8k": "Gemini 2.5 Pro", "mistral-small-latest": "Mistral AI Small", "mistral-medium-latest": "Mistral AI Medium", "mistral-large-latest": "Mistral AI Large", - "claude-3-haiku": "Claude 3 Haiku 200k", + "claude-3-haiku": "Claude 3 Haiku", "claude-3-haiku-8k": "Claude 3 Haiku", + "claude-3-5-haiku-8k": "Claude 3 Haiku", "claude-3-sonnet": "Claude 3 Sonnet 200k", "claude-3-sonnet-4k": "Claude 3 Sonnet", - "claude-3-5-sonnet": "Claude 3.5 Sonnet 200k", + "claude-3-5-sonnet": "Claude 3.5 Sonnet", "claude-3-5-sonnet-4k": "Claude 3.5 Sonnet", - "claude-3-opus": "Claude 3 Opus 200k", + "claude-4-sonnet-8k": "Claude 4 Sonnet", + "claude-4-opus-8k": "Claude 4 Opus", + "claude-3-opus": "Claude 3 Opus", "claude-3-opus-8k": "Claude 3 Opus", } as const; @@ -795,6 +812,7 @@ export const LLM_DESCR: LLM2String = { "Google's Gemini 1.0 Ultra Generative AI model (30k token context)", "gemini-1.5-pro": "Google's Gemini 1.5 Pro Generative AI model (1m token context)", + "gemini-1.5-flash": "Google's Gemini 1.5 Flash Generative AI model", "gemini-1.5-pro-8k": "Google's Gemini 1.5 Pro Generative AI model (8k token context)", "gemini-1.5-flash-8k": @@ -803,6 +821,10 @@ export const LLM_DESCR: LLM2String = { "Google's Gemini 2.0 Flash Generative AI model (8k token context)", "gemini-2.0-flash-lite-8k": "Google's Gemini 2.0 Flash Lite Generative AI model (8k token context)", + "gemini-2.5-flash-8k": + "Google's Gemini 2.5 Flash Generative AI model (8k token context)", + "gemini-2.5-pro-8k": + "Google's Gemini 2.5 Pro Generative AI model (8k token context)", "mistral-small-latest": "Fast, simple queries, short answers, less capabilities. 
(Mistral AI, 4k token context)", "mistral-medium-latest": @@ -815,10 +837,16 @@ export const LLM_DESCR: LLM2String = { "Fastest model, lightweight actions (Anthropic, 8k token context)", "claude-3-5-sonnet": "Our most intelligent model (Anthropic, 200k token context)", + "claude-3-sonnet": + "Our most intelligent model (Anthropic, 200k token context)", "claude-3-5-sonnet-4k": "Our most intelligent model (Anthropic, 4k token context)", - "claude-3-sonnet": - "Best combination of performance and speed (Anthropic, 200k token context)", + "claude-3-5-haiku-8k": + "Fastest model, lightweight actions (Anthropic, 8k token context)", + "claude-4-sonnet-8k": + "Best combination of performance and speed (Anthropic, 8k token context)", + "claude-4-opus-8k": + "Excels at writing and complex tasks (Anthropic, 8k token context)", "claude-3-sonnet-4k": "Best combination of performance and speed (Anthropic, 4k token context)", "claude-3-opus": @@ -984,25 +1012,25 @@ export const LLM_COST: { [name in LanguageModelCore]: Cost } = { }, o1: { prompt_tokens: usd1Mtokens(15), - completion_tokens: usd1Mtokens(7.5), - max_tokens: 8192, // like gpt-4-turbo-8k - free: false, - }, - "o1-mini": { - prompt_tokens: usd1Mtokens(3), - completion_tokens: usd1Mtokens(1.5), + completion_tokens: usd1Mtokens(60), max_tokens: 8192, // like gpt-4-turbo-8k free: false, }, "o1-8k": { prompt_tokens: usd1Mtokens(15), - completion_tokens: usd1Mtokens(7.5), + completion_tokens: usd1Mtokens(60), max_tokens: 8192, // like gpt-4-turbo-8k free: false, }, "o1-mini-8k": { - prompt_tokens: usd1Mtokens(3), - completion_tokens: usd1Mtokens(1.5), + prompt_tokens: usd1Mtokens(1.1), + completion_tokens: usd1Mtokens(4.4), + max_tokens: 8192, // like gpt-4-turbo-8k + free: true, + }, + "o1-mini": { + prompt_tokens: usd1Mtokens(1.1), + completion_tokens: usd1Mtokens(4.4), max_tokens: 8192, // like gpt-4-turbo-8k free: false, }, @@ -1038,6 +1066,12 @@ export const LLM_COST: { [name in LanguageModelCore]: Cost } = { max_tokens: 30720, free: true, }, + "gemini-1.5-flash": { + prompt_tokens: usd1Mtokens(0.075), + completion_tokens: usd1Mtokens(0.3), + max_tokens: 8_000, + free: true, + }, "gemini-1.5-flash-8k": { prompt_tokens: usd1Mtokens(0.075), completion_tokens: usd1Mtokens(0.3), @@ -1057,6 +1091,18 @@ export const LLM_COST: { [name in LanguageModelCore]: Cost } = { max_tokens: 8_000, free: true, }, + "gemini-2.5-flash-8k": { + prompt_tokens: usd1Mtokens(0.3), + completion_tokens: usd1Mtokens(2.5), + max_tokens: 8_000, + free: true, + }, + "gemini-2.5-pro-8k": { + prompt_tokens: usd1Mtokens(1.25), + completion_tokens: usd1Mtokens(10), + max_tokens: 8_000, + free: false, + }, // https://mistral.ai/technology/ "mistral-small-latest": { prompt_tokens: usd1Mtokens(0.2), @@ -1105,7 +1151,7 @@ export const LLM_COST: { [name in LanguageModelCore]: Cost } = { prompt_tokens: usd1Mtokens(3), completion_tokens: usd1Mtokens(15), max_tokens: 4_000, // limited to 4k tokens, offered for free - free: true, + free: false, }, "claude-3-sonnet": { prompt_tokens: usd1Mtokens(3), @@ -1122,7 +1168,25 @@ export const LLM_COST: { [name in LanguageModelCore]: Cost } = { "claude-3-haiku": { prompt_tokens: usd1Mtokens(0.8), completion_tokens: usd1Mtokens(4), - max_tokens: 200_000, + max_tokens: 8_000, // limited to 8k tokens, offered for free + free: true, + }, + "claude-3-5-haiku-8k": { + prompt_tokens: usd1Mtokens(0.8), + completion_tokens: usd1Mtokens(4), + max_tokens: 8_000, + free: true, + }, + "claude-4-sonnet-8k": { + prompt_tokens: usd1Mtokens(3), + 
completion_tokens: usd1Mtokens(15), + max_tokens: 8_000, + free: false, + }, + "claude-4-opus-8k": { + prompt_tokens: usd1Mtokens(15), + completion_tokens: usd1Mtokens(75), + max_tokens: 8_000, free: false, }, } as const; diff --git a/src/packages/util/db-schema/purchase-quotas.ts b/src/packages/util/db-schema/purchase-quotas.ts index becc3377535..a8c456ee016 100644 --- a/src/packages/util/db-schema/purchase-quotas.ts +++ b/src/packages/util/db-schema/purchase-quotas.ts @@ -37,9 +37,14 @@ export function isPaygService(service: Service): boolean { return IS_PAYG[category ?? ""] ?? false; } +const GOOGLE_AI_COLOR = "#ff4d4f"; +const ANTHROPIC_COLOR = "#181818"; +const OPENAI_COLOR = "#10a37f"; +const MISTRALAI_COLOR = "#ff7000"; + const GPT_TURBO_128k: Spec = { display: "OpenAI GPT-4 Turbo 128k", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", } as const; @@ -50,7 +55,7 @@ const GPT_TURBO_8K: Spec = { const GPT_OMNI_128k: Spec = { display: "OpenAI GPT-4o 128k", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", } as const; @@ -71,7 +76,7 @@ const GPT_OMNI_MINI_8K: Spec = { const GPT_41_8K: Spec = { display: "OpenAI GPT-4.1", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", } as const; @@ -90,8 +95,6 @@ const GPT_O1_MINI_8K: Spec = { display: "OpenAI o1 mini", } as const; -const GOOGLE_AI_COLOR = "#ff4d4f"; - // NOTE: all-quotas-config.tsx will automatically filter out those, which are free or not selectable by the user export const QUOTA_SPEC: QuotaSpec = { credit: { @@ -159,26 +162,30 @@ export const QUOTA_SPEC: QuotaSpec = { description: "Charge for purchasing a voucher.", }, // ATTN: LLMs comes below this line, the quotas above are the important ones to show first! - "openai-gpt-4": { display: "OpenAI GPT-4", color: "#10a37f", category: "ai" }, + "openai-gpt-4": { + display: "OpenAI GPT-4", + color: OPENAI_COLOR, + category: "ai", + }, "openai-gpt-3.5-turbo": { display: "OpenAI GPT-3.5", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", }, "openai-gpt-3.5-turbo-16k": { display: "OpenAI GPT-3.5 16k", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", }, "openai-text-embedding-ada-002": { display: "OpenAI Text Embedding Ada 002", - color: "#10a37f", + color: OPENAI_COLOR, noSet: true, // because this model is not user visible yet category: "ai", }, "openai-gpt-4-32k": { display: "OpenAI GPT-4 32k", - color: "#10a37f", + color: OPENAI_COLOR, category: "ai", }, "openai-gpt-4-turbo-preview": GPT_TURBO_128k, // the "preview" is over @@ -213,6 +220,11 @@ export const QUOTA_SPEC: QuotaSpec = { noSet: true, // deprecated, will be removed category: "ai", }, + "google-gemini-1.5-flash": { + display: "Google Gemini 1.5 Flash", + color: GOOGLE_AI_COLOR, + category: "ai", + }, "google-gemini-1.5-flash-8k": { display: "Google Gemini 1.5 Flash", color: GOOGLE_AI_COLOR, @@ -248,59 +260,84 @@ export const QUOTA_SPEC: QuotaSpec = { color: GOOGLE_AI_COLOR, category: "ai", }, + "google-gemini-2.5-flash-8k": { + display: LLM_USERNAMES["gemini-2.5-flash-8k"], + color: GOOGLE_AI_COLOR, + category: "ai", + }, + "google-gemini-2.5-pro-8k": { + display: LLM_USERNAMES["gemini-2.5-pro-8k"], + color: GOOGLE_AI_COLOR, + category: "ai", + }, "anthropic-claude-3-opus": { display: LLM_USERNAMES["claude-3-opus"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-opus-8k": { display: LLM_USERNAMES["claude-3-opus-8k"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-sonnet": { display: 
LLM_USERNAMES["claude-3-sonnet"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-sonnet-4k": { display: LLM_USERNAMES["claude-3-sonnet-4k"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-5-sonnet": { display: LLM_USERNAMES["claude-3-5-sonnet"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-5-sonnet-4k": { display: LLM_USERNAMES["claude-3-5-sonnet-4k"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-haiku": { display: LLM_USERNAMES["claude-3-haiku"], - color: "#181818", + color: ANTHROPIC_COLOR, category: "ai", }, "anthropic-claude-3-haiku-8k": { display: LLM_USERNAMES["claude-3-haiku-8k"], - color: "#181818", + color: ANTHROPIC_COLOR, + category: "ai", + }, + "anthropic-claude-3-5-haiku-8k": { + display: LLM_USERNAMES["claude-3-5-haiku-8k"], + color: ANTHROPIC_COLOR, + category: "ai", + }, + "anthropic-claude-4-sonnet-8k": { + display: LLM_USERNAMES["claude-4-sonnet-8k"], + color: ANTHROPIC_COLOR, + category: "ai", + }, + "anthropic-claude-4-opus-8k": { + display: LLM_USERNAMES["claude-4-opus-8k"], + color: ANTHROPIC_COLOR, category: "ai", }, "mistralai-mistral-small-latest": { display: LLM_USERNAMES["mistral-small-latest"], - color: "#ff7000", // the orange from their website + color: MISTRALAI_COLOR, // the orange from their website category: "ai", }, "mistralai-mistral-medium-latest": { display: LLM_USERNAMES["mistral-medium-latest"], - color: "#ff7000", // the orange from their website + color: MISTRALAI_COLOR, // the orange from their website category: "ai", }, "mistralai-mistral-large-latest": { display: LLM_USERNAMES["mistral-large-latest"], - color: "#ff7000", // the orange from their website + color: MISTRALAI_COLOR, // the orange from their website category: "ai", }, "project-upgrade": { diff --git a/src/packages/util/db-schema/site-settings-extras.ts b/src/packages/util/db-schema/site-settings-extras.ts index f6c01efe932..09db8b45bac 100644 --- a/src/packages/util/db-schema/site-settings-extras.ts +++ b/src/packages/util/db-schema/site-settings-extras.ts @@ -339,7 +339,7 @@ export const EXTRAS: SettingsExtras = { // This is very similar to the ollama config, but there are small differences in the details. custom_openai_configuration: { name: "Custom OpenAI Endpoints", - desc: 'Configure OpenAI endpoints, queried via [@langchain/openai (Node.js)](https://js.langchain.com/v0.1/docs/integrations/llms/openai/). e.g. `{"myllm" : {"baseUrl": "http://1.2.3.4:5678/" , apiKey: "key...", cocalc: {display: "My LLM", desc: "My custom LLM", icon: "https://.../...png"}}, "gpt-4o-high": {baseUrl: "https://api.openai.com/v1", temperature: 1.5, "openAIApiKey": "sk-...", "model": "gpt-4o", cocalc: {display: "High GPT-4 Omni", desc: "GPT 4 Omni High Temp"}}}`', + desc: 'Configure OpenAI endpoints, queried via [@langchain/openai (Node.js)](https://js.langchain.com/v0.1/docs/integrations/llms/openai/). e.g. `{"myllm" : {"baseUrl": "http://1.2.3.4:5678/" , apiKey: "key...", cocalc: {display: "My LLM", desc: "My custom LLM", icon: "https://.../...png"}}, "gpt-4o-high": {baseUrl: "https://api.openai.com/v1", temperature: 1.5, "apiKey": "sk-...", "model": "gpt-4o", cocalc: {display: "High GPT-4 Omni", desc: "GPT 4 Omni High Temp"}}}`', default: "{}", multiline: 5, show: custom_openai_enabled,
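
The same streaming pattern now appears three times above, in google-genai-client.ts, google-lc.ts and mistral.ts: merge the streamed chunks with concat() so the final AIMessageChunk still carries usage_metadata, then fall back to tokenizer-based counting when the provider reports nothing. The sketch below only illustrates the shape of that shared logic; collectWithUsage and countFallback are illustrative names, not the actual helpers in this PR or in the new evaluate-lc.ts.

import { AIMessageChunk } from "@langchain/core/messages";
import { concat } from "@langchain/core/utils/stream";

interface CollectedOutput {
  output: string;
  total_tokens: number;
  prompt_tokens: number;
  completion_tokens: number;
}

// Accumulate a LangChain chat stream and derive token counts, preferring the
// provider-reported usage_metadata and falling back to a local estimate.
async function collectWithUsage(
  chunks: AsyncIterable<AIMessageChunk>,
  stream: ((text: string | null) => void) | undefined,
  // fallback counter, e.g. based on the GPT-3 tokenizer like numTokens() above
  countFallback: (output: string) => {
    prompt_tokens: number;
    completion_tokens: number;
  },
): Promise<CollectedOutput> {
  let finalResult: AIMessageChunk | undefined;
  let output = "";
  for await (const chunk of chunks) {
    const { content } = chunk;
    if (typeof content === "string") {
      output += content;
      stream?.(content);
    }
    // keep merging chunks so the final one still carries usage_metadata
    finalResult = finalResult ? concat(finalResult, chunk) : chunk;
  }
  stream?.(null);

  const usage = finalResult?.usage_metadata;
  if (usage != null) {
    // provider counts include "thinking" tokens, so total_tokens may exceed the sum
    return {
      output,
      total_tokens: usage.total_tokens,
      prompt_tokens: usage.input_tokens,
      completion_tokens: usage.output_tokens,
    };
  }
  const { prompt_tokens, completion_tokens } = countFallback(output);
  return {
    output,
    total_tokens: prompt_tokens + completion_tokens,
    prompt_tokens,
    completion_tokens,
  };
}

When usage_metadata is present, total_tokens can exceed prompt_tokens + completion_tokens because reasoning tokens are counted separately, which is why checkAnswer in models.test.ts now asserts toBeGreaterThanOrEqual rather than strict equality.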
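
In the llm-utils.ts hunk above, ANTHROPIC_VERSION now maps each AnthropicModel to string | null, with null marking the claude-3-sonnet* and claude-3-opus* entries that Anthropic retired. Callers therefore have to guard against null before handing the id to the client; a minimal sketch under that assumption, with resolveAnthropicModelId as a hypothetical name rather than a function in this diff:

import {
  ANTHROPIC_VERSION,
  AnthropicModel,
} from "@cocalc/util/db-schema/llm-utils";

// hypothetical helper, not part of this PR
function resolveAnthropicModelId(model: AnthropicModel): string {
  const id = ANTHROPIC_VERSION[model];
  if (id == null) {
    // null now marks models Anthropic no longer serves
    throw new Error(`Anthropic model '${model}' is no longer supported`);
  }
  return id; // e.g. "claude-3-5-haiku-latest" or "claude-sonnet-4-0"
}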
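
The new LLM_COST entries reuse the existing usd1Mtokens() convention, which appears to convert a USD-per-million-tokens price into a per-token rate (its definition is outside this section, so treat that as an assumption). Under that assumption, a rough back-of-the-envelope estimate for the newly added gemini-2.5-flash-8k entry looks like this; the real charging logic lives elsewhere in @cocalc/util and is not part of this diff:

import { LLM_COST } from "@cocalc/util/db-schema/llm-utils";

// hypothetical usage: estimate the charge for 1,000 prompt and 500 completion tokens,
// assuming prompt_tokens/completion_tokens are per-token USD rates
const { prompt_tokens, completion_tokens } = LLM_COST["gemini-2.5-flash-8k"];
const estimatedUSD = 1_000 * prompt_tokens + 500 * completion_tokens;
// with the prices in this PR this is roughly 0.0003 + 0.00125, i.e. about 0.0016 USD
console.log(estimatedUSD);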