15 changes: 15 additions & 0 deletions KK2-0905.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Shell script to run KK2-0905-UD-Q3_K_XL without having to type the monster CLI.
+# Assumes we have cd'd into ~/Development/llama.cpp.
+
+MODEL_PATH="/Users/edsilmacstudio/Development/llama.cpp/models/Kimi-K2-Instruct-0905-UD-Q3_K_XL/models--unsloth--Kimi-K2-Instruct-0905-GGUF/snapshots/ca516d05c7621c0615db3fc7efa63c9617547363/UD-Q3_K_XL/Kimi-K2-Instruct-0905-UD-Q3_K_XL-00001-of-00010.gguf"
+
+./build/bin/llama-server \
+  -m "$MODEL_PATH" \
+  -c 225176 \
+  -ngl 99 \
+  --parallel 4 \
+  --no-warmup \
+  --host 0.0.0.0 \
+  --port 3000 \
+  --log-file "KK2_0905_log_4"
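
The webui changes below display metadata that the server launched by this script reports about itself. As a minimal sketch of that payload (assuming the server above is reachable on localhost:3000 and that serverProps is populated from llama-server's /props endpoint, and assuming a runtime with global fetch, i.e. a browser or Node 18+), the field names match the ones the components below consume:

// Sketch: inspect the /props payload that the webui's serverProps reflects.
// Assumes the server started by KK2-0905.sh is listening on localhost:3000.
async function showServerProps(): Promise<void> {
  const res = await fetch('http://localhost:3000/props');
  const props = await res.json();
  console.log('model_path:', props.model_path);
  console.log('build_info:', props.build_info);
  console.log('n_ctx:', props.default_generation_settings?.n_ctx);
}

showServerProps().catch(console.error);
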
605 changes: 605 additions & 0 deletions tools/server/public/index.html

Large diffs are not rendered by default.

Binary file modified tools/server/public/index.html.gz
Binary file not shown.
730 changes: 426 additions & 304 deletions tools/server/webui/package-lock.json

Large diffs are not rendered by default.

77 changes: 65 additions & 12 deletions tools/server/webui/src/components/ChatMessage.tsx
@@ -1,4 +1,4 @@
-import { useMemo, useState } from 'react';
+import { useMemo, useEffect, useState } from 'react';
 import { useAppContext } from '../utils/app.context';
 import { Message, PendingMessage } from '../utils/types';
 import { classNames } from '../utils/misc';
@@ -37,7 +37,22 @@ export default function ChatMessage({
   onChangeSibling(sibling: Message['id']): void;
   isPending?: boolean;
 }) {
-  const { viewingChat, config } = useAppContext();
+  const { viewingChat, config, serverProps } = useAppContext();
+
+  if (serverProps) {
+    // Add debugging:
+    console.log('ChatMessage - serverProps: ', serverProps);
+    console.log('ChatMessage - serverProps type: ', typeof serverProps);
+    console.log(
+      'ChatMessage - serverProps keys: ',
+      serverProps ? Object.keys(serverProps) : 'null'
+    );
+    console.log('ChatMessage - n_ctx direct: ', serverProps?.n_ctx);
+    console.log(
+      'ChatMessage - currently loaded model: ',
+      serverProps?.model_path
+    );
+  }
   const [editingContent, setEditingContent] = useState<string | null>(null);
   const timings = useMemo(
     () =>
@@ -55,6 +70,22 @@ export default function ChatMessage({
   const nextSibling = siblingLeafNodeIds[siblingCurrIdx + 1];
   const prevSibling = siblingLeafNodeIds[siblingCurrIdx - 1];
 
+  const { getConversationTokenTotal, addTokensToConversation } =
+    useAppContext();
+  const [hasAddedTokens, setHasAddedTokens] = useState(false);
+
+  // Get current conversation token total
+  const conversationTotal = getConversationTokenTotal(msg.convId);
+
+  // Add tokens to running total when timings are available
+  useEffect(() => {
+    if (timings && !hasAddedTokens && msg.role === 'assistant') {
+      const messageTokens = timings.prompt_n + timings.predicted_n;
+      addTokensToConversation(msg.convId, messageTokens);
+      setHasAddedTokens(true);
+    }
+  }, [timings, hasAddedTokens, msg.convId, msg.role, addTokensToConversation]);
+
   // for reasoning model, we split the message into content and thought
   // TODO: implement this as remark/rehype plugin in the future
   const { content, thought, isThinking }: SplitMessage = useMemo(() => {
@@ -87,6 +118,9 @@ export default function ChatMessage({
 
   const isUser = msg.role === 'user';
 
+  // @ts-expect-error/ban-ts-comment
+  const contextSize = serverProps?.['default_generation_settings']?.['n_ctx'];
+
   return (
     <div
       className="group"
@@ -175,19 +209,38 @@ export default function ChatMessage({
             role="button"
             className="cursor-pointer font-semibold text-sm opacity-60"
           >
-            Speed: {timings.predicted_per_second.toFixed(1)} t/s
+            Speed test: {timings.predicted_per_second.toFixed(1)} t/s |
+            Tokens: {timings.prompt_n + timings.predicted_n} this msg,{' '}
+            {conversationTotal} total
           </div>
-          <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
-            <b>Prompt</b>
-            <br />- Tokens: {timings.prompt_n}
-            <br />- Time: {timings.prompt_ms} ms
-            <br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
-            <br />
-            <b>Generation</b>
-            <br />- Tokens: {timings.predicted_n}
-            <br />- Time: {timings.predicted_ms} ms
+          <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4 h-80 overflow-y-auto">
+            <h3>Chat Stats:</h3>
+            <b>This Response</b>
+            <br />- Generated: {timings.predicted_n} tokens
+            <br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+            <br />
+            <b>Total Conversation</b>
+            <br />- Context used:{' '}
+            {timings.prompt_n + timings.predicted_n} tokens
+            <br />- Prompt history: {timings.prompt_n} tokens
+            <br />- This response: {timings.predicted_n} tokens
+            {contextSize && (
+              <>
+                <br />- Context limit: {contextSize} tokens
+                <br />- Remaining:{' '}
+                {contextSize -
+                  timings.prompt_n -
+                  timings.predicted_n}{' '}
+                tokens
+                <br />- Usage:{' '}
+                {Math.round(
+                  ((timings.prompt_n + timings.predicted_n) /
+                    contextSize) *
+                    100
+                )}
+                %
+              </>
+            )}
           </div>
         </div>
       )}
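
For reference, the dropdown's context math reduces to a few lines. A standalone sketch with hypothetical token counts (the real inputs come from the server's timings object and the n_ctx read above):

// Standalone sketch of the dropdown's context arithmetic; inputs are hypothetical.
function contextUsage(promptN: number, predictedN: number, contextSize: number) {
  const used = promptN + predictedN; // context consumed so far
  return {
    used,
    remaining: contextSize - used,
    percent: Math.round((used / contextSize) * 100),
  };
}

console.log(contextUsage(1200, 300, 8192));
// -> { used: 1500, remaining: 6692, percent: 18 }
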
18 changes: 16 additions & 2 deletions tools/server/webui/src/components/Header.tsx
@@ -12,7 +12,16 @@ import {
 
 export default function Header() {
   const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
-  const { setShowSettings } = useAppContext();
+  const { setShowSettings, serverProps } = useAppContext();
+
+  const fullFile = serverProps?.model_path?.split(/[/\\]/).pop() ?? '';
+  const build = serverProps?.build_info ?? '?';
+
+  // Extract model name from model_path and remove the sharding suffix if present
+  const modelName = serverProps?.model_path
+    ?.split(/(\\|\/)/)
+    .pop()
+    ?.replace(/-\d{5}-of-\d{5}(?=\.gguf$)/, '');
 
   const setTheme = (theme: string) => {
     StorageUtils.setTheme(theme);
@@ -34,7 +43,12 @@ export default function Header() {
           <Bars3Icon className="h-5 w-5" />
         </label>
 
-        <div className="grow text-2xl font-bold ml-2">llama.cpp</div>
+        <div
+          className="grow text-xl font-bold ml-2 truncate"
+          title={`${fullFile}\nllama.cpp build ${build}`}
+        >
+          llama server.cpp: {modelName}
+        </div>
 
         {/* action buttons (top right) */}
         <div className="flex items-center">
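
The sharding regex can be sanity-checked in isolation. A sketch applying the Header's derivation to the shard file name used in KK2-0905.sh above:

// Sketch of the Header's model-name derivation, applied to the shard
// file name from the launcher script above.
const modelPath =
  'models/UD-Q3_K_XL/Kimi-K2-Instruct-0905-UD-Q3_K_XL-00001-of-00010.gguf';

const file = modelPath.split(/(\\|\/)/).pop(); // last path segment
const name = file?.replace(/-\d{5}-of-\d{5}(?=\.gguf$)/, '');
console.log(name); // "Kimi-K2-Instruct-0905-UD-Q3_K_XL.gguf"
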
20 changes: 20 additions & 0 deletions tools/server/webui/src/utils/app.context.tsx
@@ -52,6 +52,10 @@ interface AppContextValue {
 
   // props
   serverProps: LlamaCppServerProps | null;
+
+  // Token tracking
+  getConversationTokenTotal: (convId: string) => number;
+  addTokensToConversation: (convId: string, tokens: number) => void;
 }
 
 // this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -93,6 +97,9 @@ export const AppContextProvider = ({
   const [config, setConfig] = useState(StorageUtils.getConfig());
   const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
   const [showSettings, setShowSettings] = useState(false);
+  const [conversationTokenTotals, setConversationTokenTotals] = useState<
+    Record<string, number>
+  >({});
 
   // get server props
   useEffect(() => {
@@ -386,6 +393,17 @@ export const AppContextProvider = ({
     setConfig(config);
   };
 
+  const getConversationTokenTotal = (convId: string): number => {
+    return conversationTokenTotals[convId] || 0;
+  };
+
+  const addTokensToConversation = (convId: string, tokens: number) => {
+    setConversationTokenTotals((prev) => ({
+      ...prev,
+      [convId]: (prev[convId] || 0) + tokens,
+    }));
+  };
+
   return (
     <AppContext.Provider
       value={{
@@ -402,6 +420,8 @@ export const AppContextProvider = ({
         showSettings,
         setShowSettings,
         serverProps,
+        getConversationTokenTotal,
+        addTokensToConversation,
       }}
     >
       {children}
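
The accumulation behind addTokensToConversation is a plain functional update over a Record, so concurrent additions never read stale state. A standalone sketch of that logic with hypothetical conversation IDs and token counts:

// Standalone sketch of the token-total accumulation; IDs and counts are hypothetical.
type Totals = Record<string, number>;

function addTokens(prev: Totals, convId: string, tokens: number): Totals {
  // Same shape as the setState updater above: copy, then bump one key.
  return { ...prev, [convId]: (prev[convId] || 0) + tokens };
}

let totals: Totals = {};
totals = addTokens(totals, 'conv-1', 1500); // first assistant reply
totals = addTokens(totals, 'conv-1', 420); // second reply
console.log(totals['conv-1']); // 1920
console.log(totals['conv-2'] || 0); // 0 - unknown conversations default to 0
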