15 changes: 15 additions & 0 deletions KK2-0905.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Shell script to run KK2-0905-UD-Q3_K_XL without having to type the monster CLI.
+# Assumes we have cd'd into ~/Development/llama.cpp.
+
+MODEL_PATH="/Users/edsilmacstudio/Development/llama.cpp/models/Kimi-K2-Instruct-0905-UD-Q3_K_XL/models--unsloth--Kimi-K2-Instruct-0905-GGUF/snapshots/ca516d05c7621c0615db3fc7efa63c9617547363/UD-Q3_K_XL/Kimi-K2-Instruct-0905-UD-Q3_K_XL-00001-of-00010.gguf"
+
+./build/bin/llama-server \
+  -m "$MODEL_PATH" \
+  -c 225176 \
+  -ngl 99 \
+  --parallel 4 \
+  --no-warmup \
+  --host 0.0.0.0 \
+  --port 3000 \
+  --log-file "KK2_0905_log_4"
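
The webui changes below display metadata that the server launched by this script reports about itself. As a minimal sketch of that payload (assuming the server above is reachable on localhost:3000 and that serverProps is populated from llama-server's /props endpoint, and assuming a runtime with global fetch, i.e. a browser or Node 18+), the field names match the ones the components below consume:

// Sketch: inspect the /props payload that the webui's serverProps reflects.
// Assumes the server started by KK2-0905.sh is listening on localhost:3000.
async function showServerProps(): Promise<void> {
  const res = await fetch('http://localhost:3000/props');
  const props = await res.json();
  console.log('model_path:', props.model_path);
  console.log('build_info:', props.build_info);
  console.log('n_ctx:', props.default_generation_settings?.n_ctx);
}

showServerProps().catch(console.error);
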
605 changes: 605 additions & 0 deletions tools/server/public/index.html

Large diffs are not rendered by default.

Binary file modified tools/server/public/index.html.gz
Binary file not shown.
730 changes: 426 additions & 304 deletions tools/server/webui/package-lock.json

Large diffs are not rendered by default.

77 changes: 65 additions & 12 deletions tools/server/webui/src/components/ChatMessage.tsx
@@ -1,4 +1,4 @@
-import { useMemo, useState } from 'react';
+import { useMemo, useEffect, useState } from 'react';
 import { useAppContext } from '../utils/app.context';
 import { Message, PendingMessage } from '../utils/types';
 import { classNames } from '../utils/misc';
@@ -37,7 +37,22 @@ export default function ChatMessage({
   onChangeSibling(sibling: Message['id']): void;
   isPending?: boolean;
 }) {
-  const { viewingChat, config } = useAppContext();
+  const { viewingChat, config, serverProps } = useAppContext();
+
+  if (serverProps) {
+    // Add debugging:
+    console.log('ChatMessage - serverProps: ', serverProps);
+    console.log('ChatMessage - serverProps type: ', typeof serverProps);
+    console.log(
+      'ChatMessage - serverProps keys: ',
+      serverProps ? Object.keys(serverProps) : 'null'
+    );
+    console.log('ChatMessage - n_ctx direct: ', serverProps?.n_ctx);
+    console.log(
+      'ChatMessage - currently loaded model: ',
+      serverProps?.model_path
+    );
+  }
   const [editingContent, setEditingContent] = useState<string | null>(null);
   const timings = useMemo(
     () =>
@@ -55,6 +70,22 @@ export default function ChatMessage({
   const nextSibling = siblingLeafNodeIds[siblingCurrIdx + 1];
   const prevSibling = siblingLeafNodeIds[siblingCurrIdx - 1];
 
+  const { getConversationTokenTotal, addTokensToConversation } =
+    useAppContext();
+  const [hasAddedTokens, setHasAddedTokens] = useState(false);
+
+  // Get current conversation token total
+  const conversationTotal = getConversationTokenTotal(msg.convId);
+
+  // Add tokens to running total when timings are available
+  useEffect(() => {
+    if (timings && !hasAddedTokens && msg.role === 'assistant') {
+      const messageTokens = timings.prompt_n + timings.predicted_n;
+      addTokensToConversation(msg.convId, messageTokens);
+      setHasAddedTokens(true);
+    }
+  }, [timings, hasAddedTokens, msg.convId, msg.role, addTokensToConversation]);
+
   // for reasoning model, we split the message into content and thought
   // TODO: implement this as remark/rehype plugin in the future
   const { content, thought, isThinking }: SplitMessage = useMemo(() => {
@@ -87,6 +118,9 @@ export default function ChatMessage({
 
   const isUser = msg.role === 'user';
 
+  // @ts-expect-error/ban-ts-comment
+  const contextSize = serverProps?.['default_generation_settings']?.['n_ctx'];
+
   return (
     <div
       className="group"
@@ -175,19 +209,38 @@ export default function ChatMessage({
             role="button"
             className="cursor-pointer font-semibold text-sm opacity-60"
           >
-            Speed: {timings.predicted_per_second.toFixed(1)} t/s
+            Speed test: {timings.predicted_per_second.toFixed(1)} t/s |
+            Tokens: {timings.prompt_n + timings.predicted_n} this msg,{' '}
+            {conversationTotal} total
           </div>
-          <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4">
-            <b>Prompt</b>
-            <br />- Tokens: {timings.prompt_n}
-            <br />- Time: {timings.prompt_ms} ms
-            <br />- Speed: {timings.prompt_per_second.toFixed(1)} t/s
-            <br />
-            <b>Generation</b>
-            <br />- Tokens: {timings.predicted_n}
-            <br />- Time: {timings.predicted_ms} ms
+          <div className="dropdown-content bg-base-100 z-10 w-64 p-2 shadow mt-4 h-80 overflow-y-auto">
+            <h3>Chat Stats:</h3>
+            <b>This Response</b>
+            <br />- Generated: {timings.predicted_n} tokens
+            <br />- Speed: {timings.predicted_per_second.toFixed(1)} t/s
+            <br />
+            <b>Total Conversation</b>
+            <br />- Context used:{' '}
+            {timings.prompt_n + timings.predicted_n} tokens
+            <br />- Prompt history: {timings.prompt_n} tokens
+            <br />- This response: {timings.predicted_n} tokens
+            {contextSize && (
+              <>
+                <br />- Context limit: {contextSize} tokens
+                <br />- Remaining:{' '}
+                {contextSize -
+                  timings.prompt_n -
+                  timings.predicted_n}{' '}
+                tokens
+                <br />- Usage:{' '}
+                {Math.round(
+                  ((timings.prompt_n + timings.predicted_n) /
+                    contextSize) *
+                    100
+                )}
+                %
+              </>
+            )}
           </div>
         </div>
       )}
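
For reference, the dropdown's context math reduces to a few lines. A standalone sketch with hypothetical token counts (the real inputs come from the server's timings object and the n_ctx read above):

// Standalone sketch of the dropdown's context arithmetic; inputs are hypothetical.
function contextUsage(promptN: number, predictedN: number, contextSize: number) {
  const used = promptN + predictedN; // context consumed so far
  return {
    used,
    remaining: contextSize - used,
    percent: Math.round((used / contextSize) * 100),
  };
}

console.log(contextUsage(1200, 300, 8192));
// -> { used: 1500, remaining: 6692, percent: 18 }
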
18 changes: 16 additions & 2 deletions tools/server/webui/src/components/Header.tsx
@@ -12,7 +12,16 @@ import {
 
 export default function Header() {
   const [selectedTheme, setSelectedTheme] = useState(StorageUtils.getTheme());
-  const { setShowSettings } = useAppContext();
+  const { setShowSettings, serverProps } = useAppContext();
+
+  const fullFile = serverProps?.model_path?.split(/[/\\]/).pop() ?? '';
+  const build = serverProps?.build_info ?? '?';
+
+  // Extract model name from model_path and remove the sharding suffix if present
+  const modelName = serverProps?.model_path
+    ?.split(/(\\|\/)/)
+    .pop()
+    ?.replace(/-\d{5}-of-\d{5}(?=\.gguf$)/, '');
 
   const setTheme = (theme: string) => {
     StorageUtils.setTheme(theme);
@@ -34,7 +43,12 @@ export default function Header() {
           <Bars3Icon className="h-5 w-5" />
         </label>
 
-        <div className="grow text-2xl font-bold ml-2">llama.cpp</div>
+        <div
+          className="grow text-xl font-bold ml-2 truncate"
+          title={`${fullFile}\nllama.cpp build ${build}`}
+        >
+          llama server.cpp: {modelName}
+        </div>
 
         {/* action buttons (top right) */}
         <div className="flex items-center">
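
The sharding regex can be sanity-checked in isolation. A sketch applying the Header's derivation to the shard file name used in KK2-0905.sh above:

// Sketch of the Header's model-name derivation, applied to the shard
// file name from the launcher script above.
const modelPath =
  'models/UD-Q3_K_XL/Kimi-K2-Instruct-0905-UD-Q3_K_XL-00001-of-00010.gguf';

const file = modelPath.split(/(\\|\/)/).pop(); // last path segment
const name = file?.replace(/-\d{5}-of-\d{5}(?=\.gguf$)/, '');
console.log(name); // "Kimi-K2-Instruct-0905-UD-Q3_K_XL.gguf"
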
20 changes: 20 additions & 0 deletions tools/server/webui/src/utils/app.context.tsx
@@ -52,6 +52,10 @@ interface AppContextValue {
 
   // props
   serverProps: LlamaCppServerProps | null;
+
+  // Token tracking
+  getConversationTokenTotal: (convId: string) => number;
+  addTokensToConversation: (convId: string, tokens: number) => void;
 }
 
 // this callback is used for scrolling to the bottom of the chat and switching to the last node
@@ -93,6 +97,9 @@ export const AppContextProvider = ({
   const [config, setConfig] = useState(StorageUtils.getConfig());
   const [canvasData, setCanvasData] = useState<CanvasData | null>(null);
   const [showSettings, setShowSettings] = useState(false);
+  const [conversationTokenTotals, setConversationTokenTotals] = useState<
+    Record<string, number>
+  >({});
 
   // get server props
   useEffect(() => {
@@ -386,6 +393,17 @@ export const AppContextProvider = ({
     setConfig(config);
   };
 
+  const getConversationTokenTotal = (convId: string): number => {
+    return conversationTokenTotals[convId] || 0;
+  };
+
+  const addTokensToConversation = (convId: string, tokens: number) => {
+    setConversationTokenTotals((prev) => ({
+      ...prev,
+      [convId]: (prev[convId] || 0) + tokens,
+    }));
+  };
+
   return (
     <AppContext.Provider
       value={{
@@ -402,6 +420,8 @@ export const AppContextProvider = ({
         showSettings,
         setShowSettings,
         serverProps,
+        getConversationTokenTotal,
+        addTokensToConversation,
       }}
     >
       {children}
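
The accumulation behind addTokensToConversation is a plain functional update over a Record, so concurrent additions never read stale state. A standalone sketch of that logic with hypothetical conversation IDs and token counts:

// Standalone sketch of the token-total accumulation; IDs and counts are hypothetical.
type Totals = Record<string, number>;

function addTokens(prev: Totals, convId: string, tokens: number): Totals {
  // Same shape as the setState updater above: copy, then bump one key.
  return { ...prev, [convId]: (prev[convId] || 0) + tokens };
}

let totals: Totals = {};
totals = addTokens(totals, 'conv-1', 1500); // first assistant reply
totals = addTokens(totals, 'conv-1', 420); // second reply
console.log(totals['conv-1']); // 1920
console.log(totals['conv-2'] || 0); // 0 - unknown conversations default to 0
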