diff --git a/.changeset/dsl-aware-mdma-il-prompt.md b/.changeset/dsl-aware-mdma-il-prompt.md new file mode 100644 index 0000000..e4aa0d9 --- /dev/null +++ b/.changeset/dsl-aware-mdma-il-prompt.md @@ -0,0 +1,10 @@ +--- +"@mobile-reality/mdma-prompt-pack": minor +--- + +Make the `mobile-reality/mdma-il` author variant DSL-aware. The MDMA-IL model +reads an MDMA-IL DSL intent, so its system prompt must describe the DSL grammar; +the previous variant had none. `getAuthorPromptVariant('mobile-reality/mdma-il')` +now returns the full authoring prompt — DSL input grammar, authoring rules, and +worked form/table/chart examples — as the single source of truth (previously +duplicated in the eval harness). The registry label/description are unchanged. diff --git a/README.md b/README.md index eddd62a..a0d8b09 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ 📖 Docs   ·   💬 Discord +   ·   + 🤗 Model

@@ -63,6 +65,17 @@ onAction: submit ```` +## Speed comparison + +Same scenario, two models. GPT-5.5 and our own hosted MDMA-IL model. + +Our model is available on Hugging Face: [MobileReality/mdma-gemma4-26b-dsl-unsloth-v1](https://huggingface.co/MobileReality/mdma-gemma4-26b-dsl-unsloth-v1) + +| GPT-5.5 | Our own hosted model | +| :---: | :---: | +| | | + + ## MDMA_AUTHOR prompt matrix Each cell shows the pass rate of the model-specialized MDMA_AUTHOR prompt variant on the listed eval suite. diff --git a/assets/gpt-5.5.gif b/assets/gpt-5.5.gif new file mode 100644 index 0000000..638ec64 Binary files /dev/null and b/assets/gpt-5.5.gif differ diff --git a/assets/own-model.gif b/assets/own-model.gif new file mode 100644 index 0000000..06aaec6 Binary files /dev/null and b/assets/own-model.gif differ diff --git a/demo/src/AgentChatView.tsx b/demo/src/AgentChatView.tsx index 658bb0f..c579fcb 100644 --- a/demo/src/AgentChatView.tsx +++ b/demo/src/AgentChatView.tsx @@ -1,10 +1,43 @@ -import { useRef, useEffect, useCallback } from 'react'; +import { useRef, useEffect, useCallback, useState } from 'react'; import { useAgent } from './agent/use-agent.js'; import { useAgentActionLog } from './agent/use-agent-action-log.js'; import { AgentMessage } from './agent/AgentMessage.js'; import { AgentSettings } from './agent/AgentSettings.js'; import { ChatActionLog } from './chat/ChatActionLog.js'; import { ChatInput } from './chat/ChatInput.js'; +import type { AssistantTurn, AgentDisplayTurn } from './agent/types.js'; + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +// Scripted conversation for the auto-play demo (each entry is one user message). 
+const DEMO_SCRIPT = [ + 'hi', + 'generate sample form', + 'sample chart', + 'and table', + 'whats Product Name', + 'ok could make chart from this table', + 'line pls', +]; + +/** + * Serialize the conversation to a raw transcript: the user's messages and the + * agent's PURE responses (conversational text + the generate_mdma document), + * with only `You:` / `Agent:` to mark who spoke β€” no other added labels. + */ +function buildRawTranscript(turns: AgentDisplayTurn[]): string { + return turns + .map((turn) => { + if (turn.role === 'user') return turn.hidden ? '' : `You:\n${turn.content}`; + const body = (turn as AssistantTurn).blocks + .map((b) => (b.type === 'tool_use' ? b.document : b.content)) + .filter(Boolean) + .join('\n\n'); + return `Agent:\n${body}`; + }) + .filter(Boolean) + .join('\n\n'); +} export function AgentChatView() { const { @@ -16,6 +49,7 @@ export function AgentChatView() { config, updateConfig, send, + sendText, stop, clear, inputRef, @@ -23,6 +57,54 @@ export function AgentChatView() { const { events, isOpen: logOpen, setIsOpen: setLogOpen, clearEvents } = useAgentActionLog(turns); + const [copiedRaw, setCopiedRaw] = useState(false); + const handleCopyRaw = useCallback(async () => { + try { + await navigator.clipboard.writeText(buildRawTranscript(turns)); + setCopiedRaw(true); + setTimeout(() => setCopiedRaw(false), 1500); + } catch { + /* clipboard unavailable */ + } + }, [turns]); + + // ── Auto-play demo ────────────────────────────────────────────────────────── + // Replays a scripted conversation through the real agent: types each message, + // sends it, waits for the full response, then the next. For demo recordings. 
+ const [isPlaying, setIsPlaying] = useState(false); + const playingRef = useRef(false); + const handlePlayDemo = useCallback(async () => { + if (playingRef.current) { + // already running β†’ stop + playingRef.current = false; + setIsPlaying(false); + return; + } + playingRef.current = true; + setIsPlaying(true); + clear(); + clearEvents(); + await sleep(500); + + for (const msg of DEMO_SCRIPT) { + if (!playingRef.current) break; + // Typewriter the message into the input for a natural look. + for (let k = 1; k <= msg.length; k++) { + if (!playingRef.current) break; + setInput(msg.slice(0, k)); + await sleep(28); + } + await sleep(350); + if (!playingRef.current) break; + setInput(''); + await sendText(msg); // renders the user bubble + awaits the agent's reply + await sleep(1100); // beat between turns + } + + playingRef.current = false; + setIsPlaying(false); + }, [clear, clearEvents, sendText, setInput]); + const chatEndRef = useRef(null); const prevCountRef = useRef(turns.length); @@ -72,6 +154,10 @@ export function AgentChatView() { isGenerating={isGenerating} hasMessages={turns.length > 0} inputRef={inputRef} + onCopyRaw={handleCopyRaw} + copiedRaw={copiedRaw} + onPlayDemo={handlePlayDemo} + isPlaying={isPlaying} /> diff --git a/demo/src/PreviewView.tsx b/demo/src/PreviewView.tsx index 496294c..615a9cb 100644 --- a/demo/src/PreviewView.tsx +++ b/demo/src/PreviewView.tsx @@ -1,13 +1,14 @@ -import { useRef, useEffect, useCallback, useState } from 'react'; -import { useAgent } from './agent/use-agent.js'; +import { useCallback, useEffect, useRef, useState } from 'react'; import { AgentMessage } from './agent/AgentMessage.js'; import { AgentSettings } from './agent/AgentSettings.js'; +import { useAgent } from './agent/use-agent.js'; import { ChatInput } from './chat/ChatInput.js'; import { BackendLogDrawer } from './preview/BackendLogPane.js'; import { PreviewPanel } from './preview/PreviewPanel.js'; import { clearSubmissionLog } from 
'./preview/insurance-backend.js'; import { INSURANCE_FLOW_PROMPT } from './preview/insurance-flow-prompt.js'; import { useInsuranceFlow } from './preview/use-insurance-flow.js'; +import { usePreviewAutoplay } from './preview/use-preview-autoplay.js'; import { usePreviewValidation } from './preview/use-preview-validation.js'; function countToolUseBlocks(turns: ReturnType['turns']): number { @@ -29,6 +30,7 @@ export function PreviewView() { config, updateConfig, send, + sendText, sendHidden, stop, clear, @@ -74,6 +76,16 @@ export function PreviewView() { insuranceFlow.reset(); }, [clear, insuranceFlow]); + // Auto-play the full claim flow hands-free (kickoff message β†’ fill & submit + // each step form). Mirrors the scripted demo in the Agent Chat view. + const { isPlaying, play } = usePreviewAutoplay({ + previewState, + isGenerating, + sendText, + setInput, + reset: handleClear, + }); + return (
@@ -115,6 +127,8 @@ export function PreviewView() { isGenerating={isGenerating} hasMessages={turns.length > 0} inputRef={inputRef} + onPlayDemo={play} + isPlaying={isPlaying} />
diff --git a/demo/src/agent/AgentMessage.tsx b/demo/src/agent/AgentMessage.tsx index 422f8d9..7432ef7 100644 --- a/demo/src/agent/AgentMessage.tsx +++ b/demo/src/agent/AgentMessage.tsx @@ -1,8 +1,9 @@ -import { memo } from 'react'; +import { memo, useState } from 'react'; import type { ReactNode } from 'react'; import { MdmaDocument } from '@mobile-reality/mdma-renderer-react'; import { customizations } from '../custom-components.js'; import type { + AgentBlock, AgentDisplayTurn, AssistantTurn, ThinkingBlock, @@ -10,6 +11,15 @@ import type { ToolUseBlock, } from './types.js'; +// Raw dump of an assistant turn β€” the pure model output (conversational text + +// the generate_mdma document), with no added labels, for debugging messaging. +function buildRawDump(blocks: AgentBlock[]): string { + return blocks + .map((b) => (b.type === 'tool_use' ? b.document : b.content)) + .filter(Boolean) + .join('\n\n'); +} + // ── Inline markdown renderer ────────────────────────────────────────────────── function parseInline(text: string): ReactNode { @@ -270,6 +280,8 @@ export const AgentMessage = memo(function AgentMessage({ activeToolUseId, onSelectToolUse, }: AgentMessageProps) { + const [showRaw, setShowRaw] = useState(false); + if (turn.role === 'user') { if (turn.hidden) return null; return ( @@ -286,32 +298,53 @@ export const AgentMessage = memo(function AgentMessage({ const { blocks } = turn as AssistantTurn; + const hasContent = blocks.some( + (b) => (b.type === 'text' || b.type === 'thinking' ? b.content : (b as ToolUseBlock).document), + ); + return (
Agent -
-
- {blocks.length === 0 ? ( - Starting… - ) : ( - blocks.map((block) => { - if (block.type === 'thinking') - return ; - if (block.type === 'text') return ; - if (block.type === 'tool_use') - return ( - onSelectToolUse(block.id) : undefined} - /> - ); - }) + {hasContent && ( + )}
+ {showRaw ? ( +
+
{buildRawDump(blocks)}
+
+ ) : ( +
+ {blocks.length === 0 ? ( + Starting… + ) : ( + blocks.map((block) => { + if (block.type === 'thinking') + return ; + if (block.type === 'text') return ; + if (block.type === 'tool_use') + return ( + onSelectToolUse(block.id) : undefined} + /> + ); + }) + )} +
+ )}
); }); diff --git a/demo/src/agent/AgentSettings.tsx b/demo/src/agent/AgentSettings.tsx index bc1cb8f..06ff712 100644 --- a/demo/src/agent/AgentSettings.tsx +++ b/demo/src/agent/AgentSettings.tsx @@ -3,6 +3,8 @@ import { AUTHOR_PROMPT_VARIANTS } from '@mobile-reality/mdma-prompt-pack'; import { getDefaultPromptVariantForModel } from '../model-prompt-variant.js'; import type { AnthropicConfig } from './anthropic-client.js'; +const OWN_MODEL_URL_PLACEHOLDER = 'https://your-deployment.example.com/v1'; + const PROVIDER_MODELS: Record> = { anthropic: [ { value: 'claude-opus-4-7', label: 'claude-opus-4.7' }, @@ -38,12 +40,21 @@ const PROVIDER_MODELS: Record> = { value: 'x-ai/grok-4.20', label: 'x-ai/grok-4.20' }, { value: 'x-ai/grok-4.3', label: 'x-ai/grok-4.3' }, ], + 'own-model': [{ value: 'mdma-26b', label: 'mdma-26b (our MDMA model)' }], +}; + +const PROVIDER_LABELS: Record = { + anthropic: 'anthropic', + openai: 'openai', + openrouter: 'openrouter', + 'own-model': 'own MDMA model', }; const DEFAULT_MODELS: Record = { anthropic: 'claude-sonnet-4-6', openai: 'gpt-5.5', openrouter: 'google/gemini-3.1-pro-preview', + 'own-model': 'mdma-26b', }; const API_KEY_LABELS: Record = { @@ -80,7 +91,9 @@ export const AgentSettings = memo(function AgentSettings({ config, onUpdate }: A const provider = config.provider ?? 'anthropic'; const models = PROVIDER_MODELS[provider] ?? []; - const missingKey = !getApiKey(config, provider); + // Our own endpoint has auth off, so it never needs a key. + const needsKey = provider !== 'own-model'; + const missingKey = needsKey && !getApiKey(config, provider); function switchProvider(next: NonNullable) { if (next === provider) return; @@ -115,21 +128,35 @@ export const AgentSettings = memo(function AgentSettings({ config, onUpdate }: A className={`ai-preset-btn ${provider === p ? 'ai-preset-btn--active' : ''}`} onClick={() => switchProvider(p)} > - {p} + {PROVIDER_LABELS[p] ?? p} ), )}
- + {needsKey ? ( + + ) : ( + + )}
Model onUpdate({ systemPromptId: e.target.value })} - title={ - AUTHOR_PROMPT_VARIANTS.find( - (v) => v.id === (config.systemPromptId ?? AUTHOR_PROMPT_VARIANTS[0].id), - )?.description - } - > - {AUTHOR_PROMPT_VARIANTS.map((v) => ( - - ))} - - + {provider === 'own-model' ? ( + // The own MDMA model uses its own fixed Gemma-aligned prompt, so + // the variant is locked. + + ) : ( + + )} +
+ {provider === 'own-model' && ( +

+ The entire agent runs on your self-hosted MDMA model endpoint + (OpenAI-compatible, tool-calling enabled) β€” no third-party model is called. Enter the + deployed model URL above; leave it blank to use the default. The{' '} + /v1 suffix is added automatically. +

+ )}

πŸ”’ Your API key is stored in your browser's localStorage only. It is never sent to any server other than the AI provider you select. diff --git a/demo/src/agent/anthropic-client.ts b/demo/src/agent/anthropic-client.ts index c5bcc39..e8ec1cf 100644 --- a/demo/src/agent/anthropic-client.ts +++ b/demo/src/agent/anthropic-client.ts @@ -6,19 +6,38 @@ */ export interface AnthropicConfig { - provider?: 'anthropic' | 'openai' | 'openrouter'; + /** + * 'own-model' routes the entire agent loop to our self-hosted mdma-26b + * endpoint (OpenAI-compatible, tool-calling enabled) β€” no third-party model. + */ + provider?: 'anthropic' | 'openai' | 'openrouter' | 'own-model'; /** Anthropic API key */ apiKey: string; /** OpenAI API key β€” stored separately so switching providers doesn't clear it */ openaiApiKey?: string; /** OpenRouter API key */ openrouterApiKey?: string; + /** + * Base URL of the self-hosted MDMA model endpoint (OpenAI-compatible). + * Only used when provider is 'own-model'; falls back to + * OWN_MODEL_DEFAULT_BASE_URL when empty. + */ + ownModelBaseUrl?: string; model: string; /** Token budget for extended thinking. Only used when provider is 'anthropic'. */ thinkingBudget?: number; systemPromptId?: string; } +/** + * Default endpoint for the self-hosted MDMA model, used when the user hasn't + * entered their own URL in Agent Settings. Overridable at build time via + * VITE_OWN_MODEL_BASE_URL. + */ +export const OWN_MODEL_DEFAULT_BASE_URL = + import.meta.env.VITE_OWN_MODEL_BASE_URL ?? 
+ 'https://REDACTED.modal.run/v1'; + export interface ToolDefinition { name: string; description: string; diff --git a/demo/src/agent/openai-agent-client.ts b/demo/src/agent/openai-agent-client.ts index 6469449..5dbf5c9 100644 --- a/demo/src/agent/openai-agent-client.ts +++ b/demo/src/agent/openai-agent-client.ts @@ -25,10 +25,110 @@ export interface OpenAIToolCall { export type OpenAIMessage = OpenAIUserMessage | OpenAIAssistantMessage | OpenAIToolMessage; -// text block always lives at index 0; tool calls at 1, 2, … +// reasoning block lives at -1 (rendered first), text at 0, tool calls at 1, 2, … +const REASONING_IDX = -1; const TEXT_IDX = 0; const TOOL_IDX_OFFSET = 1; +// Safety limits so a stalled or runaway stream can never hang the UI. Our +// self-hosted endpoint can emit an unbounded `delta.reasoning` channel (the +// model's chain-of-thought); without these a non-terminating stream leaves the +// agent loop awaiting forever and `isGenerating` stuck true. +const IDLE_TIMEOUT_MS = 60_000; // no chunk for this long β†’ assume the stream died +const MAX_STREAM_MS = 240_000; // hard wall-clock ceiling for one response +const MAX_STREAM_BYTES = 4_000_000; // ~4 MB of SSE text β†’ runaway guard + +// Loop detector. Gemma 4's known repetition collapse (see +// evals/own-model/repetition-loops.md) degrades a thinking block into a short +// token/phrase flooding the budget. `min_p` + `repetition_penalty` cut most of +// it, but the collapse is an unfixed model trait, so we keep a cheap safety +// net: over a sliding window of recent words, a healthy stream is lexically +// diverse; a degenerate loop (one token, or a cycle like +// `(END) (DONE) (STOP) (FINAL) …`) collapses unique/total. Below the floor we +// abort rather than let it eat the whole generation. +// +// We run this on BOTH channels. 
// The collapse usually lives in `reasoning`, but
// it can also leak onto `content`: the model emits a valid document, then keeps
// going with a raw "Thinking Process:" ramble after the reasoning span has
// already closed. A legit MDMA document is well above the diversity floor, so
// guarding content does not false-positive on real output.
const LOOP_WINDOW_WORDS = 160; // sliding window of recent words
const LOOP_MIN_WORDS = 120; // don't judge until we have enough signal
const LOOP_UNIQUE_RATIO = 0.15; // unique/total below this → degenerate loop

/**
 * Tracks recent words on one channel and flags a degenerate repetition loop.
 *
 * A "word" is any whitespace-delimited run inside a single delta. The detector
 * keeps the most recent `windowWords` words and reports a loop once at least
 * `minWords` have been seen and the share of distinct words in the window
 * drops below `uniqueRatio`.
 *
 * All thresholds are optional and default to the module-level `LOOP_*`
 * constants, so `new RepetitionLoopDetector()` behaves exactly as before while
 * individual channels can be tuned independently.
 */
class RepetitionLoopDetector {
  /** Recent words, oldest first; never longer than `windowWords` after a push. */
  private readonly words: string[] = [];
  /** Occurrence count per word currently in the window (size === distinct words). */
  private readonly counts = new Map<string, number>();
  private readonly windowWords: number;
  private readonly minWords: number;
  private readonly uniqueRatio: number;

  /**
   * @param options.windowWords Maximum number of recent words to keep.
   * @param options.minWords Minimum words in the window before judging. A value
   *   larger than `windowWords` means the detector never fires.
   * @param options.uniqueRatio Distinct/total ratio below which a loop is flagged.
   */
  constructor(
    options: { windowWords?: number; minWords?: number; uniqueRatio?: number } = {},
  ) {
    this.windowWords = options.windowWords ?? LOOP_WINDOW_WORDS;
    this.minWords = options.minWords ?? LOOP_MIN_WORDS;
    this.uniqueRatio = options.uniqueRatio ?? LOOP_UNIQUE_RATIO;
  }

  /** Feed a delta; returns true once the window collapses into a loop. */
  push(text: string): boolean {
    for (const word of text.split(/\s+/)) {
      if (!word) continue; // leading/trailing whitespace yields empty pieces
      this.words.push(word);
      this.counts.set(word, (this.counts.get(word) ?? 0) + 1);
      if (this.words.length > this.windowWords) this.evictOldest();
    }
    if (this.words.length < this.minWords) return false;
    // counts.size is the number of distinct words in the window, maintained
    // incrementally instead of rebuilding a Set on every delta.
    return this.counts.size / this.words.length < this.uniqueRatio;
  }

  /** Drop the oldest word from the window and keep the counts in sync. */
  private evictOldest(): void {
    const oldest = this.words.shift();
    if (oldest === undefined) return;
    const remaining = (this.counts.get(oldest) ?? 1) - 1;
    if (remaining > 0) this.counts.set(oldest, remaining);
    else this.counts.delete(oldest);
  }
}

const MDMA_FENCE_OPEN = '```mdma';
const MDMA_FENCE_CLOSE = '```';

// Strips leaked ```mdma fenced documents out of the assistant's chat (content)
// channel. The real document always arrives via the generate_mdma tool call and
// renders in the preview pane; the model occasionally ALSO transcribes a copy of
// the document as raw markdown into chat (most often on the first turn). Those
// fenced blocks must never reach the chat UI, which is prose-only. Operates on
// the live stream: complete lines are classified as they arrive, and the
// trailing partial line is held back only while it could still be the start of a
// ```mdma fence — so normal prose keeps streaming smoothly.
/**
 * Streaming filter that removes ```mdma fenced blocks from chat text.
 *
 * Complete lines are classified as soon as their newline arrives. The trailing
 * partial line is emitted eagerly unless it is (or could still become) a
 * ```mdma opener, in which case it is held back until the line completes.
 */
class MdmaFenceStripper {
  private buf = ''; // text after the last emitted char (start-aligned to current line)
  private inFence = false;
  private partialEmitted = 0; // chars of the current unterminated line already emitted

  /**
   * True while an unterminated line must be withheld: it is either a proper
   * prefix of the opener ("`", "```md", …) or already starts with the full
   * opener (e.g. "```mdma id=x"). The second case mirrors the complete-line
   * check in `push`; without it an opener carrying an info string leaked into
   * chat whenever it arrived before its newline.
   */
  private static couldOpenFence(partialLine: string): boolean {
    const trimmed = partialLine.trim();
    if (trimmed.length === 0) return false;
    return MDMA_FENCE_OPEN.startsWith(trimmed) || trimmed.startsWith(MDMA_FENCE_OPEN);
  }

  /** Feed a content delta; returns only the text that is safe to show in chat. */
  push(text: string): string {
    this.buf += text;
    let out = '';
    let nl = this.buf.indexOf('\n');
    while (nl !== -1) {
      const line = this.buf.slice(0, nl + 1);
      this.buf = this.buf.slice(nl + 1);
      const trimmed = line.trim();
      if (this.inFence) {
        // Inside a fence everything is dropped; only a bare ``` closes it.
        if (trimmed === MDMA_FENCE_CLOSE) this.inFence = false;
      } else if (trimmed.startsWith(MDMA_FENCE_OPEN)) {
        this.inFence = true;
      } else {
        // Emit only the part of the line not already streamed as a partial.
        out += line.slice(this.partialEmitted);
      }
      this.partialEmitted = 0;
      nl = this.buf.indexOf('\n');
    }
    // Trailing partial line: emit eagerly unless it could still open a fence.
    if (!this.inFence && this.buf.length > this.partialEmitted) {
      if (!MdmaFenceStripper.couldOpenFence(this.buf)) {
        out += this.buf.slice(this.partialEmitted);
        this.partialEmitted = this.buf.length;
      }
    }
    return out;
  }

  /**
   * Emit any leftover at stream end. Drops a dangling, never-closed fence —
   * including an opener line that never received its newline. A held-back
   * strict prefix of the opener (e.g. "``") is emitted as ordinary prose.
   */
  flush(): string {
    const danglingOpener = this.buf.trim().startsWith(MDMA_FENCE_OPEN);
    const out = this.inFence || danglingOpener ? '' : this.buf.slice(this.partialEmitted);
    this.buf = '';
    this.partialEmitted = 0;
    this.inFence = false;
    return out;
  }
}
+ export async function* streamOpenAIAgentMessages( apiKey: string, model: string, @@ -37,6 +137,8 @@ export async function* streamOpenAIAgentMessages( tools: ToolDefinition[], signal?: AbortSignal, baseUrl = 'https://api.openai.com/v1', + /** Extra request-body fields merged in (e.g. temperature, chat_template_kwargs).
*/ + extraBody?: Record, ): AsyncGenerator { const openAITools = tools.map((t) => ({ type: 'function' as const, @@ -57,6 +159,7 @@ export async function* streamOpenAIAgentMessages( messages: [{ role: 'system', content: systemPrompt }, ...messages], tools: openAITools, tool_choice: 'auto', + ...extraBody, }), signal, }); @@ -82,12 +185,47 @@ export async function* streamOpenAIAgentMessages( let buf = ''; let finishReason = 'stop'; const startedBlocks = new Set(); + const startedAt = Date.now(); + let totalBytes = 0; + const reasoningLoopDetector = new RepetitionLoopDetector(); + const contentLoopDetector = new RepetitionLoopDetector(); + const fenceStripper = new MdmaFenceStripper(); try { while (true) { - const { done, value } = await reader.read(); + // Race the read against an idle timer so a stalled stream can't hang. + const readPromise = reader.read(); + readPromise.catch(() => {}); // swallow rejection if we cancel below + let idleTimer: ReturnType | undefined; + const idle = new Promise<'idle'>((resolve) => { + idleTimer = setTimeout(() => resolve('idle'), IDLE_TIMEOUT_MS); + }); + const result = await Promise.race([readPromise, idle]); + clearTimeout(idleTimer); + + if (result === 'idle') { + reader.cancel().catch(() => {}); + yield { + type: 'stream_error', + message: `Stream stalled β€” no data for ${IDLE_TIMEOUT_MS / 1000}s. The model may be stuck; please try again.`, + }; + return; + } + + const { done, value } = result; if (done) break; + totalBytes += value?.byteLength ?? 0; + if (totalBytes > MAX_STREAM_BYTES || Date.now() - startedAt > MAX_STREAM_MS) { + reader.cancel().catch(() => {}); + yield { + type: 'stream_error', + message: + 'Stream exceeded safety limits (likely a runaway generation) and was stopped. Please try again.', + }; + return; + } + buf += decoder.decode(value, { stream: true }); const lines = buf.split('\n'); buf = lines.pop() ?? 
''; @@ -113,12 +251,49 @@ export async function* streamOpenAIAgentMessages( if (finish) finishReason = finish; if (!delta) continue; + // Our model streams its chain-of-thought on a separate `reasoning` + // channel (OpenAI-compatible servers like vLLM expose it here). Render + // it as a collapsible thinking block instead of dropping it on the floor. + if (typeof delta.reasoning === 'string' && delta.reasoning) { + if (!startedBlocks.has(REASONING_IDX)) { + startedBlocks.add(REASONING_IDX); + yield { type: 'block_start', index: REASONING_IDX, blockType: 'thinking' }; + } + yield { type: 'thinking_delta', index: REASONING_IDX, thinking: delta.reasoning }; + + if (reasoningLoopDetector.push(delta.reasoning)) { + reader.cancel().catch(() => {}); + yield { + type: 'stream_error', + message: + 'The model got stuck repeating itself while thinking and was stopped. Please try again.', + }; + return; + } + } + if (typeof delta.content === 'string' && delta.content) { - if (!startedBlocks.has(TEXT_IDX)) { - startedBlocks.add(TEXT_IDX); - yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' }; + // Strip any leaked ```mdma document; only prose reaches the chat UI. + const visible = fenceStripper.push(delta.content); + if (visible) { + if (!startedBlocks.has(TEXT_IDX)) { + startedBlocks.add(TEXT_IDX); + yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' }; + } + yield { type: 'text_delta', index: TEXT_IDX, text: visible }; + } + + // Feed the loop detector the RAW content so a runaway fenced block + // still trips it even though we never display the fence. + if (contentLoopDetector.push(delta.content)) { + reader.cancel().catch(() => {}); + yield { + type: 'stream_error', + message: + 'The model got stuck repeating itself and was stopped. 
Please try again.', + }; + return; } - yield { type: 'text_delta', index: TEXT_IDX, text: delta.content }; } const toolCalls = delta.tool_calls as Array> | undefined; @@ -151,9 +326,26 @@ export async function* streamOpenAIAgentMessages( } } } finally { - reader.releaseLock(); + // releaseLock throws if we already cancelled the reader β€” ignore that. + try { + reader.releaseLock(); + } catch { + /* reader already released via cancel() */ + } + } + + // Emit any prose the stripper was holding back (e.g. a final line with no + // trailing newline that turned out not to be a fence). + const tail = fenceStripper.flush(); + if (tail) { + if (!startedBlocks.has(TEXT_IDX)) { + startedBlocks.add(TEXT_IDX); + yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' }; + } + yield { type: 'text_delta', index: TEXT_IDX, text: tail }; } + if (startedBlocks.has(REASONING_IDX)) yield { type: 'block_stop', index: REASONING_IDX }; if (startedBlocks.has(TEXT_IDX)) yield { type: 'block_stop', index: TEXT_IDX }; for (const tcIdx of Array.from(startedBlocks).filter((i) => i >= TOOL_IDX_OFFSET)) { yield { type: 'block_stop', index: tcIdx }; diff --git a/demo/src/agent/use-agent.ts b/demo/src/agent/use-agent.ts index cfaf730..3362a33 100644 --- a/demo/src/agent/use-agent.ts +++ b/demo/src/agent/use-agent.ts @@ -2,11 +2,16 @@ import { useState, useCallback, useRef, useEffect } from 'react'; import type { Dispatch, SetStateAction } from 'react'; import { buildSystemPrompt, + buildFixerPrompt, + buildFixerMessage, getAuthorPromptVariant, getAgentToolPromptVariant, + MDMA_IL_AGENT_SYSTEM_PROMPT, } from '@mobile-reality/mdma-prompt-pack'; +import { validate } from '@mobile-reality/mdma-validator'; import { streamAgentMessages, + OWN_MODEL_DEFAULT_BASE_URL, type AnthropicConfig, type ApiMessage, type ApiAssistantBlock, @@ -65,6 +70,40 @@ const GENERATE_MDMA_TOOL_BRIEF = { }, }; +// ── Own-model (mdma-26b) endpoint ───────────────────────────────────────────── +// Our self-hosted 
model, served OpenAI-compatible with tool-calling enabled. +// In "own-model" provider mode the WHOLE agent loop runs here (conversation + +// generate_mdma via tool_choice:auto), so no third-party model is called. +// Auth is off (placeholder key); enable_thinking must be false; temperature 1 +// for agentic/conversational use. +const OWN_MODEL_NAME = import.meta.env.VITE_OWN_MODEL_NAME ?? 'mdma-26b'; + +// The own-model endpoint is user-configurable in Agent Settings. Normalise what +// they type: trim trailing slashes and append the OpenAI-compatible `/v1` suffix +// if missing. Empty β†’ fall back to the build-time default. +function normalizeOwnModelBaseUrl(raw?: string): string { + const url = (raw ?? '').trim().replace(/\/+$/, ''); + if (!url) return OWN_MODEL_DEFAULT_BASE_URL; + return url.endsWith('/v1') ? url : `${url}/v1`; +} + +// Extra OpenAI-request body our endpoint needs (merged in by the OpenAI client). +// `max_tokens` bounds the response server-side so a runaway reasoning channel +// can't generate forever (the client also caps the stream defensively). +// +// `min_p` + `repetition_penalty` cut the degenerate reasoning repetition loop +// (word-doubling β†’ token-doubling β†’ single-token flooding) that is a known +// Gemma 4 trait β€” see evals/own-model/repetition-loops.md. `min_p` is the +// primary tail-cutter; `repetition_penalty` starts low (raise only if needed β€” +// too high hurts valid output). DRY would be ideal but vLLM doesn't support it. +const OWN_MODEL_EXTRA_BODY = { + temperature: 1, + max_tokens: 8192, + min_p: 0.02, + repetition_penalty: 1.1, + chat_template_kwargs: { enable_thinking: false }, +} as const; + // ── Config persistence ─────────────────────────────────────────────────────── const CONFIG_KEY = 'mdma-agent-config'; @@ -202,8 +241,7 @@ async function callAuthorOpenAI( brief: string, signal: AbortSignal, ): Promise { - const provider = config.provider ?? 
'openai'; - const baseUrl = OPENAI_COMPAT_BASE_URLS[provider] ?? OPENAI_COMPAT_BASE_URLS.openai!; + const baseUrl = getBaseUrlForProvider(config); const apiKey = getApiKeyForProvider(config); const response = await fetch(`${baseUrl}/chat/completions`, { method: 'POST', @@ -387,6 +425,7 @@ async function runAgentLoop( if (meta.apiBlock.type === 'tool_use') meta.apiBlock.input = { document }; } + document = await maybeFixDocument(config, document, signal); const parsed = await parseMarkdown(document).catch(() => null); const ast = parsed?.ast ?? null; const store = parsed?.store ?? null; @@ -434,17 +473,103 @@ const OPENAI_COMPAT_BASE_URLS: Partial { + const provider = config.provider ?? 'anthropic'; + if (provider === 'anthropic') return callAuthorAnthropic(config, system, user, signal); + + const isOwn = provider === 'own-model'; + const baseUrl = getBaseUrlForProvider(config); + const response = await fetch(`${baseUrl}/chat/completions`, { + method: 'POST', + headers: { 'content-type': 'application/json', authorization: `Bearer ${getApiKeyForProvider(config)}` }, + body: JSON.stringify({ + model: isOwn ? OWN_MODEL_NAME : config.model, + messages: [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ], + temperature: 0, // strict, deterministic repair + ...(isOwn ? { max_tokens: 4096, chat_template_kwargs: { enable_thinking: false } } : {}), + }), + signal, + }); + if (!response.ok) throw new Error(`Fixer call failed (${response.status})`); + const json = (await response.json()) as { choices?: Array<{ message?: { content?: string } }> }; + return json.choices?.[0]?.message?.content ?? ''; +} + +/** + * Return a valid (or best-effort repaired) MDMA document. No-ops when the input + * is already valid, so it adds zero latency on the common path. + */ +async function maybeFixDocument( + config: AnthropicConfig, + document: string, + signal: AbortSignal, +): Promise { + if (!document.trim()) return document; + // 1. Deterministic auto-fix. 
+ const r = validate(document, { exclude: ['thinking-block'], autoFix: true }); + if (r.ok) return r.output; + + // 2. LLM fixer for the remaining issues. + const unfixed = r.issues.filter( + (i) => !i.fixed && (i.severity === 'error' || i.severity === 'warning'), + ); + if (unfixed.length === 0) return r.output; + + try { + const system = `${buildSystemPrompt()}\n\n---\n\n${buildFixerPrompt('single-block')}`; + const userMessage = buildFixerMessage(document, unfixed, {}); + const fixed = await chatOnce(config, system, userMessage, signal); + if (fixed) { + // Accept only if it actually improves validity. + const after = validate(fixed, { exclude: ['thinking-block'], autoFix: true }); + if (after.summary.errors <= r.summary.errors) return after.output; + } + } catch { + /* fixer failed β€” fall back to the deterministic best-effort below */ + } + return r.output; +} + async function runOpenAIAgentLoop( config: AnthropicConfig, systemPrompt: string, @@ -456,9 +581,11 @@ async function runOpenAIAgentLoop( nextId: () => string, subAgent: AuthorSubAgent | null, ): Promise { - const baseUrl = - OPENAI_COMPAT_BASE_URLS[config.provider ?? 'openai'] ?? OPENAI_COMPAT_BASE_URLS.openai!; + const isOwnModel = config.provider === 'own-model'; + const baseUrl = getBaseUrlForProvider(config); const apiKey = getApiKeyForProvider(config); + const model = isOwnModel ? OWN_MODEL_NAME : config.model; + const extraBody = isOwnModel ? OWN_MODEL_EXTRA_BODY : undefined; const tool = subAgent ? 
GENERATE_MDMA_TOOL_BRIEF : GENERATE_MDMA_TOOL_INLINE; let continueLoop = true; @@ -471,12 +598,13 @@ async function runOpenAIAgentLoop( for await (const ev of streamOpenAIAgentMessages( apiKey, - config.model, + model, systemPrompt, history, [tool], signal, baseUrl, + extraBody, )) { if (ev.type === 'stream_error') { onError(ev.message); @@ -486,7 +614,20 @@ async function runOpenAIAgentLoop( if (ev.type === 'block_start') { const displayId = nextId(); - if (ev.blockType === 'text') { + if (ev.blockType === 'thinking') { + // Reasoning channel (delta.reasoning) β†’ collapsible thinking block. + // Not added to the OpenAI history (only text + tool_calls are). + const apiBlock: ApiAssistantBlock = { type: 'thinking', thinking: '', signature: '' }; + blockMeta.set(ev.index, { displayId, apiBlock }); + setTurns((prev) => + appendBlock(prev, assistantTurnId, { + id: displayId, + type: 'thinking', + content: '', + isStreaming: true, + } satisfies AgentBlock), + ); + } else if (ev.blockType === 'text') { const apiBlock: ApiAssistantBlock = { type: 'text', text: '' }; blockMeta.set(ev.index, { displayId, apiBlock }); setTurns((prev) => @@ -532,6 +673,15 @@ async function runOpenAIAgentLoop( } } + if (ev.type === 'thinking_delta') { + const meta = blockMeta.get(ev.index); + if (meta?.apiBlock.type === 'thinking') { + meta.apiBlock.thinking += ev.thinking; + const snap = meta.apiBlock.thinking; + setTurns((prev) => patchBlock(prev, assistantTurnId, meta.displayId, { content: snap })); + } + } + if (ev.type === 'input_json_delta') { const meta = blockMeta.get(ev.index); if (meta?.partialJson !== undefined) meta.partialJson += ev.partial_json; @@ -579,6 +729,7 @@ async function runOpenAIAgentLoop( } } + document = await maybeFixDocument(config, document, signal); const parsed = await parseMarkdown(document).catch(() => null); setTurns((prev) => patchBlock(prev, assistantTurnId, meta.displayId, { @@ -749,23 +900,34 @@ export function useAgent(options: UseAgentOptions = {}) { 
]); abortRef.current = new AbortController(); + const provider = config.provider ?? 'anthropic'; + // Our own model runs the whole turn itself (tool-calling enabled), so it + // emits the MDMA document inline via generate_mdma β€” no author sub-agent. + const useSubAgent = (options.useAuthorSubAgent ?? false) && provider !== 'own-model'; const toolPrompt = getAgentToolPromptVariant(config.systemPromptId).prompt; - // In sub-agent mode the conversation agent never writes MDMA directly, - // so its system prompt omits the author prompt and the buildSystemPrompt - // reminder (both of which would tempt the agent to inline MDMA in chat). - const systemPrompt = options.useAuthorSubAgent - ? options.flowPrompt + // Our own model gets its own Gemma-aligned agentic prompt (no + // leak β€” see prompt-pack mdma-agent/mobile-reality/mdma-il). Other + // providers: sub-agent mode uses just the tool prompt; inline mode layers + // the author prompt via buildSystemPrompt. + let systemPrompt: string; + if (provider === 'own-model') { + systemPrompt = options.flowPrompt + ? `${MDMA_IL_AGENT_SYSTEM_PROMPT}\n\n---\n\n${options.flowPrompt}` + : MDMA_IL_AGENT_SYSTEM_PROMPT; + } else if (useSubAgent) { + systemPrompt = options.flowPrompt ? `${toolPrompt}\n\n---\n\n${options.flowPrompt}` - : toolPrompt - : buildSystemPrompt({ - authorPrompt: getAuthorPromptVariant(config.systemPromptId).prompt, - customPrompt: options.flowPrompt - ? `${toolPrompt}\n\n---\n\n${options.flowPrompt}` - : toolPrompt, - }); - - const subAgent = options.useAuthorSubAgent ? makeAuthorSubAgent(config) : null; - const provider = config.provider ?? 'anthropic'; + : toolPrompt; + } else { + systemPrompt = buildSystemPrompt({ + authorPrompt: getAuthorPromptVariant(config.systemPromptId).prompt, + customPrompt: options.flowPrompt + ? `${toolPrompt}\n\n---\n\n${options.flowPrompt}` + : toolPrompt, + }); + } + + const subAgent = useSubAgent ? 
makeAuthorSubAgent(config) : null; try { if (provider === 'anthropic') { @@ -824,6 +986,15 @@ export function useAgent(options: UseAgentOptions = {}) { [runTurn], ); + // Send a specific message as a visible user turn and resolve when the agent's + // response is fully complete. Used by the auto-play demo to pace the script. + const sendText = useCallback( + async (text: string) => { + await runTurn(text, false); + }, + [runTurn], + ); + const stop = useCallback(() => { abortRef.current?.abort(); }, []); @@ -848,6 +1019,7 @@ export function useAgent(options: UseAgentOptions = {}) { updateConfig, send, sendHidden, + sendText, stop, clear, inputRef, diff --git a/demo/src/chat/ChatInput.tsx b/demo/src/chat/ChatInput.tsx index 359326a..9276f43 100644 --- a/demo/src/chat/ChatInput.tsx +++ b/demo/src/chat/ChatInput.tsx @@ -13,6 +13,14 @@ export interface ChatInputProps { disabled?: boolean; /** Placeholder text override. */ placeholder?: string; + /** Copy the whole raw conversation to the clipboard (debugging). */ + onCopyRaw?: () => void; + /** Briefly true right after a successful copy, for button feedback. */ + copiedRaw?: boolean; + /** Start/stop the scripted auto-play demo. */ + onPlayDemo?: () => void; + /** True while the auto-play demo is running. */ + isPlaying?: boolean; } export const ChatInput = memo(function ChatInput({ @@ -26,8 +34,12 @@ export const ChatInput = memo(function ChatInput({ inputRef, disabled, placeholder, + onCopyRaw, + copiedRaw, + onPlayDemo, + isPlaying, }: ChatInputProps) { - const isDisabled = disabled && !isGenerating; + const isDisabled = (disabled && !isGenerating) || Boolean(isPlaying); return (

@@ -48,6 +60,27 @@ export const ChatInput = memo(function ChatInput({ }} />
+ {onPlayDemo && ( + + )} + {hasMessages && onCopyRaw && ( + + )} {hasMessages && (