-
- {API_KEY_LABELS[provider]}
- onUpdate(apiKeyPatch(provider, e.target.value))}
- placeholder={API_KEY_PLACEHOLDERS[provider]}
- />
-
+ {needsKey ? (
+
+ {API_KEY_LABELS[provider]}
+ onUpdate(apiKeyPatch(provider, e.target.value))}
+ placeholder={API_KEY_PLACEHOLDERS[provider]}
+ />
+
+ ) : (
+
+ Model endpoint URL
+ onUpdate({ ownModelBaseUrl: e.target.value })}
+ placeholder={OWN_MODEL_URL_PLACEHOLDER}
+ spellCheck={false}
+ autoComplete="off"
+ />
+
+ )}
Model
)}
-
+
System prompt variant
- onUpdate({ systemPromptId: e.target.value })}
- title={
- AUTHOR_PROMPT_VARIANTS.find(
- (v) => v.id === (config.systemPromptId ?? AUTHOR_PROMPT_VARIANTS[0].id),
- )?.description
- }
- >
- {AUTHOR_PROMPT_VARIANTS.map((v) => (
-
- {v.label}
-
- ))}
-
-
+ {provider === 'own-model' ? (
+ // The own MDMA model uses its own fixed Gemma-aligned prompt, so
+ // the variant is locked.
+
+ MDMA DSL Variant
+
+ ) : (
+ onUpdate({ systemPromptId: e.target.value })}
+ title={
+ AUTHOR_PROMPT_VARIANTS.find(
+ (v) => v.id === (config.systemPromptId ?? AUTHOR_PROMPT_VARIANTS[0].id),
+ )?.description
+ }
+ >
+ {AUTHOR_PROMPT_VARIANTS.map((v) => (
+
+ {v.label}
+
+ ))}
+
+ )}
+
+ {provider === 'own-model' && (
+
+ The entire agent runs on your self-hosted MDMA model endpoint
+ (OpenAI-compatible, tool-calling enabled) — no third-party model is called. Enter the
+ deployed model URL above; leave it blank to use the default. The{' '}
+ /v1 suffix is added automatically.
+
+ )}
π Your API key is stored in your browser's localStorage only. It is never sent to
any server other than the AI provider you select.
diff --git a/demo/src/agent/anthropic-client.ts b/demo/src/agent/anthropic-client.ts
index c5bcc39..e8ec1cf 100644
--- a/demo/src/agent/anthropic-client.ts
+++ b/demo/src/agent/anthropic-client.ts
@@ -6,19 +6,38 @@
*/
export interface AnthropicConfig {
- provider?: 'anthropic' | 'openai' | 'openrouter';
+ /**
+ * 'own-model' routes the entire agent loop to our self-hosted mdma-26b
+ * endpoint (OpenAI-compatible, tool-calling enabled) — no third-party model.
+ */
+ provider?: 'anthropic' | 'openai' | 'openrouter' | 'own-model';
/** Anthropic API key */
apiKey: string;
/** OpenAI API key — stored separately so switching providers doesn't clear it */
openaiApiKey?: string;
/** OpenRouter API key */
openrouterApiKey?: string;
+ /**
+ * Base URL of the self-hosted MDMA model endpoint (OpenAI-compatible).
+ * Only used when provider is 'own-model'; falls back to
+ * OWN_MODEL_DEFAULT_BASE_URL when empty.
+ */
+ ownModelBaseUrl?: string;
model: string; // model id sent to the provider; the 'own-model' code paths in use-agent.ts send OWN_MODEL_NAME instead
/** Token budget for extended thinking. Only used when provider is 'anthropic'. */
thinkingBudget?: number;
systemPromptId?: string; // id of the prompt variant, resolved through getAuthorPromptVariant / getAgentToolPromptVariant
}
+/**
+ * Default endpoint for the self-hosted MDMA model, used when the user hasn't
+ * entered their own URL in Agent Settings. Overridable at build time via
+ * VITE_OWN_MODEL_BASE_URL. NOTE(review): `??` only replaces null/undefined, so a defined-but-empty env value would be exported as '' and returned unchanged by normalizeOwnModelBaseUrl — confirm that cannot happen.
+ */
+export const OWN_MODEL_DEFAULT_BASE_URL =
+ import.meta.env.VITE_OWN_MODEL_BASE_URL ??
+ 'https://REDACTED.modal.run/v1';
+
export interface ToolDefinition {
name: string;
description: string;
diff --git a/demo/src/agent/openai-agent-client.ts b/demo/src/agent/openai-agent-client.ts
index 6469449..5dbf5c9 100644
--- a/demo/src/agent/openai-agent-client.ts
+++ b/demo/src/agent/openai-agent-client.ts
@@ -25,10 +25,110 @@ export interface OpenAIToolCall {
export type OpenAIMessage = OpenAIUserMessage | OpenAIAssistantMessage | OpenAIToolMessage;
-// text block always lives at index 0; tool calls at 1, 2, …
+// reasoning block lives at -1 (rendered first), text at 0, tool calls at 1, 2, …
+const REASONING_IDX = -1;
const TEXT_IDX = 0;
const TOOL_IDX_OFFSET = 1; // tool-call blocks are the started indices >= this offset (see the block_stop loop)
+// Safety limits so a stalled or runaway stream can never hang the UI. Our
+// self-hosted endpoint can emit an unbounded `delta.reasoning` channel (the
+// model's chain-of-thought); without these a non-terminating stream leaves the
+// agent loop awaiting forever and `isGenerating` stuck true.
+const IDLE_TIMEOUT_MS = 60_000; // no chunk for this long → assume the stream died
+const MAX_STREAM_MS = 240_000; // hard wall-clock ceiling for one response (evaluated each time a chunk arrives)
+const MAX_STREAM_BYTES = 4_000_000; // ~4 MB of SSE text — runaway guard (raw chunk bytes, counted before decoding)
+
+// Loop detector. Gemma 4's known repetition collapse (see
+// evals/own-model/repetition-loops.md) degrades a thinking block into a short
+// token/phrase flooding the budget. `min_p` + `repetition_penalty` cut most of
+// it, but the collapse is an unfixed model trait, so we keep a cheap safety
+// net: over a sliding window of recent words, a healthy stream is lexically
+// diverse; a degenerate loop (one token, or a cycle like
+// `(END) (DONE) (STOP) (FINAL) …`) collapses unique/total. Below the floor we
+// abort rather than let it eat the whole generation.
+//
+// We run this on BOTH channels. The collapse usually lives in `reasoning`, but
+// it can also leak onto `content`: the model emits a valid document, then keeps
+// going with a raw "Thinking Process:" ramble after the reasoning span has
+// already closed. A legit MDMA document is well above the diversity floor, so
+// guarding content does not false-positive on real output.
+const LOOP_WINDOW_WORDS = 160; // sliding window of recent words
+const LOOP_MIN_WORDS = 120; // don't judge until we have enough signal (must stay <= LOOP_WINDOW_WORDS, otherwise the detector can never fire)
+const LOOP_UNIQUE_RATIO = 0.15; // unique/total below this → degenerate loop
+
+/** Tracks recent words on one channel and flags a degenerate repetition loop. Words are raw whitespace-delimited strings, so the comparison is case- and punctuation-sensitive. */
+class RepetitionLoopDetector {
+ private readonly words: string[] = []; // sliding window, oldest word first, capped at LOOP_WINDOW_WORDS entries
+
+ /** Feed a delta; returns true once the window collapses into a loop. Each delta is split on its own, so a word cut across two deltas is counted as two words. */
+ push(text: string): boolean {
+ for (const w of text.split(/\s+/)) {
+ if (!w) continue; // split() yields '' when the delta starts or ends with whitespace
+ this.words.push(w);
+ if (this.words.length > LOOP_WINDOW_WORDS) this.words.shift(); // evict the oldest word to keep the window bounded
+ }
+ if (this.words.length < LOOP_MIN_WORDS) return false; // not enough signal yet
+ const unique = new Set(this.words).size; // number of distinct words currently in the window
+ return unique / this.words.length < LOOP_UNIQUE_RATIO;
+ }
+}
+
+const MDMA_FENCE_OPEN = '```mdma';
+const MDMA_FENCE_CLOSE = '```';
+
+// Strips leaked ```mdma fenced documents out of the assistant's chat (content)
+// channel. The real document always arrives via the generate_mdma tool call and
+// renders in the preview pane; the model occasionally ALSO transcribes a copy of
+// the document as raw markdown into chat (most often on the first turn). Those
+// fenced blocks must never reach the chat UI, which is prose-only. Operates on
+// the live stream: complete lines are classified as they arrive, and the
+// trailing partial line is held back only while it is, or could still become,
+// a ```mdma fence opener — so normal prose keeps streaming smoothly.
+class MdmaFenceStripper {
+  private buf = ''; // the current unterminated line, including any prefix of it that was already emitted
+  private inFence = false; // true between a ```mdma opener line and the closing ``` line
+  private partialEmitted = 0; // chars of the current unterminated line already emitted
+
+  /** Feed a content delta; returns only the text that is safe to show in chat ('' when everything is fenced or held back). */
+  push(text: string): string {
+    this.buf += text;
+    let out = '';
+    let nl: number;
+    while ((nl = this.buf.indexOf('\n')) !== -1) {
+      const line = this.buf.slice(0, nl + 1);
+      this.buf = this.buf.slice(nl + 1);
+      const trimmed = line.trim();
+      if (this.inFence) {
+        if (trimmed === MDMA_FENCE_CLOSE) this.inFence = false; // the closing line itself is dropped as well
+      } else if (trimmed.startsWith(MDMA_FENCE_OPEN)) {
+        this.inFence = true; // opener line (with any info string after it) is dropped
+      } else {
+        out += line.slice(this.partialEmitted); // skip the prefix that was already streamed eagerly
+      }
+      this.partialEmitted = 0;
+    }
+    // Trailing partial line: emit eagerly unless it is a prefix of the opener ("``") or already starts with it ("```mdma title" — that case used to leak before its newline arrived).
+    if (!this.inFence && this.buf.length > this.partialEmitted) {
+      const trimmed = this.buf.trim();
+      const couldOpenFence = trimmed.length > 0 && (MDMA_FENCE_OPEN.startsWith(trimmed) || trimmed.startsWith(MDMA_FENCE_OPEN));
+      if (!couldOpenFence) {
+        out += this.buf.slice(this.partialEmitted);
+        this.partialEmitted = this.buf.length;
+      }
+    }
+    return out;
+  }
+
+  /** Emit any leftover at stream end (drops a dangling, never-closed fence and a held-back opener that never received its newline), then reset for reuse. */
+  flush(): string {
+    const out = (this.inFence || this.buf.trim().startsWith(MDMA_FENCE_OPEN)) ? '' : this.buf.slice(this.partialEmitted);
+    this.buf = '';
+    this.partialEmitted = 0;
+    this.inFence = false;
+    return out;
+  }
+}
+
export async function* streamOpenAIAgentMessages(
apiKey: string,
model: string,
@@ -37,6 +137,8 @@ export async function* streamOpenAIAgentMessages(
tools: ToolDefinition[],
signal?: AbortSignal,
baseUrl = 'https://api.openai.com/v1',
+ /** Extra request-body fields merged in (e.g. temperature, chat_template_kwargs). */
+ extraBody?: Record,
): AsyncGenerator {
const openAITools = tools.map((t) => ({
type: 'function' as const,
@@ -57,6 +159,7 @@ export async function* streamOpenAIAgentMessages(
messages: [{ role: 'system', content: systemPrompt }, ...messages],
tools: openAITools,
tool_choice: 'auto',
+ ...extraBody,
}),
signal,
});
@@ -82,12 +185,47 @@ export async function* streamOpenAIAgentMessages(
let buf = '';
let finishReason = 'stop';
const startedBlocks = new Set();
+ const startedAt = Date.now();
+ let totalBytes = 0;
+ const reasoningLoopDetector = new RepetitionLoopDetector();
+ const contentLoopDetector = new RepetitionLoopDetector();
+ const fenceStripper = new MdmaFenceStripper();
try {
while (true) {
- const { done, value } = await reader.read();
+ // Race the read against an idle timer so a stalled stream can't hang.
+ const readPromise = reader.read();
+ readPromise.catch(() => {}); // swallow rejection if we cancel below
+ let idleTimer: ReturnType | undefined;
+ const idle = new Promise<'idle'>((resolve) => {
+ idleTimer = setTimeout(() => resolve('idle'), IDLE_TIMEOUT_MS);
+ });
+ const result = await Promise.race([readPromise, idle]);
+ clearTimeout(idleTimer);
+
+ if (result === 'idle') {
+ reader.cancel().catch(() => {});
+ yield {
+ type: 'stream_error',
+ message: `Stream stalled β no data for ${IDLE_TIMEOUT_MS / 1000}s. The model may be stuck; please try again.`,
+ };
+ return;
+ }
+
+ const { done, value } = result;
if (done) break;
+ totalBytes += value?.byteLength ?? 0;
+ if (totalBytes > MAX_STREAM_BYTES || Date.now() - startedAt > MAX_STREAM_MS) {
+ reader.cancel().catch(() => {});
+ yield {
+ type: 'stream_error',
+ message:
+ 'Stream exceeded safety limits (likely a runaway generation) and was stopped. Please try again.',
+ };
+ return;
+ }
+
buf += decoder.decode(value, { stream: true });
const lines = buf.split('\n');
buf = lines.pop() ?? '';
@@ -113,12 +251,49 @@ export async function* streamOpenAIAgentMessages(
if (finish) finishReason = finish;
if (!delta) continue;
+ // Our model streams its chain-of-thought on a separate `reasoning`
+ // channel (OpenAI-compatible servers like vLLM expose it here). Render
+ // it as a collapsible thinking block instead of dropping it on the floor.
+ if (typeof delta.reasoning === 'string' && delta.reasoning) {
+ if (!startedBlocks.has(REASONING_IDX)) {
+ startedBlocks.add(REASONING_IDX);
+ yield { type: 'block_start', index: REASONING_IDX, blockType: 'thinking' };
+ }
+ yield { type: 'thinking_delta', index: REASONING_IDX, thinking: delta.reasoning };
+
+ if (reasoningLoopDetector.push(delta.reasoning)) {
+ reader.cancel().catch(() => {});
+ yield {
+ type: 'stream_error',
+ message:
+ 'The model got stuck repeating itself while thinking and was stopped. Please try again.',
+ };
+ return;
+ }
+ }
+
if (typeof delta.content === 'string' && delta.content) {
- if (!startedBlocks.has(TEXT_IDX)) {
- startedBlocks.add(TEXT_IDX);
- yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' };
+ // Strip any leaked ```mdma document; only prose reaches the chat UI.
+ const visible = fenceStripper.push(delta.content);
+ if (visible) {
+ if (!startedBlocks.has(TEXT_IDX)) {
+ startedBlocks.add(TEXT_IDX);
+ yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' };
+ }
+ yield { type: 'text_delta', index: TEXT_IDX, text: visible };
+ }
+
+ // Feed the loop detector the RAW content so a runaway fenced block
+ // still trips it even though we never display the fence.
+ if (contentLoopDetector.push(delta.content)) {
+ reader.cancel().catch(() => {});
+ yield {
+ type: 'stream_error',
+ message:
+ 'The model got stuck repeating itself and was stopped. Please try again.',
+ };
+ return;
}
- yield { type: 'text_delta', index: TEXT_IDX, text: delta.content };
}
const toolCalls = delta.tool_calls as Array> | undefined;
@@ -151,9 +326,26 @@ export async function* streamOpenAIAgentMessages(
}
}
} finally {
- reader.releaseLock();
+ // releaseLock throws if we already cancelled the reader β ignore that.
+ try {
+ reader.releaseLock();
+ } catch {
+ /* reader already released via cancel() */
+ }
+ }
+
+ // Emit any prose the stripper was holding back (e.g. a final line with no
+ // trailing newline that turned out not to be a fence).
+ const tail = fenceStripper.flush();
+ if (tail) {
+ if (!startedBlocks.has(TEXT_IDX)) {
+ startedBlocks.add(TEXT_IDX);
+ yield { type: 'block_start', index: TEXT_IDX, blockType: 'text' };
+ }
+ yield { type: 'text_delta', index: TEXT_IDX, text: tail };
}
+ if (startedBlocks.has(REASONING_IDX)) yield { type: 'block_stop', index: REASONING_IDX };
if (startedBlocks.has(TEXT_IDX)) yield { type: 'block_stop', index: TEXT_IDX };
for (const tcIdx of Array.from(startedBlocks).filter((i) => i >= TOOL_IDX_OFFSET)) {
yield { type: 'block_stop', index: tcIdx };
diff --git a/demo/src/agent/use-agent.ts b/demo/src/agent/use-agent.ts
index cfaf730..3362a33 100644
--- a/demo/src/agent/use-agent.ts
+++ b/demo/src/agent/use-agent.ts
@@ -2,11 +2,16 @@ import { useState, useCallback, useRef, useEffect } from 'react';
import type { Dispatch, SetStateAction } from 'react';
import {
buildSystemPrompt,
+ buildFixerPrompt,
+ buildFixerMessage,
getAuthorPromptVariant,
getAgentToolPromptVariant,
+ MDMA_IL_AGENT_SYSTEM_PROMPT,
} from '@mobile-reality/mdma-prompt-pack';
+import { validate } from '@mobile-reality/mdma-validator';
import {
streamAgentMessages,
+ OWN_MODEL_DEFAULT_BASE_URL,
type AnthropicConfig,
type ApiMessage,
type ApiAssistantBlock,
@@ -65,6 +70,40 @@ const GENERATE_MDMA_TOOL_BRIEF = {
},
};
+// ── Own-model (mdma-26b) endpoint ─────────────────────────────────────────────
+// Our self-hosted model, served OpenAI-compatible with tool-calling enabled.
+// In "own-model" provider mode the WHOLE agent loop runs here (conversation +
+// generate_mdma via tool_choice:auto), so no third-party model is called.
+// Auth is off (placeholder key); enable_thinking must be false; temperature 1
+// for agentic/conversational use.
+const OWN_MODEL_NAME = import.meta.env.VITE_OWN_MODEL_NAME ?? 'mdma-26b'; // model id sent in place of config.model when provider is 'own-model'
+
+// The own-model endpoint is user-configurable in Agent Settings. Normalise what
+// they type: trim trailing slashes and append the OpenAI-compatible `/v1` suffix
+// if missing. Empty → fall back to the build-time default.
+function normalizeOwnModelBaseUrl(raw?: string): string {
+ const url = (raw ?? '').trim().replace(/\/+$/, ''); // trim surrounding whitespace, then strip every trailing '/'
+ if (!url) return OWN_MODEL_DEFAULT_BASE_URL; // the default is returned as-is, without normalisation
+ return url.endsWith('/v1') ? url : `${url}/v1`; // NOTE(review): only a URL ending exactly in '/v1' is recognised; a pasted '.../v1/chat/completions' would get '/v1' appended — confirm acceptable
+}
+
+// Extra OpenAI-request body our endpoint needs (merged in by the OpenAI client).
+// `max_tokens` bounds the response server-side so a runaway reasoning channel
+// can't generate forever (the client also caps the stream defensively).
+//
+// `min_p` + `repetition_penalty` cut the degenerate reasoning repetition loop
+// (word-doubling → token-doubling → single-token flooding) that is a known
+// Gemma 4 trait — see evals/own-model/repetition-loops.md. `min_p` is the
+// primary tail-cutter; `repetition_penalty` starts low (raise only if needed —
+// too high hurts valid output). DRY would be ideal but vLLM doesn't support it.
+const OWN_MODEL_EXTRA_BODY = {
+ temperature: 1,
+ max_tokens: 8192,
+ min_p: 0.02,
+ repetition_penalty: 1.1,
+ chat_template_kwargs: { enable_thinking: false },
+} as const; // passed as `extraBody` and spread after tool_choice in the streaming request body
+
// ── Config persistence ───────────────────────────────────────────────────────
const CONFIG_KEY = 'mdma-agent-config';
@@ -202,8 +241,7 @@ async function callAuthorOpenAI(
brief: string,
signal: AbortSignal,
): Promise {
- const provider = config.provider ?? 'openai';
- const baseUrl = OPENAI_COMPAT_BASE_URLS[provider] ?? OPENAI_COMPAT_BASE_URLS.openai!;
+ const baseUrl = getBaseUrlForProvider(config);
const apiKey = getApiKeyForProvider(config);
const response = await fetch(`${baseUrl}/chat/completions`, {
method: 'POST',
@@ -387,6 +425,7 @@ async function runAgentLoop(
if (meta.apiBlock.type === 'tool_use') meta.apiBlock.input = { document };
}
+ document = await maybeFixDocument(config, document, signal);
const parsed = await parseMarkdown(document).catch(() => null);
const ast = parsed?.ast ?? null;
const store = parsed?.store ?? null;
@@ -434,17 +473,103 @@ const OPENAI_COMPAT_BASE_URLS: Partial {
+ const provider = config.provider ?? 'anthropic';
+ if (provider === 'anthropic') return callAuthorAnthropic(config, system, user, signal);
+
+ const isOwn = provider === 'own-model';
+ const baseUrl = getBaseUrlForProvider(config);
+ const response = await fetch(`${baseUrl}/chat/completions`, {
+ method: 'POST',
+ headers: { 'content-type': 'application/json', authorization: `Bearer ${getApiKeyForProvider(config)}` },
+ body: JSON.stringify({
+ model: isOwn ? OWN_MODEL_NAME : config.model,
+ messages: [
+ { role: 'system', content: system },
+ { role: 'user', content: user },
+ ],
+ temperature: 0, // strict, deterministic repair
+ ...(isOwn ? { max_tokens: 4096, chat_template_kwargs: { enable_thinking: false } } : {}),
+ }),
+ signal,
+ });
+ if (!response.ok) throw new Error(`Fixer call failed (${response.status})`);
+ const json = (await response.json()) as { choices?: Array<{ message?: { content?: string } }> };
+ return json.choices?.[0]?.message?.content ?? '';
+}
+
+/**
+ * Return a valid (or best-effort repaired) MDMA document: deterministic validator auto-fix first, then one LLM fixer call for whatever errors/warnings remain.
+ * When the deterministic pass already yields a valid document no model call is made, so the common path adds no network latency. Blank input is returned untouched.
+ */
+async function maybeFixDocument(
+ config: AnthropicConfig,
+ document: string,
+ signal: AbortSignal,
+): Promise {
+ if (!document.trim()) return document;
+ // 1. Deterministic auto-fix.
+ const r = validate(document, { exclude: ['thinking-block'], autoFix: true });
+ if (r.ok) return r.output;
+
+ // 2. LLM fixer for the remaining issues.
+ const unfixed = r.issues.filter(
+ (i) => !i.fixed && (i.severity === 'error' || i.severity === 'warning'),
+ );
+ if (unfixed.length === 0) return r.output;
+
+ try {
+ const system = `${buildSystemPrompt()}\n\n---\n\n${buildFixerPrompt('single-block')}`;
+ const userMessage = buildFixerMessage(document, unfixed, {}); // NOTE(review): the fixer is given the original `document`, not the auto-fixed `r.output` — confirm intended
+ const fixed = await chatOnce(config, system, userMessage, signal);
+ if (fixed) {
+ // Accept only if it actually improves validity.
+ const after = validate(fixed, { exclude: ['thinking-block'], autoFix: true });
+ if (after.summary.errors <= r.summary.errors) return after.output; // NOTE(review): `<=` also accepts an unchanged error count, which is looser than "improves" — confirm intended
+ }
+ } catch {
+ /* fixer failed — fall back to the deterministic best-effort below (any error is swallowed here, including one raised because `signal` aborted the request) */
+ }
+ return r.output;
+}
+
async function runOpenAIAgentLoop(
config: AnthropicConfig,
systemPrompt: string,
@@ -456,9 +581,11 @@ async function runOpenAIAgentLoop(
nextId: () => string,
subAgent: AuthorSubAgent | null,
): Promise {
- const baseUrl =
- OPENAI_COMPAT_BASE_URLS[config.provider ?? 'openai'] ?? OPENAI_COMPAT_BASE_URLS.openai!;
+ const isOwnModel = config.provider === 'own-model';
+ const baseUrl = getBaseUrlForProvider(config);
const apiKey = getApiKeyForProvider(config);
+ const model = isOwnModel ? OWN_MODEL_NAME : config.model;
+ const extraBody = isOwnModel ? OWN_MODEL_EXTRA_BODY : undefined;
const tool = subAgent ? GENERATE_MDMA_TOOL_BRIEF : GENERATE_MDMA_TOOL_INLINE;
let continueLoop = true;
@@ -471,12 +598,13 @@ async function runOpenAIAgentLoop(
for await (const ev of streamOpenAIAgentMessages(
apiKey,
- config.model,
+ model,
systemPrompt,
history,
[tool],
signal,
baseUrl,
+ extraBody,
)) {
if (ev.type === 'stream_error') {
onError(ev.message);
@@ -486,7 +614,20 @@ async function runOpenAIAgentLoop(
if (ev.type === 'block_start') {
const displayId = nextId();
- if (ev.blockType === 'text') {
+ if (ev.blockType === 'thinking') {
+ // Reasoning channel (delta.reasoning) β collapsible thinking block.
+ // Not added to the OpenAI history (only text + tool_calls are).
+ const apiBlock: ApiAssistantBlock = { type: 'thinking', thinking: '', signature: '' };
+ blockMeta.set(ev.index, { displayId, apiBlock });
+ setTurns((prev) =>
+ appendBlock(prev, assistantTurnId, {
+ id: displayId,
+ type: 'thinking',
+ content: '',
+ isStreaming: true,
+ } satisfies AgentBlock),
+ );
+ } else if (ev.blockType === 'text') {
const apiBlock: ApiAssistantBlock = { type: 'text', text: '' };
blockMeta.set(ev.index, { displayId, apiBlock });
setTurns((prev) =>
@@ -532,6 +673,15 @@ async function runOpenAIAgentLoop(
}
}
+ if (ev.type === 'thinking_delta') {
+ const meta = blockMeta.get(ev.index);
+ if (meta?.apiBlock.type === 'thinking') {
+ meta.apiBlock.thinking += ev.thinking;
+ const snap = meta.apiBlock.thinking;
+ setTurns((prev) => patchBlock(prev, assistantTurnId, meta.displayId, { content: snap }));
+ }
+ }
+
if (ev.type === 'input_json_delta') {
const meta = blockMeta.get(ev.index);
if (meta?.partialJson !== undefined) meta.partialJson += ev.partial_json;
@@ -579,6 +729,7 @@ async function runOpenAIAgentLoop(
}
}
+ document = await maybeFixDocument(config, document, signal);
const parsed = await parseMarkdown(document).catch(() => null);
setTurns((prev) =>
patchBlock(prev, assistantTurnId, meta.displayId, {
@@ -749,23 +900,34 @@ export function useAgent(options: UseAgentOptions = {}) {
]);
abortRef.current = new AbortController();
+ const provider = config.provider ?? 'anthropic';
+ // Our own model runs the whole turn itself (tool-calling enabled), so it
+ // emits the MDMA document inline via generate_mdma β no author sub-agent.
+ const useSubAgent = (options.useAuthorSubAgent ?? false) && provider !== 'own-model';
const toolPrompt = getAgentToolPromptVariant(config.systemPromptId).prompt;
- // In sub-agent mode the conversation agent never writes MDMA directly,
- // so its system prompt omits the author prompt and the buildSystemPrompt
- // reminder (both of which would tempt the agent to inline MDMA in chat).
- const systemPrompt = options.useAuthorSubAgent
- ? options.flowPrompt
+ // Our own model gets its own Gemma-aligned agentic prompt (no
+ // leak β see prompt-pack mdma-agent/mobile-reality/mdma-il). Other
+ // providers: sub-agent mode uses just the tool prompt; inline mode layers
+ // the author prompt via buildSystemPrompt.
+ let systemPrompt: string;
+ if (provider === 'own-model') {
+ systemPrompt = options.flowPrompt
+ ? `${MDMA_IL_AGENT_SYSTEM_PROMPT}\n\n---\n\n${options.flowPrompt}`
+ : MDMA_IL_AGENT_SYSTEM_PROMPT;
+ } else if (useSubAgent) {
+ systemPrompt = options.flowPrompt
? `${toolPrompt}\n\n---\n\n${options.flowPrompt}`
- : toolPrompt
- : buildSystemPrompt({
- authorPrompt: getAuthorPromptVariant(config.systemPromptId).prompt,
- customPrompt: options.flowPrompt
- ? `${toolPrompt}\n\n---\n\n${options.flowPrompt}`
- : toolPrompt,
- });
-
- const subAgent = options.useAuthorSubAgent ? makeAuthorSubAgent(config) : null;
- const provider = config.provider ?? 'anthropic';
+ : toolPrompt;
+ } else {
+ systemPrompt = buildSystemPrompt({
+ authorPrompt: getAuthorPromptVariant(config.systemPromptId).prompt,
+ customPrompt: options.flowPrompt
+ ? `${toolPrompt}\n\n---\n\n${options.flowPrompt}`
+ : toolPrompt,
+ });
+ }
+
+ const subAgent = useSubAgent ? makeAuthorSubAgent(config) : null;
try {
if (provider === 'anthropic') {
@@ -824,6 +986,15 @@ export function useAgent(options: UseAgentOptions = {}) {
[runTurn],
);
+ // Send a specific message as a visible user turn and resolve when the agent's
+ // response is fully complete. Used by the auto-play demo to pace the script.
+ const sendText = useCallback(
+ async (text: string) => {
+ await runTurn(text, false);
+ },
+ [runTurn],
+ );
+
const stop = useCallback(() => {
abortRef.current?.abort();
}, []);
@@ -848,6 +1019,7 @@ export function useAgent(options: UseAgentOptions = {}) {
updateConfig,
send,
sendHidden,
+ sendText,
stop,
clear,
inputRef,
diff --git a/demo/src/chat/ChatInput.tsx b/demo/src/chat/ChatInput.tsx
index 359326a..9276f43 100644
--- a/demo/src/chat/ChatInput.tsx
+++ b/demo/src/chat/ChatInput.tsx
@@ -13,6 +13,14 @@ export interface ChatInputProps {
disabled?: boolean;
/** Placeholder text override. */
placeholder?: string;
+ /** Copy the whole raw conversation to the clipboard (debugging). */
+ onCopyRaw?: () => void;
+ /** Briefly true right after a successful copy, for button feedback. */
+ copiedRaw?: boolean;
+ /** Start/stop the scripted auto-play demo. */
+ onPlayDemo?: () => void;
+ /** True while the auto-play demo is running. */
+ isPlaying?: boolean;
}
export const ChatInput = memo(function ChatInput({
@@ -26,8 +34,12 @@ export const ChatInput = memo(function ChatInput({
inputRef,
disabled,
placeholder,
+ onCopyRaw,
+ copiedRaw,
+ onPlayDemo,
+ isPlaying,
}: ChatInputProps) {
- const isDisabled = disabled && !isGenerating;
+ const isDisabled = (disabled && !isGenerating) || Boolean(isPlaying);
return (
@@ -48,6 +60,27 @@ export const ChatInput = memo(function ChatInput({
}}
/>
+ {onPlayDemo && (
+
+ {isPlaying ? 'β Stop demo' : 'βΆ Play demo'}
+
+ )}
+ {hasMessages && onCopyRaw && (
+
+ {copiedRaw ? 'Copied!' : 'Copy raw'}
+
+ )}
{hasMessages && (
new Promise((r) => setTimeout(r, ms));
+
+const KICKOFF = "Hi β I'd like to file a new insurance claim."; // NOTE(review): the 'β' looks like a mis-decoded em dash; this text is typed into the chat verbatim — check the file encoding
+
+// The flow's three form steps, in order (step 4 is a terminal callout — no form).
+const FLOW_ACTIONS = ['collect-personal-info', 'collect-claim', 'collect-bank'] as const; // each entry is compared with the rendered form's onSubmit action id
+
+// Scripted answers keyed by field name; answerForField falls back to a value chosen by field type.
+const ANSWERS: Record = {
+ 'full-name': 'Jamie Rivera',
+ birthday: '1988-03-22',
+ 'claim-description':
+ 'A pipe under the kitchen sink burst overnight and flooded the floor, damaging the lower cabinets and the flooring.',
+ iban: 'DE89370400440532013000',
+};
+
+/**
+ * Picks the scripted demo value for one form field.
+ *
+ * A field name listed in ANSWERS wins. Otherwise the value is chosen from the
+ * field type: 'checkbox' → true, 'select' → the first option (its `value` when
+ * options are objects) or '' when there are none, 'file' → '', and any
+ * unlisted type → 'Sample value'.
+ */
+function answerForField(field: FormField): string | boolean {
+  // Own-property check: `in` would also match inherited keys such as
+  // "constructor" or "toString" and hand back a function instead of an answer.
+  if (Object.prototype.hasOwnProperty.call(ANSWERS, field.name)) return ANSWERS[field.name];
+  switch (field.type) {
+    case 'checkbox':
+      return true;
+    case 'number':
+      return '42';
+    case 'email':
+      return 'jamie.rivera@example.com';
+    case 'date':
+      return '1990-01-01';
+    case 'textarea':
+      return 'Additional details provided for the demo run.';
+    case 'select': {
+      const opts = field.options;
+      if (Array.isArray(opts) && opts.length > 0) {
+        const first = opts[0];
+        // Options may be plain strings or objects carrying a `value`.
+        return typeof first === 'string' ? first : first.value;
+      }
+      return '';
+    }
+    case 'file':
+      return '';
+    default:
+      return 'Sample value';
+  }
+}
+
+function findForm(ast: PreviewState['ast']): FormComponent | null { // returns the first form component among the AST's direct children, or null
+ if (!ast) return null;
+ for (const child of ast.children) {
+ if ((child as { type?: string }).type !== 'mdmaBlock') continue; // nodes of any other type are skipped
+ const component = (child as { component?: MdmaComponent }).component;
+ if (component?.type === 'form') return component as FormComponent;
+ }
+ return null;
+}
+
+interface UsePreviewAutoplayOptions {
+ previewState: PreviewState; // polled for status / ast / store / blockId while waiting for the next form
+ isGenerating: boolean; // no form is picked up while this is true
+ sendText: (message: string) => Promise; // awaited once, to send the kickoff message
+ setInput: (value: string) => void; // drives the typewriter effect in the chat input
+ /** Resets the chat, flow state and backend log before a fresh run. */
+ reset: () => void;
+}
+
+/**
+ * Drives the scripted insurance-claim demo: resets the chat, types and sends
+ * the kickoff message, then for each step in FLOW_ACTIONS waits for the
+ * matching form to render, fills every field and submits it.
+ *
+ * `play` is a toggle: calling it while a run is active stops that run. Every
+ * run carries its own id, so a run that was stopped or replaced by a newer one
+ * stops touching the input/store at its next checkpoint, and `isPlaying` is
+ * cleared whenever the current run ends — including when it throws.
+ *
+ * @returns `isPlaying` (true while a run is active) and the `play` toggle.
+ */
+export function usePreviewAutoplay({
+  previewState,
+  isGenerating,
+  sendText,
+  setInput,
+  reset,
+}: UsePreviewAutoplayOptions): { isPlaying: boolean; play: () => void } {
+  const [isPlaying, setIsPlaying] = useState(false);
+  const playingRef = useRef(false);
+  // Bumped on every start; a run whose id no longer matches has been superseded.
+  const runIdRef = useRef(0);
+
+  // Keep the latest reactive values readable from inside the async play loop.
+  const stateRef = useRef(previewState);
+  stateRef.current = previewState;
+  const generatingRef = useRef(isGenerating);
+  generatingRef.current = isGenerating;
+
+  // Poll `predicate` until it returns a truthy value, the run stops being active, or timeout.
+  const waitFor = useCallback(
+    async <T,>(
+      predicate: () => T | null,
+      isActive: () => boolean,
+      timeoutMs = 60_000,
+    ): Promise<T | null> => {
+      const start = Date.now();
+      while (Date.now() - start < timeoutMs) {
+        if (!isActive()) return null;
+        const result = predicate();
+        if (result) return result;
+        await sleep(150);
+      }
+      return null;
+    },
+    [],
+  );
+
+  const play = useCallback(() => {
+    // Toggle: a second press stops the run.
+    if (playingRef.current) {
+      playingRef.current = false;
+      setIsPlaying(false);
+      return;
+    }
+    playingRef.current = true;
+    setIsPlaying(true);
+    const runId = ++runIdRef.current;
+    // True only while THIS run is the one that should be driving the UI. A bare
+    // `playingRef` check is not enough: stop followed by a quick restart flips
+    // it back to true while the previous loop is still sleeping.
+    const isActive = (): boolean => playingRef.current && runIdRef.current === runId;
+
+    void (async () => {
+      try {
+        reset();
+        await sleep(500);
+
+        // Typewriter the kickoff message into the input, then send it.
+        for (let k = 1; k <= KICKOFF.length; k++) {
+          if (!isActive()) break;
+          setInput(KICKOFF.slice(0, k));
+          await sleep(22);
+        }
+        await sleep(250);
+        // A newer run owns the input now; leave it alone.
+        if (runIdRef.current !== runId) return;
+        setInput('');
+        if (!isActive()) return;
+        await sendText(KICKOFF);
+
+        let lastBlockId: string | null = null;
+        for (const action of FLOW_ACTIONS) {
+          if (!isActive()) break;
+
+          // Wait for a freshly-rendered, validated form for this step.
+          const found = await waitFor(() => {
+            if (generatingRef.current) return null;
+            const s = stateRef.current;
+            if (s.status !== 'ready' || !s.ast || !s.store || s.blockId === lastBlockId) return null;
+            const form = findForm(s.ast);
+            if (!form || form.onSubmit !== action) return null;
+            return { form, store: s.store, blockId: s.blockId };
+          }, isActive);
+          if (!found) {
+            // Only a genuine timeout is worth a warning; a manual stop is not.
+            if (isActive()) {
+              console.warn(`[preview-autoplay] timed out waiting for "${action}" form`);
+            }
+            break;
+          }
+
+          // Fill each field, then submit — same dispatches a human click produces.
+          await sleep(450);
+          for (const field of found.form.fields) {
+            if (!isActive()) break;
+            found.store.dispatch({
+              type: 'FIELD_CHANGED',
+              componentId: found.form.id,
+              field: field.name,
+              value: answerForField(field),
+            });
+            await sleep(130);
+          }
+          await sleep(300);
+          if (!isActive()) break;
+          found.store.dispatch({
+            type: 'ACTION_TRIGGERED',
+            componentId: found.form.id,
+            actionId: found.form.onSubmit,
+          });
+          lastBlockId = found.blockId;
+
+          // Let useInsuranceFlow call the backend + advance the agent to the next step.
+          await sleep(1000);
+        }
+      } catch (err: unknown) {
+        // Without this the error escaped as an unhandled rejection and left
+        // `isPlaying` stuck true.
+        console.warn('[preview-autoplay] run failed', err);
+      } finally {
+        // Only the latest run may clear the flag; a superseded run must not
+        // switch off its successor.
+        if (runIdRef.current === runId) {
+          playingRef.current = false;
+          setIsPlaying(false);
+        }
+      }
+    })();
+  }, [reset, sendText, setInput, waitFor]);
+
+  return { isPlaying, play };
+}
diff --git a/demo/src/styles.css b/demo/src/styles.css
index 872e92b..1acb0b4 100644
--- a/demo/src/styles.css
+++ b/demo/src/styles.css
@@ -1628,6 +1628,49 @@ body {
cursor: pointer;
}
+/* Toggle row (e.g. "Generate with Own model") — label left, switch right. */
+.ai-setting--toggle {
+ flex-direction: row;
+ align-items: center;
+ justify-content: space-between;
+ gap: 10px;
+}
+
+.ai-setting--toggle input[type='checkbox'] { /* the switch track: native checkbox chrome removed and restyled as a pill */
+ appearance: none;
+ -webkit-appearance: none;
+ flex: 0 0 auto;
+ width: 38px;
+ height: 22px;
+ padding: 0;
+ border: none;
+ border-radius: 999px;
+ background: #d1d5db;
+ position: relative; /* positioning context for the ::after knob */
+ cursor: pointer;
+ transition: background 0.15s;
+}
+
+.ai-setting--toggle input[type='checkbox']::after { /* the knob: 18px circle inset 2px inside the 22px-high track */
+ content: '';
+ position: absolute;
+ top: 2px;
+ left: 2px;
+ width: 18px;
+ height: 18px;
+ border-radius: 50%;
+ background: #fff;
+ transition: transform 0.15s;
+}
+
+.ai-setting--toggle input[type='checkbox']:checked {
+ background: #6c5ce7;
+}
+
+.ai-setting--toggle input[type='checkbox']:checked::after {
+ transform: translateX(16px); /* 38px track - 18px knob - 2 x 2px inset = 16px of travel */
+}
+
.ai-setting-model-group {
display: flex;
gap: 6px;
@@ -1780,6 +1823,48 @@ body {
line-height: 1.5;
}
+/* Per-message "Raw" debug toggle + raw output view */
+.agent-raw-toggle {
+ font-size: 10px;
+ font-weight: 700;
+ text-transform: uppercase;
+ letter-spacing: 0.5px;
+ color: #6c5ce7;
+ background: transparent;
+ border: 1px solid #d6d2f5;
+ border-radius: 5px;
+ padding: 2px 7px;
+ cursor: pointer;
+ transition:
+ background 0.15s,
+ color 0.15s;
+}
+
+.agent-raw-toggle:hover {
+ background: #efecfd;
+}
+
+.agent-raw-toggle[data-active="true"] { /* same specificity as :hover and declared later, so the active fill also wins while hovered */
+ background: #6c5ce7;
+ color: #fff;
+ border-color: #6c5ce7;
+}
+
+.agent-raw { /* dark monospace panel for the raw output */
+ font-size: 12px;
+ font-family: "SF Mono", "Fira Code", "Cascadia Code", monospace;
+ white-space: pre-wrap; /* keep line breaks and spacing, but still wrap long lines */
+ word-break: break-word;
+ color: #e0e0e0;
+ background: #1e1e2e;
+ margin: 0;
+ padding: 14px;
+ border-radius: 8px;
+ line-height: 1.55;
+ overflow: auto; /* scrolls internally once the content exceeds max-height */
+ max-height: 520px;
+}
+
.chat-msg-source {
font-size: 12.5px;
font-family: "SF Mono", "Fira Code", "Cascadia Code", monospace;
@@ -1992,6 +2077,31 @@ body {
background: #fef2f2;
}
+/* Outlined accent button; shares its colours with .agent-raw-toggle. */
+.chat-demo-btn {
+ padding: 6px 12px;
+ font-size: 12px;
+ font-weight: 600;
+ border: 1px solid #d6d2f5;
+ border-radius: 8px;
+ background: transparent;
+ color: #6c5ce7;
+ cursor: pointer;
+ transition: all 0.15s;
+ white-space: nowrap;
+ /* NOTE(review): presumably pins the button to the left of a flex row — confirm against the parent's layout. */
+ margin-right: auto;
+}
+
+/* Hover: light tint plus a full-strength border. */
+.chat-demo-btn:hover {
+ background: #efecfd;
+ border-color: #6c5ce7;
+}
+
+/* Active state (data-active="true"): filled accent. Declared after :hover, so it wins while hovered. */
+.chat-demo-btn[data-active="true"] {
+ background: #6c5ce7;
+ color: #fff;
+ border-color: #6c5ce7;
+}
+
.chat-send-btn {
padding: 7px 18px;
font-size: 13.5px;
diff --git a/evals/.env.example b/evals/.env.example
index 4d3940c..5cd19c5 100644
--- a/evals/.env.example
+++ b/evals/.env.example
@@ -38,9 +38,26 @@ OPENROUTER_API_KEY=
#EVAL_PROVIDER=openrouter:google/gemini-2.5-pro
#EVAL_PROVIDER=openrouter:google/gemini-2.5-flash
#EVAL_PROVIDER=openrouter:google/gemini-2.5-flash-lite
+# Gemma (open weights) — all use the google/gemma prompt variant
+#EVAL_PROVIDER=openrouter:google/gemma-4-26b-a4b-it
+#EVAL_PROVIDER=openrouter:google/gemma-4-31b-it
+#EVAL_PROVIDER=openrouter:google/gemma-3n-e4b-it
# --- xAI (via OpenRouter) ---
#EVAL_PROVIDER=openrouter:x-ai/grok-4.20
#EVAL_PROVIDER=openrouter:x-ai/grok-4.3
EVAL_PROVIDER=openai:gpt-5.5
+
+# --- Our own model (evals/own-model/) ---
+# Self-hosted MDMA-IL model (Gemma-4-E4B + v3 LoRA). The own-model suite reads
+# these dedicated vars instead of EVAL_PROVIDER, so it can run independently of
+# the third-party model evals.
+#   OWN_MODEL_PROVIDER  promptfoo provider id (OpenAI-compatible chat).
+#   OWN_MODEL_BASE_URL  the model's OpenAI-compatible base URL (ends in /v1).
+#   OWN_MODEL_API_KEY   the API key / token for the endpoint.
+# If the endpoint uses proxy-auth headers rather than an API key, add a `headers:` block to
+# the provider config in own-model/promptfooconfig.own-model.yaml instead of using OWN_MODEL_API_KEY.
+OWN_MODEL_PROVIDER=openai:chat:mdma-il-v3
+OWN_MODEL_BASE_URL=
+OWN_MODEL_API_KEY=
diff --git a/evals/.gitignore b/evals/.gitignore
new file mode 100644
index 0000000..f1814c1
--- /dev/null
+++ b/evals/.gitignore
@@ -0,0 +1,3 @@
+
+# gemma eval suite + dataset generator — kept local for dataset generation, not published
+gemma/
diff --git a/evals/own-model/README.md b/evals/own-model/README.md
new file mode 100644
index 0000000..40bb77c
--- /dev/null
+++ b/evals/own-model/README.md
@@ -0,0 +1,70 @@
+# Own-model eval — MDMA-IL DSL holdout gate
+
+Self-contained eval for **our own hosted model** — `google/gemma-4-E4B-it` + the
+**v3 MDMA-IL LoRA**.
+
+## What this tests
+
+Our model is **not** a natural-language (NL) chat model — it was fine-tuned to take
+**one MDMA-IL DSL intent** as input and return an **MDMA document**. So this suite is a
+**DSL holdout gate**, not the NL author suites the third-party models run:
+
+- **Input:** the 95 held-out scenarios in **DSL** form
+  (`../gemma/dataset/data/holdout-dsl.jsonl`, via `tests-dsl.mjs`).
+- **System prompt:** the `mobile-reality/mdma-il` author prompt from the prompt
+  pack — DSL input grammar + authoring rules + worked form/table/chart examples.
+- **Assertion:** `validate-mdma` — every output must be a valid MDMA document.
+
+## Why a DSL-aware prompt (not a bare instruction)
+
+The system prompt **must describe the MDMA-IL DSL** the model reads — a bare
+"generate MDMA" instruction is out-of-distribution, since the model's whole job
+is to interpret a DSL intent. Measured empirically against this endpoint:
+
+1. **The DSL grammar is required.** Without the grammar section the model
+ misreads the intent and drops `type:`/`id:`, nests under a `form:` key, or
+ hallucinates `type: action`.
+2. **A worked example anchors the output shape.** On the DSL holdout, validity
+ moved from ~41% (bare instruction) to ~90.5% once the prompt carried the DSL
+ grammar plus a worked example.
+
+The small E4B model has only a 2048-token context (`max_model_len`), so the
+prompt stays lean while still teaching the DSL — grammar + a few examples, not a
+full spec dump.
+
+## Observations (not conclusions)
+
+This is a **small model** (Gemma 4 E4B + LoRA) — prompt-sensitive, with a
+2048-token context. On the DSL holdout, output validity against the **current**
+validator moved with the system prompt: ~41% (bare prompt) → ~90.5% (DSL-aware
+prompt with a worked example). It is **not 100%**, and we have **not** concluded
+whether the residual gap calls for a retrain, output normalization, or more
+prompt work — that's an open question.
+
+## Configure & run
+
+Set in `../.env` (dedicated vars, not `EVAL_PROVIDER`):
+
+```
+OWN_MODEL_PROVIDER=openai:chat:mdma-v3     # served LoRA id (NOTE: ../.env.example ships mdma-il-v3 — confirm which id the endpoint serves)
+OWN_MODEL_BASE_URL=https://…modal.run/v1   # OpenAI-compatible base URL
+OWN_MODEL_API_KEY=EMPTY                    # placeholder while auth is off
+```
+
+```bash
+pnpm --filter @mobile-reality/mdma-evals eval:own-model # run the gate
+pnpm --filter @mobile-reality/mdma-evals eval:view # view results
+```
+
+If `holdout-dsl.jsonl` is missing (it's gitignored/generated), build it first
+with `pnpm --filter @mobile-reality/mdma-evals dataset:build`, or point
+`OWN_MODEL_HOLDOUT` at your copy.
+
+## Contents
+
+- `promptfooconfig.own-model.yaml` — the gate config.
+- `prompt.mjs` — pins the `mobile-reality/mdma-il` system prompt; passes the
+  DSL as the user message.
+- `tests-dsl.mjs` — loads the DSL holdout into promptfoo test cases.
+- `assertions/` — own copy of the assertion modules (self-contained).
+- `results.json` — output of the last run (committed, reusable downstream).
diff --git a/evals/own-model/assertions/bar-chart.mjs b/evals/own-model/assertions/bar-chart.mjs
new file mode 100644
index 0000000..a37fa16
--- /dev/null
+++ b/evals/own-model/assertions/bar-chart.mjs
@@ -0,0 +1,13 @@
+/**
+ * Asserts that the output contains a bar chart variant.
+ */
+export default function (output) {
+ if (
+ output.includes('variant: bar') ||
+ output.includes("variant: 'bar'") ||
+ output.includes('"bar"')
+ ) {
+ return { pass: true, score: 1, reason: 'Bar chart variant found' };
+ }
+ return { pass: false, score: 0, reason: 'Expected variant: bar in chart component' };
+}
diff --git a/evals/own-model/assertions/callout-variant.mjs b/evals/own-model/assertions/callout-variant.mjs
new file mode 100644
index 0000000..4907119
--- /dev/null
+++ b/evals/own-model/assertions/callout-variant.mjs
@@ -0,0 +1,21 @@
+/**
+ * Asserts that the output contains a callout with the expected variant.
+ * Pass the variant name via config.variant (e.g. config: { variant: warning }).
+ */
+export default function (output, { config }) {
+ const variant = config?.variant || 'warning';
+ const hasCallout = output.includes('type: callout');
+ const hasVariant =
+ output.includes(`variant: ${variant}`) ||
+ output.includes(`variant: '${variant}'`) ||
+ output.includes(`variant: "${variant}"`);
+
+ if (hasCallout && hasVariant) {
+ return { pass: true, score: 1, reason: `Callout with variant: ${variant} found` };
+ }
+ return {
+ pass: false,
+ score: hasCallout ? 0.5 : 0,
+ reason: `Expected callout with variant: ${variant}. ${!hasCallout ? 'No callout found' : 'Wrong variant'}`,
+ };
+}
diff --git a/evals/own-model/assertions/calls-generate-mdma.mjs b/evals/own-model/assertions/calls-generate-mdma.mjs
new file mode 100644
index 0000000..e09e707
--- /dev/null
+++ b/evals/own-model/assertions/calls-generate-mdma.mjs
@@ -0,0 +1,58 @@
+/**
+ * Asserts that the model called the `generate_mdma` tool.
+ *
+ * Checks the output and raw response in all known locations promptfoo may
+ * place tool-call data, so this works regardless of the provider or how
+ * promptfoo serialises the tool call response.
+ *
+ * Detection is a plain substring search for `generate_mdma` over the
+ * stringified candidates, not a structural check of a tool-call object.
+ * NOTE(review): a prose mention of the tool name would therefore also count
+ * as a call — confirm that is acceptable for the `shouldCall: false` cases.
+ *
+ * Optional config:
+ * - shouldCall: boolean (default true) — set to false to assert that the
+ * model did NOT call the tool (e.g. for conversational / info requests).
+ *
+ * Any exception thrown while inspecting the inputs is reported as a failed
+ * assertion (score 0) rather than propagated.
+ */
+export default function (output, context) {
+ try {
+ const shouldCall = context?.config?.shouldCall ?? true;
+
+ // Candidate locations, each normalised to a string: strings are used as-is,
+ // null/undefined become '', everything else is JSON-serialised.
+ const parts = [
+ output,
+ context?.response,
+ context?.response?.output,
+ context?.response?.raw,
+ ].map((v) => {
+ if (v == null) return '';
+ if (typeof v === 'string') return v;
+ try {
+ return JSON.stringify(v);
+ } catch {
+ // Values JSON.stringify cannot handle contribute nothing to the search.
+ return '';
+ }
+ });
+
+ const combined = parts.join('\n');
+ const called = combined.includes('generate_mdma');
+
+ if (shouldCall) {
+ return {
+ pass: called,
+ score: called ? 1 : 0,
+ reason: called
+ ? 'Model correctly called generate_mdma tool'
+ : 'Model did not call generate_mdma β check tool definition and system prompt tool-use instruction',
+ };
+ }
+
+ // shouldCall === false: the expectation is inverted.
+ return {
+ pass: !called,
+ score: !called ? 1 : 0,
+ reason: !called
+ ? 'Model correctly did not call generate_mdma for a non-document request'
+ : 'Model should not have called generate_mdma for this request',
+ };
+ } catch (err) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Assertion error: ${err instanceof Error ? err.message : String(err)}`,
+ };
+ }
+}
diff --git a/evals/own-model/assertions/component-count.mjs b/evals/own-model/assertions/component-count.mjs
new file mode 100644
index 0000000..213f2e7
--- /dev/null
+++ b/evals/own-model/assertions/component-count.mjs
@@ -0,0 +1,17 @@
+/**
+ * Asserts that the output contains at least N mdma components.
+ * Uses config.min as the minimum count (default: 5).
+ */
+export default function (output, { config }) {
+ const min = config?.min || 5;
+ const blocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+
+ if (blocks.length >= min) {
+ return { pass: true, score: 1, reason: `Found ${blocks.length} components (min: ${min})` };
+ }
+ return {
+ pass: false,
+ score: blocks.length / min,
+ reason: `Expected at least ${min} components, found ${blocks.length}`,
+ };
+}
diff --git a/evals/own-model/assertions/exact-field-count.mjs b/evals/own-model/assertions/exact-field-count.mjs
new file mode 100644
index 0000000..907d9f6
--- /dev/null
+++ b/evals/own-model/assertions/exact-field-count.mjs
@@ -0,0 +1,32 @@
+/**
+ * Asserts that a form contains exactly N fields (using `- name:` occurrences).
+ *
+ * Uses `config.expected` as the expected count.
+ * Tolerant: passes if count matches exactly.
+ */
+export default function (output, { config }) {
+ const expected = config.expected;
+ if (!expected) {
+ return { pass: false, score: 0, reason: 'No config.expected (field count) provided' };
+ }
+
+ // Count field definitions inside mdma blocks
+ const blocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+ const formBlocks = blocks.filter((b) => b[1].includes('type: form'));
+
+ let totalFields = 0;
+ for (const block of formBlocks) {
+ const fieldNames = block[1].match(/- name:/g) || [];
+ totalFields += fieldNames.length;
+ }
+
+ if (totalFields === expected) {
+ return { pass: true, score: 1, reason: `Exactly ${expected} form fields found` };
+ }
+
+ return {
+ pass: false,
+ score: totalFields > expected ? 0.5 : totalFields / expected,
+ reason: `Expected exactly ${expected} form fields, found ${totalFields}`,
+ };
+}
diff --git a/evals/own-model/assertions/fields-preserved.mjs b/evals/own-model/assertions/fields-preserved.mjs
new file mode 100644
index 0000000..f7f0fe6
--- /dev/null
+++ b/evals/own-model/assertions/fields-preserved.mjs
@@ -0,0 +1,33 @@
+/**
+ * Asserts that specific field names are still present in the output.
+ *
+ * Used to verify that after a user requests an adjustment (e.g. tone change),
+ * the original fields defined in the MDMA document are preserved.
+ *
+ * Expects `assertion.value` to be a comma-separated list of field names/keywords
+ * that must all be present in the output.
+ */
+export default function (output, { assertion }) {
+ const requiredFields = assertion.value
+ .split(',')
+ .map((f) => f.trim())
+ .filter(Boolean);
+
+ const missing = requiredFields.filter(
+ (field) => !output.toLowerCase().includes(field.toLowerCase()),
+ );
+
+ if (missing.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `All ${requiredFields.length} fields preserved: ${requiredFields.join(', ')}`,
+ };
+ }
+
+ return {
+ pass: false,
+ score: (requiredFields.length - missing.length) / requiredFields.length,
+ reason: `Missing fields after adjustment: ${missing.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/file-field.mjs b/evals/own-model/assertions/file-field.mjs
new file mode 100644
index 0000000..a8d8520
--- /dev/null
+++ b/evals/own-model/assertions/file-field.mjs
@@ -0,0 +1,43 @@
+/**
+ * Asserts that the output contains a form with a file-typed field.
+ *
+ * Optional config:
+ * - sensitive: boolean β asserts the file field sets `sensitive: true`
+ *
+ * Note: `accept` and `multiple` are renderer-level concerns and are NOT part
+ * of the MDMA spec, so they are not asserted here.
+ */
+export default function (output, { config } = {}) {
+ const blockRegex = /```mdma\n([\s\S]*?)```/g;
+ const blocks = [...output.matchAll(blockRegex)].map((m) => m[1]);
+
+ const formBlocks = blocks.filter((b) => /^type:\s*form/m.test(b));
+ if (formBlocks.length === 0) {
+ return { pass: false, score: 0, reason: 'No form block found in output' };
+ }
+
+ const fileBlock = formBlocks.find((b) => /type:\s*file\b/.test(b));
+ if (!fileBlock) {
+ return {
+ pass: false,
+ score: 0,
+ reason: 'No form field with `type: file` found',
+ };
+ }
+
+ const reasons = ['Form contains a file field'];
+
+ if (config?.sensitive === true) {
+ const sensitivePattern = /type:\s*file[\s\S]{0,200}sensitive:\s*true/;
+ if (!sensitivePattern.test(fileBlock)) {
+ return {
+ pass: false,
+ score: 0,
+ reason: 'File field expected sensitive: true but not found',
+ };
+ }
+ reasons.push('sensitive: true');
+ }
+
+ return { pass: true, score: 1, reason: reasons.join('; ') };
+}
diff --git a/evals/own-model/assertions/fixer-contains-component.mjs b/evals/own-model/assertions/fixer-contains-component.mjs
new file mode 100644
index 0000000..927b79f
--- /dev/null
+++ b/evals/own-model/assertions/fixer-contains-component.mjs
@@ -0,0 +1,139 @@
+import { parse } from 'yaml';
+
+/**
+ * Custom promptfoo assertion for fixer eval.
+ *
+ * Finds a component in the fixed output and validates its fields against an
+ * expected MDMA block provided in config.
+ *
+ * config:
+ *   expected: string — complete (or partial) MDMA block YAML to compare against.
+ *             The `id` field in the expected block is used to locate the
+ *             component in the output. Every field present in `expected`
+ *             must match the actual component — extra fields in the
+ *             actual output are ignored. A field whose expected value is
+ *             null is a presence check only.
+ *   hasFields: string[] — additional top-level field names that must exist
+ *             with a non-empty value (any value).
+ *
+ * Example:
+ *   config:
+ *     expected: |
+ *       type: webhook
+ *       id: order-webhook
+ *       url: https://api.example.com/orders
+ *       method: POST
+ *       trigger: order-form
+ *
+ * Returns a promptfoo grading result; the score is 1 on a full match and 0
+ * otherwise (there is no partial credit).
+ */
+export default function (output, { config } = {}) {
+ const { expected: expectedYaml, hasFields = [] } = config ?? {};
+
+ if (!expectedYaml) {
+ return { pass: false, score: 0, reason: 'No expected block provided in config' };
+ }
+
+ let expected;
+ try {
+ expected = parse(expectedYaml);
+ } catch (e) {
+ return { pass: false, score: 0, reason: `Could not parse expected block: ${e.message}` };
+ }
+
+ // The id is the lookup key; without it the component cannot be located.
+ const id = expected?.id;
+ if (!id) {
+ return { pass: false, score: 0, reason: 'Expected block has no id field' };
+ }
+
+ // Extract raw YAML strings from each ```mdma block in the output
+ const blocks = [];
+ const blockRegex = /```mdma\n([\s\S]*?)```/g;
+ let match;
+ while ((match = blockRegex.exec(output)) !== null) {
+ blocks.push(match[1]);
+ }
+
+ // Find and parse the block whose top-level id matches
+ let actual = null;
+ let actualRaw = null;
+ for (const raw of blocks) {
+ let parsed;
+ try {
+ parsed = parse(raw);
+ } catch {
+ // A block that is not valid YAML cannot be the match; keep looking.
+ continue;
+ }
+ if (parsed?.id === id) {
+ actual = parsed;
+ actualRaw = raw.trim();
+ break;
+ }
+ }
+
+ if (!actual) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Component "${id}" not found in output (${blocks.length} block(s) present)`,
+ };
+ }
+
+ // Deep compare every field in expected against actual
+ const failures = compareFields(expected, actual, '');
+
+ // Check hasFields presence
+ for (const field of hasFields) {
+ if (actual[field] === undefined || actual[field] === null || actual[field] === '') {
+ failures.push(`field "${field}" is missing or empty`);
+ }
+ }
+
+ if (failures.length > 0) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Component "${id}" field mismatch:\n${failures.join('\n')}\n\nActual block:\n${actualRaw}`,
+ };
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: `Component "${id}" matches expected block`,
+ };
+}
+
+/**
+ * Recursively checks that every key present in `expected` has a matching
+ * value in `actual`. Keys that exist only in `actual` are ignored.
+ *
+ * Rules, per key:
+ * - expected is null/undefined: presence check — actual must not be
+ *   undefined, null or ''.
+ * - expected is an array: actual must be an array of the same length;
+ *   object elements are compared recursively, other elements with `!==`.
+ * - expected is an object: compared recursively (a missing actual value is
+ *   treated as {}).
+ * - anything else: strict `!==` comparison.
+ *
+ * @param expected - object (or array element) holding the required values.
+ * @param actual - parsed component, or the corresponding nested value.
+ * @param prefix - path of the parent, '' at the top level; used only to
+ *   build the paths shown in messages.
+ * @returns list of human-readable mismatch messages; empty when everything matches.
+ */
+function compareFields(expected, actual, prefix) {
+ const failures = [];
+ for (const [key, expectedVal] of Object.entries(expected)) {
+ const path = prefix ? `${prefix}.${key}` : key;
+ const actualVal = actual?.[key];
+
+ if (expectedVal === null || expectedVal === undefined) {
+ // null in expected = presence check only
+ if (actualVal === undefined || actualVal === null || actualVal === '') {
+ failures.push(`"${path}" is missing or empty`);
+ }
+ } else if (Array.isArray(expectedVal)) {
+ if (!Array.isArray(actualVal)) {
+ failures.push(`"${path}" should be an array, got ${typeof actualVal}`);
+ } else if (expectedVal.length !== actualVal.length) {
+ failures.push(`"${path}" length: expected ${expectedVal.length}, got ${actualVal.length}`);
+ } else {
+ // Same length: compare element by element, in order.
+ for (let i = 0; i < expectedVal.length; i++) {
+ if (typeof expectedVal[i] === 'object' && expectedVal[i] !== null) {
+ failures.push(...compareFields(expectedVal[i], actualVal[i] ?? {}, `${path}[${i}]`));
+ } else if (expectedVal[i] !== actualVal[i]) {
+ failures.push(
+ `"${path}[${i}]": expected ${JSON.stringify(expectedVal[i])}, got ${JSON.stringify(actualVal[i])}`,
+ );
+ }
+ }
+ }
+ } else if (typeof expectedVal === 'object') {
+ // Nested mapping: recurse, extending the path.
+ failures.push(...compareFields(expectedVal, actualVal ?? {}, path));
+ } else if (actualVal !== expectedVal) {
+ failures.push(
+ `"${path}": expected ${JSON.stringify(expectedVal)}, got ${JSON.stringify(actualVal)}`,
+ );
+ }
+ }
+ return failures;
+}
diff --git a/evals/own-model/assertions/fixer-no-prose.mjs b/evals/own-model/assertions/fixer-no-prose.mjs
new file mode 100644
index 0000000..0746d09
--- /dev/null
+++ b/evals/own-model/assertions/fixer-no-prose.mjs
@@ -0,0 +1,31 @@
+/**
+ * Custom promptfoo assertion for fixer eval.
+ *
+ * Enforces that the fixer output contains ONLY ```mdma blocks β no prose,
+ * headings, intro/outro text, or commentary outside the blocks. The fixer's
+ * job is to repair MDMA blocks, not to converse with the user.
+ *
+ * Allowed in the output: ```mdma blocks and whitespace between them.
+ * Disallowed: prose paragraphs, Markdown headings, lists, code fences other
+ * than `mdma`, or any text outside a ```mdma ... ``` pair.
+ */
+export default function (output) {
+ // Strip every ```mdma ... ``` block (greedy across newlines, non-greedy on content)
+ const stripped = output.replace(/```mdma\n[\s\S]*?```/g, '').trim();
+
+ if (stripped.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: 'Fixer output contains only ```mdma blocks (no prose)',
+ };
+ }
+
+ // Truncate the offending content for the failure message
+ const preview = stripped.length > 200 ? `${stripped.slice(0, 200)}...` : stripped;
+ return {
+ pass: false,
+ score: 0,
+ reason: `Fixer output contains non-mdma content (${stripped.length} chars):\n${preview}`,
+ };
+}
diff --git a/evals/own-model/assertions/fixer-preserves-components.mjs b/evals/own-model/assertions/fixer-preserves-components.mjs
new file mode 100644
index 0000000..2b455d3
--- /dev/null
+++ b/evals/own-model/assertions/fixer-preserves-components.mjs
@@ -0,0 +1,33 @@
+/**
+ * Custom promptfoo assertion for fixer eval.
+ *
+ * Verifies that the fixer didn't drop components. The fixed output
+ * should contain at least config.min mdma blocks (default: same as input).
+ */
+export default function (output, { config } = {}) {
+ const min = config?.min ?? 1;
+ const max = config?.max ?? Number.POSITIVE_INFINITY;
+ const blockCount = (output.match(/```mdma/g) ?? []).length;
+
+ if (blockCount < min) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Fixer output has ${blockCount} mdma block(s) but expected at least ${min}`,
+ };
+ }
+
+ if (blockCount > max) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Fixer output has ${blockCount} mdma block(s) but expected at most ${max}`,
+ };
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: `Fixer preserved ${blockCount} mdma block(s) (min: ${min}${max !== Number.POSITIVE_INFINITY ? `, max: ${max}` : ''})`,
+ };
+}
diff --git a/evals/own-model/assertions/fixer-resolves-errors.mjs b/evals/own-model/assertions/fixer-resolves-errors.mjs
new file mode 100644
index 0000000..d675c19
--- /dev/null
+++ b/evals/own-model/assertions/fixer-resolves-errors.mjs
@@ -0,0 +1,63 @@
+import { validate } from '@mobile-reality/mdma-validator';
+
+/**
+ * Custom promptfoo assertion for fixer eval.
+ *
+ * Validates that the LLM-fixed output:
+ * 1. Contains at least one mdma block (didn't strip everything)
+ * 2. Has zero unfixed errors after validation
+ * 3. Reports remaining warnings/infos for transparency
+ *
+ * The config.maxWarnings option (default: Infinity) allows tests to assert
+ * that the fixer also resolved warnings.
+ *
+ * The config.exclude option lists validator rule ids to skip; it defaults to
+ * ['thinking-block', 'flow-ordering'].
+ *
+ * Scores: 0 when blocks are missing or errors remain, 0.5 when only the
+ * warning budget is exceeded, 1 otherwise.
+ */
+export default function (output, { config } = {}) {
+ const maxWarnings = config?.maxWarnings ?? Infinity;
+ const exclude = config?.exclude ?? ['thinking-block', 'flow-ordering'];
+
+ // Check the output actually contains mdma blocks
+ const blockCount = (output.match(/```mdma/g) ?? []).length;
+ if (blockCount === 0) {
+ return {
+ pass: false,
+ score: 0,
+ reason: 'Fixer output contains no ```mdma blocks β the LLM may have stripped the document',
+ };
+ }
+
+ // autoFix is off so the validator reports issues as-is.
+ // NOTE(review): presumably autoFix would otherwise repair the document before reporting — confirm against the validator.
+ const result = validate(output, {
+ exclude,
+ autoFix: false,
+ });
+
+ const unfixedErrors = result.issues.filter((i) => i.severity === 'error');
+ const unfixedWarnings = result.issues.filter((i) => i.severity === 'warning');
+
+ // Any remaining error fails the assertion outright.
+ if (unfixedErrors.length > 0) {
+ const details = unfixedErrors
+ .map((i) => `[${i.ruleId}] ${i.componentId ?? '?'}: ${i.message}`)
+ .join('\n');
+ return {
+ pass: false,
+ score: 0,
+ reason: `Fixer output still has ${unfixedErrors.length} error(s):\n${details}`,
+ };
+ }
+
+ // No errors, but more warnings than the test allows: partial credit.
+ if (unfixedWarnings.length > maxWarnings) {
+ const details = unfixedWarnings
+ .map((i) => `[${i.ruleId}] ${i.componentId ?? '?'}: ${i.message}`)
+ .join('\n');
+ return {
+ pass: false,
+ score: 0.5,
+ reason: `Fixer output has ${unfixedWarnings.length} warning(s) (max ${maxWarnings}):\n${details}`,
+ };
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: `Fixer resolved all errors (${result.summary.warnings} warnings, ${result.summary.infos} info, ${blockCount} blocks)`,
+ };
+}
diff --git a/evals/own-model/assertions/form-fields-match.mjs b/evals/own-model/assertions/form-fields-match.mjs
new file mode 100644
index 0000000..311db89
--- /dev/null
+++ b/evals/own-model/assertions/form-fields-match.mjs
@@ -0,0 +1,101 @@
+/**
+ * Deep validation: checks that generated mdma form blocks contain the
+ * expected fields with correct attributes.
+ *
+ * config.expectedForms: Array of { fields: string[], sensitive?: string[] }
+ * - fields: field names that must appear in the form block
+ * - sensitive: field names that must be marked sensitive: true
+ *
+ * If multiple expectedForms are provided, they are matched in order to
+ * the mdma form blocks found in the output.
+ */
+export default function (output, { config }) {
+ const expectedForms = config?.expectedForms || [];
+ if (expectedForms.length === 0) {
+ return { pass: true, score: 1, reason: 'No expected forms to check' };
+ }
+
+ // Extract all mdma form blocks
+ const blockRegex = /```mdma\n([\s\S]*?)```/g;
+ const blocks = [...output.matchAll(blockRegex)];
+ const formBlocks = blocks.map((b) => b[1].trim()).filter((b) => /^type:\s*form/m.test(b));
+
+ if (formBlocks.length === 0) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Expected ${expectedForms.length} form block(s) but found none`,
+ };
+ }
+
+ const results = [];
+ let totalScore = 0;
+
+ for (let i = 0; i < expectedForms.length; i++) {
+ const expected = expectedForms[i];
+ const block = formBlocks[i];
+
+ if (!block) {
+ results.push(`Form ${i + 1}: missing (expected ${expected.fields.length} fields)`);
+ continue;
+ }
+
+ const blockLower = block.toLowerCase();
+
+ // Check field names
+ const fieldsFound = expected.fields.filter(
+ (f) =>
+ blockLower.includes(`name: ${f.toLowerCase()}`) ||
+ blockLower.includes(`name: "${f.toLowerCase()}"`),
+ );
+ const fieldScore = fieldsFound.length / expected.fields.length;
+
+ // Check onSubmit is present
+ const hasOnSubmit = /onSubmit:\s*\S+/i.test(block);
+ if (!hasOnSubmit) {
+ results.push(`Form ${i + 1}: missing onSubmit (no submit button)`);
+ }
+
+ // Check sensitive flags
+ let sensitiveScore = 1;
+ if (expected.sensitive && expected.sensitive.length > 0) {
+ // For each sensitive field, check that it has sensitive: true nearby
+ let sensitiveFound = 0;
+ for (const sf of expected.sensitive) {
+ // Find the field block and check for sensitive: true
+ const fieldPattern = new RegExp(`name:\\s*"?${sf}"?[\\s\\S]{0,200}sensitive:\\s*true`, 'i');
+ if (fieldPattern.test(block)) {
+ sensitiveFound++;
+ }
+ }
+ sensitiveScore = sensitiveFound / expected.sensitive.length;
+ }
+
+ const submitScore = hasOnSubmit ? 1 : 0;
+ const formScore = (fieldScore + sensitiveScore + submitScore) / 3;
+ totalScore += formScore;
+
+ const missingFields = expected.fields.filter((f) => !fieldsFound.includes(f));
+ if (missingFields.length > 0) {
+ results.push(
+ `Form ${i + 1}: missing fields [${missingFields.join(', ')}] (${fieldsFound.length}/${expected.fields.length} found)`,
+ );
+ }
+ if (sensitiveScore < 1 && expected.sensitive) {
+ results.push(
+ `Form ${i + 1}: some sensitive flags missing (score: ${sensitiveScore.toFixed(2)})`,
+ );
+ }
+ if (missingFields.length === 0 && sensitiveScore === 1) {
+ results.push(`Form ${i + 1}: all ${expected.fields.length} fields correct`);
+ }
+ }
+
+ const avgScore = totalScore / expectedForms.length;
+
+ return {
+ pass: avgScore >= 0.5,
+ score: avgScore,
+ reason: results.join('; '),
+ };
+}
diff --git a/evals/own-model/assertions/has-bindings.mjs b/evals/own-model/assertions/has-bindings.mjs
new file mode 100644
index 0000000..2921e5f
--- /dev/null
+++ b/evals/own-model/assertions/has-bindings.mjs
@@ -0,0 +1,16 @@
+/**
+ * Asserts that the output contains binding expressions ({{ }}).
+ */
+export default function (output) {
+ const bindingPattern = /\{\{[a-z][a-zA-Z0-9_-]*\.[a-zA-Z0-9_.-]+\}\}/g;
+ const matches = output.match(bindingPattern) || [];
+
+ if (matches.length > 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `Found ${matches.length} binding(s): ${matches.slice(0, 3).join(', ')}`,
+ };
+ }
+ return { pass: false, score: 0, reason: 'No binding expressions ({{component.field}}) found' };
+}
diff --git a/evals/own-model/assertions/has-confirm.mjs b/evals/own-model/assertions/has-confirm.mjs
new file mode 100644
index 0000000..23ad212
--- /dev/null
+++ b/evals/own-model/assertions/has-confirm.mjs
@@ -0,0 +1,17 @@
+/**
+ * Asserts that the output contains a button with a confirm dialog.
+ */
+export default function (output) {
+ const hasButton = output.includes('type: button');
+ const hasConfirm = output.includes('confirm:');
+ const hasConfirmText = output.includes('confirmText:') || output.includes('message:');
+
+ if (hasButton && hasConfirm && hasConfirmText) {
+ return { pass: true, score: 1, reason: 'Button with confirmation dialog found' };
+ }
+ return {
+ pass: false,
+ score: hasButton ? 0.5 : 0,
+ reason: `Expected button with confirm dialog. ${!hasButton ? 'No button found' : 'Missing confirm config'}`,
+ };
+}
diff --git a/evals/own-model/assertions/has-required-fields.mjs b/evals/own-model/assertions/has-required-fields.mjs
new file mode 100644
index 0000000..e128ea6
--- /dev/null
+++ b/evals/own-model/assertions/has-required-fields.mjs
@@ -0,0 +1,17 @@
+/**
+ * Asserts that the output contains at least N fields with required: true.
+ * Uses config.min as the minimum count (default: 2).
+ */
+export default function (output, { config }) {
+ const minRequired = config?.min || 2;
+ const matches = output.match(/required:\s*true/g) || [];
+
+ if (matches.length >= minRequired) {
+ return { pass: true, score: 1, reason: `Found ${matches.length} required fields` };
+ }
+ return {
+ pass: false,
+ score: matches.length / minRequired,
+ reason: `Expected at least ${minRequired} required: true flags, found ${matches.length}`,
+ };
+}
diff --git a/evals/own-model/assertions/has-sensitive.mjs b/evals/own-model/assertions/has-sensitive.mjs
new file mode 100644
index 0000000..8a64a74
--- /dev/null
+++ b/evals/own-model/assertions/has-sensitive.mjs
@@ -0,0 +1,9 @@
+/**
+ * Asserts that the output contains at least one sensitive: true flag.
+ */
+export default function (output) {
+ if (output.includes('sensitive: true')) {
+ return { pass: true, score: 1, reason: 'Found sensitive: true flag' };
+ }
+ return { pass: false, score: 0, reason: 'Expected at least one sensitive: true flag' };
+}
diff --git a/evals/own-model/assertions/has-webhook.mjs b/evals/own-model/assertions/has-webhook.mjs
new file mode 100644
index 0000000..bcc1313
--- /dev/null
+++ b/evals/own-model/assertions/has-webhook.mjs
@@ -0,0 +1,18 @@
+/**
+ * Asserts that the output contains a webhook component with required fields.
+ */
+export default function (output) {
+ const hasWebhook = output.includes('type: webhook');
+ const hasUrl = output.includes('url:');
+ const hasTrigger = output.includes('trigger:');
+
+ if (hasWebhook && hasUrl && hasTrigger) {
+ return { pass: true, score: 1, reason: 'Webhook with url and trigger found' };
+ }
+
+ if (!hasWebhook) {
+ return { pass: false, score: 0, reason: 'No webhook component found' };
+ }
+ const missing = [!hasUrl && 'url', !hasTrigger && 'trigger'].filter(Boolean);
+ return { pass: false, score: 0.5, reason: `Webhook missing: ${missing.join(', ')}` };
+}
diff --git a/evals/own-model/assertions/judge-matches-expected.mjs b/evals/own-model/assertions/judge-matches-expected.mjs
new file mode 100644
index 0000000..f49d2c6
--- /dev/null
+++ b/evals/own-model/assertions/judge-matches-expected.mjs
@@ -0,0 +1,127 @@
+import { validateConversation } from '@mobile-reality/mdma-validator';
+
+/**
+ * Custom promptfoo assertion for the conversation-judge eval.
+ *
+ * Required:
+ * - `vars.expectedJudgment` — 'valid' | 'invalid'
+ *
+ * Optional per-test config:
+ * - `expectedRules: string[]` — when expectedJudgment is 'invalid',
+ * rule names that MUST appear in the LLM judge's issues array.
+ *
+ * Optional cross-check (turned on when `vars.steps` is provided):
+ * - Runs `validateConversation()` on the assistant messages with the
+ * given step definition. Asserts the deterministic validator agrees
+ * with both `vars.expectedJudgment` AND the LLM judge.
+ *
+ * Passes only when every check it ran agrees. Fails on the first
+ * disagreement and reports what was off (LLM, validator, or both).
+ */
+export default function (output, context) {
+ const vars = context?.vars ?? {};
+ const config = context?.config ?? {};
+ const expectedJudgment = vars.expectedJudgment;
+
+ if (expectedJudgment !== 'valid' && expectedJudgment !== 'invalid') { // misconfigured test: fail fast with the offending value
+ return {
+ pass: false,
+ score: 0,
+ reason: `Test missing or invalid vars.expectedJudgment (got: ${JSON.stringify(expectedJudgment)})`,
+ };
+ }
+
+ // --- Parse the LLM judge's JSON output ---
+ const fencedMatch = output.match(/```(?:json)?\s*\n?(\{[\s\S]*?\})\s*\n?```/); // prefer a fenced (optionally json-tagged) object
+ const candidate = fencedMatch ? fencedMatch[1] : output.trim(); // otherwise try the whole trimmed output
+
+ let judgment;
+ try {
+ judgment = JSON.parse(candidate);
+ } catch (err) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Judge output is not valid JSON: ${err.message}\nOutput (first 300 chars): ${output.slice(0, 300)}`,
+ };
+ }
+ if (typeof judgment?.valid !== 'boolean' || !Array.isArray(judgment.issues)) { // minimal shape check of the judge's verdict
+ return {
+ pass: false,
+ score: 0,
+ reason: `Judge JSON missing required fields (boolean "valid" and array "issues")`,
+ };
+ }
+
+ const expectedValid = expectedJudgment === 'valid';
+ const llmValid = judgment.valid;
+
+ // --- Check 1: LLM judge matches expectedJudgment ---
+ if (llmValid !== expectedValid) {
+ const issuesSummary = judgment.issues
+ .slice(0, 5) // cap the report at the first five issues
+ .map((i) => ` [msg ${i.messageIndex}, ${i.rule}] ${i.issue}`)
+ .join('\n');
+ return {
+ pass: false,
+ score: 0,
+ reason: `LLM judge expected "${expectedJudgment}" but returned "${llmValid ? 'valid' : 'invalid'}".\nJudge's issues:\n${issuesSummary || ' (none)'}`,
+ };
+ }
+
+ // --- Check 2: required rules surfaced (only for invalid cases) ---
+ const expectedRules = Array.isArray(config.expectedRules) ? config.expectedRules : null;
+ if (expectedRules && !expectedValid) {
+ const seenRules = new Set(judgment.issues.map((i) => i.rule));
+ const missing = expectedRules.filter((r) => !seenRules.has(r));
+ if (missing.length > 0) {
+ return {
+ pass: false,
+ score: 0.5,
+ reason: `LLM judge correctly marked invalid but missed expected rule violation(s): ${missing.join(', ')}.\nSeen rules: ${[...seenRules].join(', ') || '(none)'}`,
+ };
+ }
+ }
+
+ // --- Check 3: cross-check against validateConversation (deterministic) ---
+ // Activated when the test provides `vars.steps`. Runs the deterministic
+ // validator on the assistant messages and asserts it agrees with both
+ // the expected judgment AND the LLM's judgment.
+ let crossCheckSummary = '';
+ if (Array.isArray(vars.steps) && vars.steps.length > 0) {
+ const assistantMessages = (Array.isArray(vars.conversation) ? vars.conversation : [])
+ .filter((t) => t.role === 'assistant')
+ .map((t) => t.content ?? ''); // missing content is validated as an empty message
+
+ const validatorResult = validateConversation(assistantMessages, {
+ steps: vars.steps,
+ exclude: ['thinking-block'],
+ });
+ const validatorOk = validatorResult.ok;
+
+ if (validatorOk !== expectedValid) {
+ const errs = validatorResult.issues
+ .filter((i) => i.severity === 'error')
+ .slice(0, 5)
+ .map((i) => ` [msg ${i.messageIndex}] ${i.message}`)
+ .join('\n');
+ return {
+ pass: false,
+ score: 0,
+ reason: `validateConversation disagrees with expected judgment.\nExpected: "${expectedJudgment}".\nDeterministic validator: "${validatorOk ? 'valid' : 'invalid'}".\nLLM judge: "${llmValid ? 'valid' : 'invalid'}".\nValidator errors:\n${errs || ' (none)'}`,
+ };
+ }
+
+ // Both agree with expected — cross-check passed
+ const errCount = validatorResult.issues.filter((i) => i.severity === 'error').length;
+ crossCheckSummary = ` | validator: ${validatorOk ? 'ok' : `${errCount} error(s)`}`;
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: expectedValid
+ ? `Judge correctly marked the conversation as valid${crossCheckSummary}`
+ : `Judge correctly marked the conversation as invalid (${judgment.issues.length} issue${judgment.issues.length === 1 ? '' : 's'})${crossCheckSummary}`,
+ };
+}
diff --git a/evals/own-model/assertions/mentions-fields.mjs b/evals/own-model/assertions/mentions-fields.mjs
new file mode 100644
index 0000000..b1b68b9
--- /dev/null
+++ b/evals/own-model/assertions/mentions-fields.mjs
@@ -0,0 +1,34 @@
+/**
+ * Asserts that the generated prompt mentions a minimum percentage of the
+ * configured field names (case-insensitive substring match).
+ *
+ * config.fields: string[] — field names to look for
+ * config.minRatio: number — minimum ratio of fields that must appear (default: 0.5)
+ */
+export default function (output, { config }) {
+ const fields = config?.fields || [];
+ const minRatio = config?.minRatio ?? 0.5;
+
+ if (fields.length === 0) { // nothing configured: vacuous pass
+ return { pass: true, score: 1, reason: 'No fields to check' };
+ }
+
+ const lower = output.toLowerCase();
+ const found = fields.filter((f) => lower.includes(f.toLowerCase()));
+ const ratio = found.length / fields.length; // also used as the score in both outcomes
+
+ if (ratio >= minRatio) {
+ return {
+ pass: true,
+ score: ratio,
+ reason: `Found ${found.length}/${fields.length} field names (${(ratio * 100).toFixed(0)}%)`,
+ };
+ }
+
+ const missing = fields.filter((f) => !lower.includes(f.toLowerCase())); // listed in the failure reason
+ return {
+ pass: false,
+ score: ratio,
+ reason: `Only found ${found.length}/${fields.length} field names (need ${(minRatio * 100).toFixed(0)}%). Missing: ${missing.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/mentions-trigger.mjs b/evals/own-model/assertions/mentions-trigger.mjs
new file mode 100644
index 0000000..bca49ea
--- /dev/null
+++ b/evals/own-model/assertions/mentions-trigger.mjs
@@ -0,0 +1,114 @@
+/**
+ * Asserts that the generated customPrompt includes trigger/when-to-generate
+ * instructions matching the configured trigger mode.
+ *
+ * config.mode: 'keyword' | 'immediate' | 'contextual' | 'form-submit' | 'multi-step'
+ * config.keywords: string[] — for keyword mode, specific phrases to check
+ * config.contextHints: string[] — for contextual mode, hints to look for
+ * config.steps: { mode: string, keywords?: string[] }[] — for multi-step mode
+ */
+export default function (output, { config }) {
+ const mode = config?.mode;
+ const lower = output.toLowerCase(); // keyword/hint/marker checks run against the lower-cased output
+
+ if (mode === 'keyword') {
+ const keywords = config?.keywords || [];
+ if (keywords.length === 0) {
+ return { pass: true, score: 1, reason: 'No keywords to check' };
+ }
+ const found = keywords.filter((kw) => lower.includes(kw.toLowerCase()));
+ if (found.length > 0) { // one matching keyword is enough to pass; score is the matched fraction
+ return {
+ pass: true,
+ score: found.length / keywords.length,
+ reason: `Found ${found.length}/${keywords.length} trigger keywords: ${found.join(', ')}`,
+ };
+ }
+ return {
+ pass: false,
+ score: 0,
+ reason: `None of the trigger keywords found: ${keywords.join(', ')}`,
+ };
+ }
+
+ if (mode === 'immediate') {
+ const markers = /immediate|first message|always|conversation start|right away/;
+ if (markers.test(lower)) {
+ return { pass: true, score: 1, reason: 'Found immediate trigger instruction' };
+ }
+ return { pass: false, score: 0, reason: 'Missing immediate trigger instruction' };
+ }
+
+ if (mode === 'contextual') {
+ const hints = config?.contextHints || [];
+ if (hints.length === 0) { // no explicit hints: fall back to generic conditional wording
+ const contextMarkers = /when.*user|after.*attempt|if.*express|condition|context/;
+ if (contextMarkers.test(lower)) {
+ return { pass: true, score: 1, reason: 'Found contextual trigger language' };
+ }
+ return { pass: false, score: 0, reason: 'Missing contextual trigger language' };
+ }
+ const found = hints.filter((h) => lower.includes(h.toLowerCase()));
+ if (found.length > 0) { // one matching hint is enough to pass; score is the matched fraction
+ return {
+ pass: true,
+ score: found.length / hints.length,
+ reason: `Found ${found.length}/${hints.length} context hints`,
+ };
+ }
+ return {
+ pass: false,
+ score: 0,
+ reason: `None of the contextual hints found: ${hints.join(', ')}`,
+ };
+ }
+
+ if (mode === 'form-submit') {
+ const markers = /submit|after.*form|previous step|form.*complet|upon.*submis/;
+ if (markers.test(lower)) {
+ return { pass: true, score: 1, reason: 'Found form-submit trigger instruction' };
+ }
+ return { pass: false, score: 0, reason: 'Missing form-submit trigger instruction' };
+ }
+
+ if (mode === 'multi-step') {
+ // Check that output describes a multi-step / sequential flow
+ const stepMarkers =
+ /step\s*[12345]|phase\s*[12345]|first.*then|after.*submit|next.*step|sequential|in order/i;
+ if (!stepMarkers.test(output)) {
+ return { pass: false, score: 0, reason: 'Output does not describe a multi-step flow' };
+ }
+
+ // Optionally check per-step trigger modes
+ const steps = config?.steps || [];
+ if (steps.length === 0) {
+ return { pass: true, score: 1, reason: 'Found multi-step flow language' };
+ }
+
+ let matched = 0; // number of configured steps whose trigger wording was found
+ for (const step of steps) {
+ if (step.mode === 'keyword' && step.keywords) {
+ const found = step.keywords.some((kw) => lower.includes(kw.toLowerCase()));
+ if (found) matched++;
+ } else if (step.mode === 'immediate') {
+ if (/immediate|first message|always|conversation start/.test(lower)) matched++;
+ } else if (step.mode === 'form-submit') {
+ if (/submit|after.*form|previous step/.test(lower)) matched++;
+ } else if (step.mode === 'contextual' && step.keywords) { // per-step contextual checks read step.keywords, not contextHints
+ const found = step.keywords.some((kw) => lower.includes(kw.toLowerCase()));
+ if (found) matched++;
+ } else {
+ matched++; // no specific check, count as passed
+ }
+ }
+
+ const score = matched / steps.length;
+ return {
+ pass: score >= 0.5, // at least half of the steps must match
+ score,
+ reason: `Matched ${matched}/${steps.length} step triggers in multi-step flow`,
+ };
+ }
+
+ return { pass: true, score: 1, reason: 'No trigger mode specified' }; // also reached for any unrecognized mode value
+}
diff --git a/evals/own-model/assertions/no-mdma-regeneration.mjs b/evals/own-model/assertions/no-mdma-regeneration.mjs
new file mode 100644
index 0000000..0157e24
--- /dev/null
+++ b/evals/own-model/assertions/no-mdma-regeneration.mjs
@@ -0,0 +1,27 @@
+/**
+ * Asserts that the follow-up response does NOT contain full MDMA code blocks.
+ *
+ * After the initial generation, follow-up turns (tone changes, clarifications,
+ * field tweaks) should produce conversational responses β not regenerate the
+ * entire MDMA document from scratch.
+ */
+export default function (output) {
+ const mdmaBlocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+
+ // Thinking blocks are always required by the MDMA spec β don't count them as regeneration
+ const nonThinking = mdmaBlocks.filter((m) => !/^\s*type:\s*thinking\b/m.test(m[1]));
+
+ if (nonThinking.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: 'No MDMA blocks regenerated β conversational reply only',
+ };
+ }
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `Expected no MDMA blocks in follow-up, but found ${nonThinking.length} non-thinking block(s). The model regenerated the document instead of responding conversationally.`,
+ };
+}
diff --git a/evals/own-model/assertions/no-multi-step-flow.mjs b/evals/own-model/assertions/no-multi-step-flow.mjs
new file mode 100644
index 0000000..634f03c
--- /dev/null
+++ b/evals/own-model/assertions/no-multi-step-flow.mjs
@@ -0,0 +1,33 @@
+import { validate } from '@mobile-reality/mdma-validator';
+
+/**
+ * Custom promptfoo assertion for fixer eval.
+ *
+ * Verifies that the fixer output has no flow-ordering errors.
+ * This relies on the validator's own logic for detecting multi-step
+ * flows, circular references, and multiple interactive types.
+ */
+export default function (output) {
+ const result = validate(output, {
+ exclude: ['thinking-block'],
+ autoFix: false,
+ });
+
+ const flowErrors = result.issues.filter( // only error-severity issues from the flow-ordering rule fail this assertion
+ (i) => i.ruleId === 'flow-ordering' && i.severity === 'error',
+ );
+
+ if (flowErrors.length > 0) {
+ return {
+ pass: false,
+ score: 0,
+ reason: `Fixer output still has ${flowErrors.length} flow-ordering error(s):\n${flowErrors.map((i) => i.message).join('\n')}`,
+ };
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: 'No flow-ordering errors',
+ };
+}
diff --git a/evals/own-model/assertions/no-placeholder-content.mjs b/evals/own-model/assertions/no-placeholder-content.mjs
new file mode 100644
index 0000000..28144b0
--- /dev/null
+++ b/evals/own-model/assertions/no-placeholder-content.mjs
@@ -0,0 +1,55 @@
+/**
+ * Custom promptfoo assertion that checks for placeholder content
+ * in visible text and mdma blocks (excluding thinking blocks).
+ *
+ * Thinking blocks may mention placeholders as part of reasoning —
+ * that's fine. We only care about placeholders in rendered content.
+ */
+const PLACEHOLDER_PATTERNS = [
+ /\bTODO\b/i,
+ /\bTBD\b/i,
+ /\bFIXME\b/i,
+ /\bLorem\s*ipsum\b/i,
+ /^\.{3,}$/m, // a line consisting solely of three or more dots
+];
+
+export default function (output) { // returns { pass, score, reason }; fails on the first placeholder found
+ // Extract mdma blocks and classify them
+ const blocks = [...output.matchAll(/```mdma\s*([\s\S]*?)```/g)];
+
+ for (const block of blocks) {
+ const yaml = block[1];
+ // Skip thinking blocks
+ if (/^\s*type:\s*thinking\b/m.test(yaml)) continue;
+
+ for (const pattern of PLACEHOLDER_PATTERNS) {
+ if (pattern.test(yaml)) {
+ const match = yaml.match(pattern); // re-run to capture the offending text for the message
+ return {
+ pass: false,
+ score: 0,
+ reason: `Placeholder content "${match[0]}" found in mdma block`,
+ };
+ }
+ }
+ }
+
+ // Check visible prose (everything outside mdma blocks)
+ const prose = output.replace(/```mdma[\s\S]*?```/g, '');
+ for (const pattern of PLACEHOLDER_PATTERNS) {
+ if (pattern.test(prose)) {
+ const match = prose.match(pattern);
+ return {
+ pass: false,
+ score: 0,
+ reason: `Placeholder content "${match[0]}" found in visible text`,
+ };
+ }
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: 'No placeholder content found in visible output',
+ };
+}
diff --git a/evals/own-model/assertions/no-spec-repetition.mjs b/evals/own-model/assertions/no-spec-repetition.mjs
new file mode 100644
index 0000000..970737a
--- /dev/null
+++ b/evals/own-model/assertions/no-spec-repetition.mjs
@@ -0,0 +1,35 @@
+/**
+ * Asserts that the generated customPrompt does NOT repeat the full MDMA spec.
+ *
+ * A customPrompt should layer domain-specific instructions on top of the spec,
+ * not duplicate it. Checks for spec-level content that should not appear.
+ */
+export default function (output) {
+ const specMarkers = [ // exact, case-sensitive substrings; `label` is what the failure reason reports
+ { pattern: 'MDMA_AUTHOR_PROMPT', label: 'MDMA_AUTHOR_PROMPT reference' },
+ { pattern: '## Self-Check Checklist', label: 'Self-check checklist' },
+ { pattern: 'Component Reference Table', label: 'Component reference table' },
+ { pattern: 'MUST be inside a fenced code block tagged', label: 'Base authoring rule' },
+ ];
+
+ const found = []; // labels of every marker present in the output
+ for (const marker of specMarkers) {
+ if (output.includes(marker.pattern)) {
+ found.push(marker.label);
+ }
+ }
+
+ if (found.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: 'No MDMA spec content repeated',
+ };
+ }
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `CustomPrompt repeats MDMA spec content: ${found.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/no-yaml-leak.mjs b/evals/own-model/assertions/no-yaml-leak.mjs
new file mode 100644
index 0000000..6769d9d
--- /dev/null
+++ b/evals/own-model/assertions/no-yaml-leak.mjs
@@ -0,0 +1,42 @@
+/**
+ * Asserts that the response does not leak raw YAML syntax in visible text.
+ *
+ * MDMA YAML (type:, id:, sensitive:, fields:, etc.) should only appear inside
+ * fenced ```mdma blocks, never in the prose the user sees. This catches cases
+ * where the model dumps component internals outside of code fences.
+ */
+export default function (output) {
+ // Strip all fenced code blocks (mdma or otherwise) to get only visible text
+ const visibleText = output.replace(/```[\s\S]*?```/g, '');
+
+ // YAML-like patterns that should never appear in visible prose
+ const yamlPatterns = [ // every pattern is anchored to the start of a line (m flag)
+ /^type:\s*(form|button|tasklist|table|chart|callout|approval-gate|webhook|thinking)\b/m,
+ /^id:\s*[a-z][a-z0-9-]+$/m,
+ /^sensitive:\s*(true|false)$/m,
+ /^fields:\s*$/m,
+ /^columns:\s*$/m,
+ /^onSubmit:\s*/m,
+ /^onAction:\s*/m,
+ /^requiredApprovers:\s*\d+$/m,
+ /^variant:\s*(primary|secondary|danger|ghost|info|warning|error|success|line|bar|area|pie)\b/m,
+ ];
+
+ const leaks = []; // first match of each pattern that fired
+ for (const pattern of yamlPatterns) {
+ const match = visibleText.match(pattern);
+ if (match) {
+ leaks.push(match[0].trim());
+ }
+ }
+
+ if (leaks.length === 0) {
+ return { pass: true, score: 1, reason: 'No YAML leaked in visible text' };
+ }
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `Raw YAML leaked in visible text: ${leaks.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/only-components.mjs b/evals/own-model/assertions/only-components.mjs
new file mode 100644
index 0000000..80be80d
--- /dev/null
+++ b/evals/own-model/assertions/only-components.mjs
@@ -0,0 +1,46 @@
+/**
+ * Asserts that the output contains ONLY the allowed component types (plus thinking).
+ *
+ * Pass allowed types via `config.allowed` as an array of strings.
+ * e.g. config: { allowed: [form, button] }
+ *
+ * The thinking component is always implicitly allowed.
+ * Fails if any component type appears that is not in the allow-list.
+ */
+export default function (output, { config }) {
+ const allowed = new Set((config.allowed || []).map((t) => t.trim()));
+ allowed.add('thinking'); // always permitted
+
+ const blocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+ if (blocks.length === 0) {
+ return { pass: false, score: 0, reason: 'No MDMA blocks found' };
+ }
+
+ const found = [];
+ const unexpected = [];
+
+ for (const block of blocks) {
+ const typeMatch = block[1].match(/^type:\s*(.+)$/m);
+ if (!typeMatch) continue;
+ const type = typeMatch[1].trim();
+ found.push(type);
+ if (!allowed.has(type)) {
+ unexpected.push(type);
+ }
+ }
+
+ if (unexpected.length === 0) {
+ const nonThinking = found.filter((t) => t !== 'thinking');
+ return {
+ pass: true,
+ score: 1,
+ reason: `Only allowed components generated: ${nonThinking.join(', ')}`,
+ };
+ }
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `Unexpected component(s): ${unexpected.join(', ')}. Allowed: ${[...allowed].join(', ')}. All found: ${found.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/pie-chart.mjs b/evals/own-model/assertions/pie-chart.mjs
new file mode 100644
index 0000000..fe63aa2
--- /dev/null
+++ b/evals/own-model/assertions/pie-chart.mjs
@@ -0,0 +1,13 @@
+/**
+ * Asserts that the output contains a pie chart variant: `variant: pie`, bare or wrapped in single or double quotes, anywhere in the output.
+ */
+export default function (output) {
+ if (
+ output.includes('variant: pie') ||
+ output.includes("variant: 'pie'") ||
+ output.includes('variant: "pie"')
+ ) {
+ return { pass: true, score: 1, reason: 'Pie chart variant found' };
+ }
+ return { pass: false, score: 0, reason: 'Expected variant: pie in chart component' };
+}
diff --git a/evals/own-model/assertions/pii-sensitive.mjs b/evals/own-model/assertions/pii-sensitive.mjs
new file mode 100644
index 0000000..a4ae88e
--- /dev/null
+++ b/evals/own-model/assertions/pii-sensitive.mjs
@@ -0,0 +1,14 @@
+/**
+ * Asserts that at least 3 fields are marked sensitive: true (email, phone, SSN); occurrences are counted, field names are not inspected. `context` is unused.
+ */
+export default function (output, context) {
+ const matches = output.match(/sensitive:\s*true/g) || [];
+ if (matches.length >= 3) {
+ return { pass: true, score: 1, reason: `Found ${matches.length} sensitive flags` };
+ }
+ return {
+ pass: false,
+ score: matches.length / 3, // partial credit: fraction of the three required flags
+ reason: `Expected at least 3 sensitive: true flags, found ${matches.length}`,
+ };
+}
diff --git a/evals/own-model/assertions/prompt-has-sections.mjs b/evals/own-model/assertions/prompt-has-sections.mjs
new file mode 100644
index 0000000..cda19a9
--- /dev/null
+++ b/evals/own-model/assertions/prompt-has-sections.mjs
@@ -0,0 +1,43 @@
+/**
+ * Asserts that the generated customPrompt contains the expected structural sections.
+ *
+ * A well-structured customPrompt should include most of these elements:
+ * - Domain/role context
+ * - When to generate / trigger rules
+ * - Component instructions
+ * - Workflow or constraints
+ *
+ * Pass required section keywords via config.sections (array of regex patterns).
+ * By default checks for broad structural markers; every pattern must match to pass.
+ */
+export default function (output, { config }) {
+ const sections = config?.sections || [
+ 'domain|workflow|role|assist',
+ 'form|component|field',
+ 'sensitive|pii|personal',
+ ];
+
+ const lower = output.toLowerCase();
+ const missing = []; // patterns that matched nowhere in the output
+
+ for (const pattern of sections) {
+ const regex = new RegExp(pattern, 'i');
+ if (!regex.test(lower)) {
+ missing.push(pattern);
+ }
+ }
+
+ if (missing.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `All ${sections.length} expected section markers found`,
+ };
+ }
+
+ return {
+ pass: false,
+ score: (sections.length - missing.length) / sections.length, // fraction of patterns that matched
+ reason: `Missing section markers: ${missing.join(', ')}`,
+ };
+}
diff --git a/evals/own-model/assertions/prompt-length.mjs b/evals/own-model/assertions/prompt-length.mjs
new file mode 100644
index 0000000..088cafe
--- /dev/null
+++ b/evals/own-model/assertions/prompt-length.mjs
@@ -0,0 +1,33 @@
+/**
+ * Asserts that the generated customPrompt is within a reasonable length range (bounds are inclusive).
+ *
+ * config.min: minimum chars (default 200)
+ * config.max: maximum chars (default 8000)
+ */
+export default function (output, { config }) {
+ const min = config?.min ?? 200;
+ const max = config?.max ?? 8000;
+ const len = output.length; // string length of the output
+
+ if (len < min) {
+ return {
+ pass: false,
+ score: len / min, // partial credit proportional to how close the output is to the minimum
+ reason: `Output too short: ${len} chars (minimum ${min})`,
+ };
+ }
+
+ if (len > max) {
+ return {
+ pass: false,
+ score: max / len, // shrinks toward 0 as the output grows past the maximum
+ reason: `Output too long: ${len} chars (maximum ${max})`,
+ };
+ }
+
+ return {
+ pass: true,
+ score: 1,
+ reason: `Output length ${len} chars (within ${min}-${max})`,
+ };
+}
diff --git a/evals/own-model/assertions/respects-flow-order.mjs b/evals/own-model/assertions/respects-flow-order.mjs
new file mode 100644
index 0000000..73ab932
--- /dev/null
+++ b/evals/own-model/assertions/respects-flow-order.mjs
@@ -0,0 +1,72 @@
+/**
+ * Asserts that the generated customPrompt respects the multi-step flow order.
+ * Checks that step labels/numbers appear in sequence in the output.
+ *
+ * config.stepLabels: string[] β ordered labels to check sequence
+ * config.minSteps: number β minimum number of distinct steps expected (default: 2)
+ */
+export default function (output, { config }) {
+ const lower = output.toLowerCase();
+ const minSteps = config?.minSteps || 2;
+
+ // Check for step numbering or sequential language
+ const stepNumbers = [];
+ for (let i = 1; i <= 10; i++) {
+ const patterns = [
+ new RegExp(`step\\s*${i}\\b`, 'i'),
+ new RegExp(`phase\\s*${i}\\b`, 'i'),
+ new RegExp(`\\*\\*${i}[\\.\\)]`, 'i'),
+ ];
+ if (patterns.some((p) => p.test(output))) {
+ stepNumbers.push(i);
+ }
+ }
+
+ if (stepNumbers.length < minSteps) {
+ // Fall back to checking for sequential language
+ const sequentialMarkers = [/first|initial|begin/, /then|next|after|subsequent|once.*submit/];
+ const foundSequential = sequentialMarkers.filter((m) => m.test(lower)).length;
+ if (foundSequential >= minSteps) {
+ return {
+ pass: true,
+ score: 0.8,
+ reason: `Found sequential flow language (${foundSequential} markers) but no explicit step numbers`,
+ };
+ }
+ return {
+ pass: false,
+ score: stepNumbers.length / minSteps,
+ reason: `Found only ${stepNumbers.length} step references, expected at least ${minSteps}`,
+ };
+ }
+
+ // Check ordering is correct (step 1 before step 2, etc.)
+ let inOrder = true;
+ for (let i = 1; i < stepNumbers.length; i++) {
+ const prevPos = output.toLowerCase().indexOf(`step ${stepNumbers[i - 1]}`);
+ const currPos = output.toLowerCase().indexOf(`step ${stepNumbers[i]}`);
+ if (prevPos >= 0 && currPos >= 0 && prevPos > currPos) {
+ inOrder = false;
+ break;
+ }
+ }
+
+ // Check step labels if provided
+ const stepLabels = config?.stepLabels || [];
+ let labelsFound = 0;
+ if (stepLabels.length > 0) {
+ for (const label of stepLabels) {
+ if (lower.includes(label.toLowerCase())) labelsFound++;
+ }
+ }
+
+ const labelScore = stepLabels.length > 0 ? labelsFound / stepLabels.length : 1;
+ const orderScore = inOrder ? 1 : 0.5;
+ const score = (labelScore + orderScore) / 2;
+
+ return {
+ pass: score >= 0.5,
+ score,
+ reason: `Found ${stepNumbers.length} steps (${inOrder ? 'in order' : 'out of order'})${stepLabels.length > 0 ? `, ${labelsFound}/${stepLabels.length} labels matched` : ''}`,
+ };
+}
diff --git a/evals/own-model/assertions/select-has-options.mjs b/evals/own-model/assertions/select-has-options.mjs
new file mode 100644
index 0000000..c4e967a
--- /dev/null
+++ b/evals/own-model/assertions/select-has-options.mjs
@@ -0,0 +1,15 @@
+/**
+ * Asserts that the output contains a select field with an options array: both `type: select` and `options:` must appear somewhere in the output (not necessarily in the same field).
+ */
+export default function (output) {
+ const hasSelect = output.includes('type: select');
+ const hasOptions = output.includes('options:');
+ if (hasSelect && hasOptions) {
+ return { pass: true, score: 1, reason: 'Select field has options' };
+ }
+ return {
+ pass: false,
+ score: 0,
+ reason: `Missing ${!hasSelect ? 'type: select' : 'options array'}`, // reports the select first when both are absent
+ };
+}
diff --git a/evals/own-model/assertions/table-features.mjs b/evals/own-model/assertions/table-features.mjs
new file mode 100644
index 0000000..b9496bb
--- /dev/null
+++ b/evals/own-model/assertions/table-features.mjs
@@ -0,0 +1,18 @@
+/**
+ * Asserts that the output contains a table with sortable or filterable features (`type: table` plus `sortable: true` and/or `filterable: true`, by substring).
+ */
+export default function (output) {
+ const hasTable = output.includes('type: table');
+ const hasSortable = output.includes('sortable: true');
+ const hasFilterable = output.includes('filterable: true');
+
+ if (hasTable && (hasSortable || hasFilterable)) {
+ const features = [hasSortable && 'sortable', hasFilterable && 'filterable'].filter(Boolean); // names of the features that are present
+ return { pass: true, score: 1, reason: `Table with ${features.join(' and ')} found` };
+ }
+ return {
+ pass: false,
+ score: hasTable ? 0.5 : 0, // half credit for a table that lacks both features
+ reason: `Expected table with sortable/filterable. ${!hasTable ? 'No table found' : 'Missing data features'}`,
+ };
+}
diff --git a/evals/own-model/assertions/thinking-first.mjs b/evals/own-model/assertions/thinking-first.mjs
new file mode 100644
index 0000000..27c14ae
--- /dev/null
+++ b/evals/own-model/assertions/thinking-first.mjs
@@ -0,0 +1,14 @@
+/**
+ * Asserts that the first mdma block is a thinking component.
+ */
+export default function (output) {
+ const blocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+ if (blocks.length === 0) {
+ return { pass: false, score: 0, reason: 'No mdma blocks found' };
+ }
+ const firstBlock = blocks[0][1];
+ if (firstBlock.includes('type: thinking')) {
+ return { pass: true, score: 1, reason: 'Thinking block is first' };
+ }
+ return { pass: false, score: 0, reason: 'First mdma block is not a thinking component' };
+}
diff --git a/evals/own-model/assertions/unique-kebab-ids.mjs b/evals/own-model/assertions/unique-kebab-ids.mjs
new file mode 100644
index 0000000..dd9e97b
--- /dev/null
+++ b/evals/own-model/assertions/unique-kebab-ids.mjs
@@ -0,0 +1,24 @@
+/**
+ * Asserts that all component IDs are unique and follow kebab-case.
+ */
+export default function (output) {
+ const idMatches = [...output.matchAll(/^id:\s*(.+)$/gm)];
+ const ids = idMatches.map((m) => m[1].trim());
+
+ if (ids.length === 0) {
+ return { pass: false, score: 0, reason: 'No component IDs found' };
+ }
+
+ const unique = new Set(ids).size === ids.length;
+ if (!unique) {
+ return { pass: false, score: 0, reason: `Duplicate IDs found: ${ids.join(', ')}` };
+ }
+
+ const kebab = /^[a-z][a-z0-9]*(-[a-z0-9]+)*$/;
+ const nonKebab = ids.filter((id) => !kebab.test(id));
+ if (nonKebab.length > 0) {
+ return { pass: false, score: 0, reason: `Non-kebab-case IDs: ${nonKebab.join(', ')}` };
+ }
+
+ return { pass: true, score: 1, reason: `${ids.length} unique kebab-case IDs` };
+}
diff --git a/evals/own-model/assertions/validate-mdma-examples.mjs b/evals/own-model/assertions/validate-mdma-examples.mjs
new file mode 100644
index 0000000..0101755
--- /dev/null
+++ b/evals/own-model/assertions/validate-mdma-examples.mjs
@@ -0,0 +1,60 @@
+import { validate } from '@mobile-reality/mdma-validator';
+
+/**
+ * Extracts ```mdma blocks from a customPrompt and validates each one
+ * as a standalone MDMA document.
+ *
+ * Unlike validate-mdma.mjs (which validates the entire output as a document),
+ * this assertion handles the case where mdma blocks are embedded as examples
+ * inside instructional prose.
+ */
+export default function (output) {
+ const blockRegex = /```mdma\n([\s\S]*?)```/g;
+ const blocks = [...output.matchAll(blockRegex)];
+
+ if (blocks.length === 0) { // a prompt without embedded examples passes
+ return {
+ pass: true,
+ score: 1,
+ reason: 'No mdma example blocks to validate (OK for customPrompt)',
+ };
+ }
+
+ const errors = []; // one summary line per failing block
+ let validCount = 0;
+
+ for (let i = 0; i < blocks.length; i++) {
+ const blockContent = blocks[i][1].trim();
+ // Wrap each block back into a markdown document for the validator
+ const doc = `\`\`\`mdma\n${blockContent}\n\`\`\``;
+
+ const result = validate(doc, {
+ exclude: ['thinking-block'],
+ autoFix: false,
+ });
+
+ if (result.ok) {
+ validCount++;
+ } else {
+ const blockErrors = result.issues
+ .filter((issue) => issue.severity === 'error')
+ .map((issue) => `[${issue.ruleId}] ${issue.message}`)
+ .join('; ');
+ errors.push(`Block ${i + 1}: ${blockErrors}`); // blocks are numbered from 1 in the report
+ }
+ }
+
+ if (errors.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `All ${validCount} mdma example block(s) are valid MDMA`,
+ };
+ }
+
+ return {
+ pass: false,
+ score: validCount / blocks.length, // fraction of blocks that validated
+ reason: `${errors.length}/${blocks.length} mdma block(s) have validation errors:\n${errors.join('\n')}`,
+ };
+}
diff --git a/evals/own-model/assertions/validate-mdma.mjs b/evals/own-model/assertions/validate-mdma.mjs
new file mode 100644
index 0000000..6c84b64
--- /dev/null
+++ b/evals/own-model/assertions/validate-mdma.mjs
@@ -0,0 +1,41 @@
+import { validate } from '@mobile-reality/mdma-validator';
+
+/**
+ * Custom promptfoo assertion that runs the MDMA validator on LLM output.
+ *
+ * Returns pass if the validator reports no unfixed errors.
+ * On failure, includes a summary of all issues found.
+ *
+ * Optional config:
+ * - exclude: string[] — additional rule IDs to skip on top of the
+ * always-excluded `thinking-block` rule. Useful when a suite's
+ * blueprints deliberately violate a stylistic rule (e.g.
+ * `flow-ordering` for the custom-prompt suite, where prompts
+ * intentionally bundle multiple components per message).
+ */
+export default function (output, { config } = {}) {
+ const extraExclude = Array.isArray(config?.exclude) ? config.exclude : []; // a non-array exclude is ignored
+ const result = validate(output, {
+ exclude: ['thinking-block', ...extraExclude],
+ autoFix: false,
+ });
+
+ if (result.ok) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `Valid MDMA document (${result.summary.warnings} warnings, ${result.summary.infos} info)`,
+ };
+ }
+
+ const errorDetails = result.issues // only error-severity issues are listed in the reason
+ .filter((issue) => issue.severity === 'error')
+ .map((issue) => `[${issue.ruleId}] ${issue.message}`)
+ .join('\n');
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `Validation failed with ${result.summary.errors} error(s):\n${errorDetails}`,
+ };
+}
diff --git a/evals/own-model/assertions/yaml-not-json.mjs b/evals/own-model/assertions/yaml-not-json.mjs
new file mode 100644
index 0000000..ef2655a
--- /dev/null
+++ b/evals/own-model/assertions/yaml-not-json.mjs
@@ -0,0 +1,58 @@
+/**
+ * Asserts that all ```mdma blocks in the output use YAML syntax, not JSON.
+ *
+ * Checks:
+ * - No block starts with { or [
+ * - No block contains "type":, "fields": or "id": (JSON keys)
+ * - Every block starts with a YAML key: value pattern (e.g. type: form)
+ */
+export default function (output) {
+ const blocks = [...output.matchAll(/```mdma\n([\s\S]*?)```/g)];
+
+ if (blocks.length === 0) {
+ // No mdma blocks in a generated customPrompt is acceptable
+ // (the prompt might describe components without embedding code blocks)
+ return { pass: true, score: 1, reason: 'No mdma blocks to check (OK for customPrompt)' };
+ }
+
+ const issues = [];
+
+ for (let i = 0; i < blocks.length; i++) {
+ const content = blocks[i][1].trim();
+ const blockLabel = `block ${i + 1}`;
+
+ if (content.startsWith('{') || content.startsWith('[')) {
+ issues.push(`${blockLabel}: starts with JSON syntax`);
+ }
+
+ if (/"type"\s*:/.test(content)) {
+ issues.push(`${blockLabel}: contains JSON "type": key`);
+ }
+
+ if (/"fields"\s*:/.test(content)) {
+ issues.push(`${blockLabel}: contains JSON "fields": key`);
+ }
+
+ if (/"id"\s*:/.test(content)) {
+ issues.push(`${blockLabel}: contains JSON "id": key`);
+ }
+
+ if (!/^[a-zA-Z_-]+:\s/.test(content)) {
+ issues.push(`${blockLabel}: does not start with YAML key: value`);
+ }
+ }
+
+ if (issues.length === 0) {
+ return {
+ pass: true,
+ score: 1,
+ reason: `All ${blocks.length} mdma block(s) use valid YAML syntax`,
+ };
+ }
+
+ return {
+ pass: false,
+ score: 0,
+ reason: `JSON detected in mdma blocks:\n${issues.join('\n')}`,
+ };
+}
diff --git a/evals/own-model/prompt-author.mjs b/evals/own-model/prompt-author.mjs
new file mode 100644
index 0000000..925f8d1
--- /dev/null
+++ b/evals/own-model/prompt-author.mjs
@@ -0,0 +1,20 @@
+import { getAuthorPromptVariant } from '@mobile-reality/mdma-prompt-pack';
+
+/**
+ * Promptfoo prompt function — author suite for our model.
+ *
+ * DSL port of the flagship author suite (../tests.yaml): system = the
+ * `mobile-reality/mdma-il` author prompt from the prompt pack (DSL grammar +
+ * rules + form/table/chart examples), user = the scenario's DSL intent
+ * (`vars.request`, supplied by tests-author.yaml). Output is validated against
+ * the schema + the per-case structural assertions — pure DSL→MDMA generation,
+ * no customPrompt layer.
+ */
+const AUTHORING_SYSTEM_PROMPT = getAuthorPromptVariant('mobile-reality/mdma-il').prompt;
+
+export default function ({ vars }) {
+ return [
+ { role: 'system', content: `{% raw %}${AUTHORING_SYSTEM_PROMPT}{% endraw %}` },
+ { role: 'user', content: `{% raw %}${vars.request}{% endraw %}` },
+ ];
+}
diff --git a/evals/own-model/prompt-custom.mjs b/evals/own-model/prompt-custom.mjs
new file mode 100644
index 0000000..c6cb1af
--- /dev/null
+++ b/evals/own-model/prompt-custom.mjs
@@ -0,0 +1,28 @@
+import { getAuthorPromptVariant } from '@mobile-reality/mdma-prompt-pack';
+
+/**
+ * Promptfoo prompt function — custom-system-prompt suite for our model.
+ *
+ * The `mobile-reality/mdma-il` author prompt from the prompt pack (DSL grammar +
+ * rules + form/table/chart few-shot examples) is the author base; the test's
+ * `customPrompt` (the scenario intent in DSL) is layered after a separator; the
+ * NL `request` is the user msg.
+ *
+ * NOTE: we deliberately do NOT use buildSystemPrompt() here. Its generic
+ * "Reminder" footer is flagship/agentic guidance ("respond in plain Markdown",
+ * "the user should see a natural response") that pushes our DSL-tuned model to
+ * reply in prose with NO ```mdma blocks — measured ~60% of the time on the
+ * heaviest multi-component flow (5/5 pass with this lean join vs 2/5 with the
+ * footer). The authoring prompt already carries every rule our model needs.
+ */
+const AUTHORING_SYSTEM_PROMPT = getAuthorPromptVariant('mobile-reality/mdma-il').prompt;
+
+export default function ({ vars }) {
+ const system = vars.customPrompt
+ ? `${AUTHORING_SYSTEM_PROMPT}\n\n---\n\n${vars.customPrompt}`
+ : AUTHORING_SYSTEM_PROMPT;
+ return [
+ { role: 'system', content: `{% raw %}${system}{% endraw %}` },
+ { role: 'user', content: `{% raw %}${vars.request}{% endraw %}` },
+ ];
+}
diff --git a/evals/own-model/prompt-fixer.mjs b/evals/own-model/prompt-fixer.mjs
new file mode 100644
index 0000000..30bfee8
--- /dev/null
+++ b/evals/own-model/prompt-fixer.mjs
@@ -0,0 +1,39 @@
+import {
+ buildFixerMessage,
+ buildFixerPrompt,
+ buildSystemPrompt,
+} from '@mobile-reality/mdma-prompt-pack';
+import { validate } from '@mobile-reality/mdma-validator';
+
+/**
+ * Promptfoo prompt function — fixer suite for our model.
+ *
+ * NOTE: this is OFF-CONTRACT for our model. The model was trained DSL→MDMA;
+ * the fixer task takes a BROKEN MDMA document (+ validator errors) and asks the
+ * model to repair it — not a DSL intent. We run it anyway as a capability probe:
+ * can the DSL-specialized model also repair MDMA, or does it refuse?
+ *
+ * Pipeline mirrors the flagship fixer eval (../prompt-fixer.mjs): run the
+ * validator to surface remaining issues, then send the canonical fixer system
+ * prompt (default author spec + fixer instructions) + the broken doc / issues.
+ */
+export default function ({ vars }) {
+ const variantKey = vars.variantKey ?? 'single-block';
+ const exclude = ['thinking-block'];
+ if (variantKey !== 'flow') exclude.push('flow-ordering');
+
+ const result = validate(vars.brokenDocument, { exclude });
+ const allIssues = result.issues.filter((i) => i.severity === 'error' || i.severity === 'warning');
+
+ const fixerPrompt = buildFixerPrompt(variantKey);
+ const systemPrompt = `${buildSystemPrompt()}\n\n---\n\n${fixerPrompt}`;
+ const userMessage = buildFixerMessage(vars.brokenDocument, allIssues, {
+ conversationHistory: vars.conversationHistory ?? undefined,
+ promptContext: vars.promptContext ?? undefined,
+ });
+
+ return [
+ { role: 'system', content: `{% raw %}${systemPrompt}{% endraw %}` },
+ { role: 'user', content: `{% raw %}${userMessage}{% endraw %}` },
+ ];
+}
diff --git a/evals/own-model/prompt-guidance.mjs b/evals/own-model/prompt-guidance.mjs
new file mode 100644
index 0000000..8468cdd
--- /dev/null
+++ b/evals/own-model/prompt-guidance.mjs
@@ -0,0 +1,25 @@
+/**
+ * Promptfoo prompt function — agent guidance suite for our model.
+ *
+ * Agentic tool-calling probe: the model is given the `generate_mdma` tool (in
+ * the provider config) and an NL request; it should CALL the tool for
+ * document-creation requests and NOT call it for conversational ones
+ * (asserted by calls-generate-mdma).
+ *
+ * ⚠️ Requires the endpoint to have function-calling enabled
+ * (vLLM `--enable-auto-tool-choice` + `--tool-call-parser`). Without it the
+ * endpoint returns HTTP 400 for `tool_choice: auto`.
+ */
+const SYSTEM_PROMPT =
+ 'You are an assistant with a `generate_mdma` tool that produces interactive MDMA documents ' +
+ '(forms, tables, charts, tasklists, callouts, approval-gates, buttons, webhooks). ' +
+ 'Call `generate_mdma` whenever the user asks you to create, build, design, or update an ' +
+ 'interactive document or UI. For greetings, questions about capabilities, explanations, or ' +
+ 'other conversational replies, respond normally and do NOT call the tool.';
+
+export default function ({ vars }) {
+ return [
+ { role: 'system', content: `{% raw %}${SYSTEM_PROMPT}{% endraw %}` },
+ { role: 'user', content: `{% raw %}${vars.request}{% endraw %}` },
+ ];
+}
diff --git a/evals/own-model/prompt.mjs b/evals/own-model/prompt.mjs
new file mode 100644
index 0000000..0447fdc
--- /dev/null
+++ b/evals/own-model/prompt.mjs
@@ -0,0 +1,21 @@
+import { getAuthorPromptVariant } from '@mobile-reality/mdma-prompt-pack';
+
+/**
+ * Promptfoo prompt function — MDMA-IL DSL holdout gate.
+ *
+ * System message = the `mobile-reality/mdma-il` author prompt (DSL grammar +
+ * worked examples), looked up directly from the registry, decoupled from the
+ * provider id. User message = the MDMA-IL DSL intent from each holdout case
+ * (`vars.request`, supplied by tests-dsl.mjs).
+ *
+ * Both are wrapped in {% raw %} so Nunjucks passes the DSL (and any `{...}`
+ * select-option braces) through verbatim.
+ */
+const SYSTEM_PROMPT = getAuthorPromptVariant('mobile-reality/mdma-il').prompt;
+
+export default function ({ vars }) {
+ return [
+ { role: 'system', content: `{% raw %}${SYSTEM_PROMPT}{% endraw %}` },
+ { role: 'user', content: `{% raw %}${vars.request}{% endraw %}` },
+ ];
+}
diff --git a/evals/own-model/promptfooconfig.own-model-author.yaml b/evals/own-model/promptfooconfig.own-model-author.yaml
new file mode 100644
index 0000000..c8608e9
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model-author.yaml
@@ -0,0 +1,38 @@
+# MDMA Author — own model (DSL port of the flagship author suite)
+#
+# The 28 author scenarios from ../tests.yaml expressed as DSL intents
+# (tests-author.yaml), driven by our shared authoring system prompt
+# (prompt-author.mjs: system = authoring prompt, user = DSL). Output validated
+# against the schema + per-case structural assertions. DSL is the input.
+#
+# Run (serial): pnpm --filter @mobile-reality/mdma-evals eval:own-model:author
+
+description: MDMA Author Eval (DSL) β own model
+
+envPath: ../.env
+outputPath: own-model/results-author.json
+
+prompts:
+ - file://prompt-author.mjs
+
+providers:
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-26b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ temperature: 1
+ max_tokens: 2048
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+
+defaultTest:
+ assert:
+ - type: javascript
+ value: file://assertions/validate-mdma.mjs
+ config:
+ exclude: [flow-ordering]
+
+tests: file://tests-author.yaml
diff --git a/evals/own-model/promptfooconfig.own-model-custom.yaml b/evals/own-model/promptfooconfig.own-model-custom.yaml
new file mode 100644
index 0000000..586a88a
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model-custom.yaml
@@ -0,0 +1,45 @@
+# MDMA Author + Custom System Prompt — own model (MDMA-IL)
+#
+# Same eval as the flagship models' custom suite (promptfooconfig.custom.yaml):
+# the model's author/schema prompt + the test's customPrompt layered into the SYSTEM
+# message (plain join in prompt-custom.mjs, NOT buildSystemPrompt), the NL request as the
+# user message, output validated against the schema. The provider (our 31B) and the
+# author variant (mobile-reality/mdma-il) differ.
+#
+# temperature: 1 — conversational sampling (the model should stay conversational
+# and still produce MDMA). enable_thinking=false per the endpoint contract.
+#
+# Run (serial): pnpm --filter @mobile-reality/mdma-evals eval:own-model:custom
+
+description: MDMA Author + Custom System Prompt Eval β own model
+
+envPath: ../.env
+outputPath: own-model/results-custom.json
+
+prompts:
+ - file://prompt-custom.mjs
+
+providers:
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-31b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ # AWQ endpoint: max_model_len 16384, so no tight output cap needed.
+ # temperature 1 (conversational sampling) β non-deterministic, results may
+ # vary run-to-run.
+ temperature: 1
+ max_tokens: 4096
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+
+defaultTest:
+ assert:
+ - type: javascript
+ value: file://assertions/validate-mdma.mjs
+ config:
+ exclude: [flow-ordering]
+
+tests: file://tests-custom.yaml
diff --git a/evals/own-model/promptfooconfig.own-model-fixer.yaml b/evals/own-model/promptfooconfig.own-model-fixer.yaml
new file mode 100644
index 0000000..99bc4cf
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model-fixer.yaml
@@ -0,0 +1,30 @@
+# MDMA Fixer — own model (capability probe)
+#
+# OFF-CONTRACT for our DSL→MDMA model: the input is a broken MDMA document to
+# repair, not a DSL intent. Run as a probe — can the model fix MDMA, or refuse?
+# Reuses the flagship fixer test set (../tests-fixer.yaml) and its assertions.
+#
+# Run (serial): pnpm --filter @mobile-reality/mdma-evals eval:own-model:fixer
+
+description: MDMA Fixer Eval (capability probe) β own model
+
+envPath: ../.env
+outputPath: own-model/results-fixer.json
+
+prompts:
+ - file://prompt-fixer.mjs
+
+providers:
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-26b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ temperature: 1
+ max_tokens: 2048
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+
+tests: ../tests-fixer.yaml
diff --git a/evals/own-model/promptfooconfig.own-model-flows.yaml b/evals/own-model/promptfooconfig.own-model-flows.yaml
new file mode 100644
index 0000000..b255579
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model-flows.yaml
@@ -0,0 +1,38 @@
+# MDMA Example Flows — own model (DSL port of the flagship flows suite)
+#
+# The 15 example-flow scenarios from ../tests-flows.yaml with each customPrompt
+# expressed in DSL (tests-flows.yaml). Reuses the custom builder (prompt-custom.mjs:
+# authoring prompt + customPrompt + NL request). Output validated against the
+# schema + per-case structural assertions.
+#
+# Run (serial): pnpm --filter @mobile-reality/mdma-evals eval:own-model:flows
+
+description: MDMA Example Flows Eval (DSL) β own model
+
+envPath: ../.env
+outputPath: own-model/results-flows.json
+
+prompts:
+ - file://prompt-custom.mjs
+
+providers:
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-26b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ temperature: 1
+ max_tokens: 2048
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+
+defaultTest:
+ assert:
+ - type: javascript
+ value: file://assertions/validate-mdma.mjs
+ config:
+ exclude: [flow-ordering]
+
+tests: file://tests-flows.yaml
diff --git a/evals/own-model/promptfooconfig.own-model-guidance.yaml b/evals/own-model/promptfooconfig.own-model-guidance.yaml
new file mode 100644
index 0000000..79074ca
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model-guidance.yaml
@@ -0,0 +1,52 @@
+# MDMA Agent Guidance — own model (agentic tool-calling)
+#
+# Tests whether the model correctly CALLS the generate_mdma tool for
+# document-creation requests (and not for conversational ones). DSL port not
+# applicable — this is a tool-calling decision, driven by NL requests
+# (../tests-guidance.yaml).
+#
+# ⚠️ REQUIRES the endpoint to have function-calling enabled (vLLM
+# --enable-auto-tool-choice + --tool-call-parser). Until then the endpoint
+# returns HTTP 400 for tool_choice: auto.
+#
+# Run (serial): pnpm --filter @mobile-reality/mdma-evals eval:own-model:guidance
+
+description: MDMA Agent Guidance Eval (tool-calling) β own model
+
+envPath: ../.env
+outputPath: own-model/results-guidance.json
+
+prompts:
+ - file://prompt-guidance.mjs
+
+providers:
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-26b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ temperature: 1
+ max_tokens: 1024
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+ tool_choice: auto
+ tools:
+ - type: function
+ function:
+ name: generate_mdma
+ description: >
+ Generate an MDMA Markdown document to present structured
+ interactive content to the user (forms, tables, tasklists,
+ approval gates, charts, callouts, buttons, webhooks).
+ parameters:
+ type: object
+ properties:
+ document:
+ type: string
+ description: The complete MDMA Markdown document.
+ required:
+ - document
+
+tests: ../tests-guidance.yaml
diff --git a/evals/own-model/promptfooconfig.own-model.yaml b/evals/own-model/promptfooconfig.own-model.yaml
new file mode 100644
index 0000000..29ef88e
--- /dev/null
+++ b/evals/own-model/promptfooconfig.own-model.yaml
@@ -0,0 +1,52 @@
+# MDMA-IL DSL holdout gate — Mobile Reality's own model
+#
+# Our self-hosted model (Gemma-4-E4B + v3 MDMA-IL LoRA) takes ONE MDMA-IL DSL
+# intent and returns an MDMA document. This suite is the DSL holdout gate: feed
+# the 95 held-out scenarios in DSL form (../gemma/dataset/data/holdout-dsl.jsonl
+# via tests-dsl.mjs) and validate the MDMA output.
+#
+# The system prompt is the mobile-reality/mdma-il author prompt (DSL grammar +
+# worked examples). The served context is only 2048 tokens on E4B, so the prompt
+# stays lean. See README.md.
+#
+# Model is plugged in via OWN_MODEL_* in ../.env (OpenAI-compatible endpoint).
+#
+# Run: pnpm --filter @mobile-reality/mdma-evals eval:own-model
+# View: pnpm --filter @mobile-reality/mdma-evals eval:view
+
+description: MDMA-IL DSL Holdout Gate β own model
+
+envPath: ../.env
+outputPath: own-model/results.json
+
+prompts:
+ - file://prompt.mjs
+
+providers:
+ # OpenAI-compatible DSL endpoint (31B mdma-31b). Set OWN_MODEL_* in ../.env.
+ # Contract: v3 system prompt verbatim + DSL user message; enable_thinking=false (else
+ # thinking leaks in). NOTE(review): the contract says temperature 0, but `temperature: 1` is set below — confirm which is intended.
+ - id: "{{ env.OWN_MODEL_PROVIDER or 'openai:chat:mdma-31b' }}"
+ config:
+ apiBaseUrl: "{{ env.OWN_MODEL_BASE_URL }}"
+ apiKey: "{{ env.OWN_MODEL_API_KEY }}"
+ temperature: 1
+ # 31B has a large context (no 2048 cap like E4B); 1024 truncated big
+ # multi-component docs (callout + full data table), so raised to 2048.
+ max_tokens: 2048
+ # Cut the Gemma 4 reasoning repetition loop (see repetition-loops.md).
+ min_p: 0.02
+ repetition_penalty: 1.1
+ chat_template_kwargs:
+ enable_thinking: false
+
+defaultTest:
+ assert:
+ # The gate: every output must be a valid MDMA document. thinking-block is
+ # always excluded by the assertion; the holdout has no multi-step flows.
+ - type: javascript
+ value: file://assertions/validate-mdma.mjs
+ config:
+ exclude: [flow-ordering]
+
+tests: file://tests-dsl.mjs
diff --git a/evals/own-model/results-author.json b/evals/own-model/results-author.json
new file mode 100644
index 0000000..5318acc
--- /dev/null
+++ b/evals/own-model/results-author.json
@@ -0,0 +1,5273 @@
+{
+ "evalId": "eval-7qF-2026-06-30T12:37:01",
+ "results": {
+ "version": 3,
+ "timestamp": "2026-06-30T12:37:01.882Z",
+ "prompts": [
+ {
+ "raw": "function ({ vars }) {\n return [\n { role: 'system', content: `{% raw %}${AUTHORING_SYSTEM_PROMPT}{% endraw %}` },\n { role: 'user', content: `{% raw %}${vars.request}{% endraw %}` },\n ];\n}",
+ "label": "own-model/prompt-author.mjs",
+ "config": {},
+ "id": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "provider": "openai:chat:mdma-26b",
+ "metrics": {
+ "score": 28,
+ "testPassCount": 28,
+ "testFailCount": 0,
+ "testErrorCount": 0,
+ "assertPassCount": 92,
+ "assertFailCount": 0,
+ "totalLatencyMs": 47883,
+ "tokenUsage": {
+ "prompt": 40987,
+ "completion": 4609,
+ "cached": 0,
+ "total": 45596,
+ "numRequests": 28,
+ "completionDetails": {
+ "reasoning": 0,
+ "acceptedPrediction": 0,
+ "rejectedPrediction": 0,
+ "cacheReadInputTokens": 0,
+ "cacheCreationInputTokens": 0
+ },
+ "assertions": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0,
+ "completionDetails": {
+ "reasoning": 0,
+ "acceptedPrediction": 0,
+ "rejectedPrediction": 0,
+ "cacheReadInputTokens": 0,
+ "cacheCreationInputTokens": 0
+ }
+ }
+ },
+ "namedScores": {},
+ "namedScoresCount": {},
+ "namedScoreWeights": {},
+ "cost": 0
+ }
+ }
+ ],
+ "results": [
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Exactly 3 form fields found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 3
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found sensitive: true flag",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/has-sensitive.mjs"
+ }
+ }
+ ]
+ },
+ "id": "b74a68fa-01bd-47df-b639-ac9aa7f8be81",
+ "latencyMs": 2241,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#contact-form[full-name*:t, email*^:e, message*:ta](action=contact-submitted)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Contact form: requires name, sensitive email, and a message; submits via contact-submitted.\n```\n\n```mdma\ntype: form\nid: contact-form\nfields:\n - name: full-name\n type: text\n label: \"Full Name\"\n required: true\n - name: email\n type: email\n label: \"Email\"\n required: true\n sensitive: true\n - name: message\n type: textarea\n label: \"Message\"\n required: true\nonSubmit: contact-submitted\n```",
+ "tokenUsage": {
+ "total": 1617,
+ "prompt": 1467,
+ "completion": 150,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2241,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1159",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:01 GMT",
+ "modal-function-call-id": "fc-01KWC8GVQXS7GY3T1FADQGTC4K",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Contact form (DSL)",
+ "vars": {
+ "request": "form#contact-form[full-name*:t, email*^:e, message*:ta](action=contact-submitted)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 3
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/has-sensitive.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 0,
+ "vars": {
+ "request": "form#contact-form[full-name*:t, email*^:e, message*:ta](action=contact-submitted)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1159",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:01 GMT",
+ "modal-function-call-id": "fc-01KWC8GVQXS7GY3T1FADQGTC4K",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Exactly 4 form fields found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 4
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found 3 sensitive flags",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/pii-sensitive.mjs"
+ }
+ }
+ ]
+ },
+ "id": "388d4b57-5d17-4725-80cc-7d8a786522ec",
+ "latencyMs": 2465,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#employee-pii-form[full-name*:t, email*^:e, phone^:t, ssn*^:t](action=employee-pii-submitted)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Employee PII Collection Form: requires full name, sensitive email, sensitive phone number, and sensitive SSN; submits via employee-pii-submitted.\n```\n\n```mdma\ntype: form\nid: employee-pii-form\nfields:\n - name: full-name\n type: text\n label: \"Full Name\"\n required: true\n - name: email\n type: email\n label: \"Email Address\"\n required: true\n sensitive: true\n - name: phone\n type: text\n label: \"Phone Number\"\n sensitive: true\n - name: ssn\n type: text\n label: \"Social Security Number\"\n required: true\n sensitive: true\nonSubmit: employee-pii-submitted\n```",
+ "tokenUsage": {
+ "total": 1682,
+ "prompt": 1481,
+ "completion": 201,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2465,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1342",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:03 GMT",
+ "modal-function-call-id": "fc-01KWC8GXRDAYT7T8W2XYC12P87",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Employee PII form, sensitive fields (DSL)",
+ "vars": {
+ "request": "form#employee-pii-form[full-name*:t, email*^:e, phone^:t, ssn*^:t](action=employee-pii-submitted)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 4
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/pii-sensitive.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 1,
+ "vars": {
+ "request": "form#employee-pii-form[full-name*:t, email*^:e, phone^:t, ssn*^:t](action=employee-pii-submitted)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1342",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:03 GMT",
+ "modal-function-call-id": "fc-01KWC8GXRDAYT7T8W2XYC12P87",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ }
+ }
+ ]
+ },
+ "id": "a9060477-3822-41fa-8c00-c932b680f3c9",
+ "latencyMs": 1548,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"callout#maintenance-warning(variant=warning)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n The user requested a maintenance warning callout with a warning variant. I will provide a single callout component.\n```\n\n```mdma\ntype: callout\nid: maintenance-warning\nvariant: warning\ncontent: |\n Scheduled system maintenance is occurring on Saturday at 02:00 UTC. Users may experience intermittent connectivity or momentary outages during this window. We apologize for the inconvenience.\n```",
+ "tokenUsage": {
+ "total": 1561,
+ "prompt": 1450,
+ "completion": 111,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1548,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1117",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:06 GMT",
+ "modal-function-call-id": "fc-01KWC8H06HQVEPKNYM3V9W241Y",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Maintenance callout (DSL)",
+ "vars": {
+ "request": "callout#maintenance-warning(variant=warning)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 2,
+ "vars": {
+ "request": "callout#maintenance-warning(variant=warning)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1117",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:06 GMT",
+ "modal-function-call-id": "fc-01KWC8H06HQVEPKNYM3V9W241Y",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Assertion passed",
+ "assertion": {
+ "type": "not-contains",
+ "value": "```markdown"
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: button",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "button"
+ ]
+ }
+ }
+ }
+ ]
+ },
+ "id": "40343da3-5245-45a1-bdd7-01cd6b444ccc",
+ "latencyMs": 700,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"button#submit-report-btn(text=\\\"Submit Report\\\", action=submit-report, variant=primary)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: button\nid: submit-report-btn\ntext: \"Submit Report\"\nonAction: submit-report\nvariant: primary\n```",
+ "tokenUsage": {
+ "total": 1498,
+ "prompt": 1462,
+ "completion": 36,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 700,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "748",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:07 GMT",
+ "modal-function-call-id": "fc-01KWC8H1R00RDPA0J8K4JVHVAT",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Submit report button (DSL)",
+ "vars": {
+ "request": "button#submit-report-btn(text=\"Submit Report\", action=submit-report, variant=primary)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "not-contains",
+ "value": "```markdown"
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "button"
+ ]
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 3,
+ "vars": {
+ "request": "button#submit-report-btn(text=\"Submit Report\", action=submit-report, variant=primary)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "748",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:07 GMT",
+ "modal-function-call-id": "fc-01KWC8H1R00RDPA0J8K4JVHVAT",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Select field has options",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/select-has-options.mjs"
+ }
+ }
+ ]
+ },
+ "id": "c2a408e0-76cb-4c8d-8e12-11bdb0ef1721",
+ "latencyMs": 2342,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#country-selector-form[country*:s{us|uk|ca|au|de}](action=country-form-submitted)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Creating a single-select country selector form with four options; submits on the country-form-submitted action.\n```\n\n```mdma\ntype: form\nid: country-selector-form\nfields:\n - name: country\n type: select\n label: \"Country\"\n required: true\n options:\n - label: \"United States\"\n value: \"us\"\n - label: \"United Kingdom\"\n value: \"uk\"\n - label: \"Canada\"\n value: \"ca\"\n - label: \"Australia\"\n value: \"au\"\n - label: \"Germany\"\n value: \"de\"\nonSubmit: country-form-submitted\n```",
+ "tokenUsage": {
+ "total": 1654,
+ "prompt": 1470,
+ "completion": 184,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2342,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1294",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:08 GMT",
+ "modal-function-call-id": "fc-01KWC8H2FT0GRGZXCPQEWQ0C3V",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Country selector form (DSL)",
+ "vars": {
+ "request": "form#country-selector-form[country*:s{us|uk|ca|au|de}](action=country-form-submitted)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/select-has-options.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 4,
+ "vars": {
+ "request": "form#country-selector-form[country*:s{us|uk|ca|au|de}](action=country-form-submitted)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1294",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:08 GMT",
+ "modal-function-call-id": "fc-01KWC8H2FT0GRGZXCPQEWQ0C3V",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: table",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "table"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Table with sortable and filterable found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/table-features.mjs"
+ }
+ }
+ ]
+ },
+ "id": "5813070e-d3b7-4185-84f2-a6d9608060cf",
+ "latencyMs": 2918,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"table#languages-table\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Table component for a list of languages. I will define realistic columns for language name, code, and proficiency level.\n```\n\n```mdma\ntype: table\nid: languages-table\nsortable: true\nfilterable: true\ncolumns:\n - key: name\n header: \"Language\"\n sortable: true\n - key: code\n header: \"Code\"\n - key: proficiency\n header: \"Proficiency\"\n sortable: true\n - key: native\n header: \"Native\"\ndata:\n - { name: \"English\", code: \"en\", proficiency: \"Fluent\", native: true }\n - { name: \"Spanish\", code: \"es\", proficiency: \"Professional\", native: false }\n - { name: \"Japanese\", code: \"ja\", proficiency: \"Limited Working\", native: false }\n - { name: \"French\", code: \"fr\", proficiency: \"Elementary\", native: false }\n```",
+ "tokenUsage": {
+ "total": 1684,
+ "prompt": 1444,
+ "completion": 240,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2918,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1494",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:10 GMT",
+ "modal-function-call-id": "fc-01KWC8H4RZ8M87CKNM95VXQXPH",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Languages table (DSL)",
+ "vars": {
+ "request": "table#languages-table\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "table"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/table-features.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 5,
+ "vars": {
+ "request": "table#languages-table\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1494",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:10 GMT",
+ "modal-function-call-id": "fc-01KWC8H4RZ8M87CKNM95VXQXPH",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: chart",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "chart"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Bar chart variant found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/bar-chart.mjs"
+ }
+ }
+ ]
+ },
+ "id": "4921a874-5e35-4c62-91ed-66182ac428b4",
+ "latencyMs": 1714,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"chart#quarterly-sales(variant=bar)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Bar chart for quarterly sales revenue with realistic sample data.\n```\n\n```mdma\ntype: chart\nid: quarterly-sales\nvariant: bar\nlabel: \"Quarterly Sales Revenue\"\ndata: |\n Quarter, Revenue\n Q1, 125000\n Q2, 142000\n Q3, 138000\n Q4, 165000\nxAxis: Quarter\n```",
+ "tokenUsage": {
+ "total": 1579,
+ "prompt": 1450,
+ "completion": 129,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1714,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "987",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:13 GMT",
+ "modal-function-call-id": "fc-01KWC8H7N0QJRSKE83PR272VT2",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Quarterly sales bar chart (DSL)",
+ "vars": {
+ "request": "chart#quarterly-sales(variant=bar)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "chart"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/bar-chart.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 6,
+ "vars": {
+ "request": "chart#quarterly-sales(variant=bar)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "987",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:13 GMT",
+ "modal-function-call-id": "fc-01KWC8H7N0QJRSKE83PR272VT2",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: approval-gate",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "approval-gate"
+ ]
+ }
+ }
+ }
+ ]
+ },
+ "id": "899627b2-3f04-41de-a972-bcfa417c9f71",
+ "latencyMs": 936,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"approval-gate#document-approval\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Approval gate for document review; requires a title.\n```\n\n```mdma\ntype: approval-gate\nid: document-approval\ntitle: \"Document Review and Approval\"\n```",
+ "tokenUsage": {
+ "total": 1511,
+ "prompt": 1446,
+ "completion": 65,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 936,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "873",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:15 GMT",
+ "modal-function-call-id": "fc-01KWC8H9BCSPZ595DQ55Y1E1Q8",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Document approval gate (DSL)",
+ "vars": {
+ "request": "approval-gate#document-approval\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "approval-gate"
+ ]
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 7,
+ "vars": {
+ "request": "approval-gate#document-approval\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "873",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:15 GMT",
+ "modal-function-call-id": "fc-01KWC8H9BCSPZ595DQ55Y1E1Q8",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Exactly 3 form fields found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 3
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found sensitive: true flag",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/has-sensitive.mjs"
+ }
+ }
+ ]
+ },
+ "id": "329138f6-0bb5-4ba7-a32d-9ba5025662fc",
+ "latencyMs": 1962,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#severity-form[title*:t, reporter-email*^:e, severity*:s{critical|high|medium|low}](action=triage-submitted)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Triage form: collecting a required title, required sensitive reporter email, and a required severity selection; submits via triage-submitted.\n```\n\n```mdma\ntype: form\nid: severity-form\nfields:\n - name: title\n type: text\n label: \"Issue Title\"\n required: true\n - name: reporter-email\n type: email\n label: \"Reporter Email\"\n required: true\n sensitive: true\n - name: severity\n type: select\n label: \"Severity\"\n required: true\n options:\n - label: \"Critical\"\n value: critical\n - label: \"High\"\n value: high\n - label: \"Medium\"\n value: medium\n - label: \"Low\"\n value: low\nonSubmit: triage-submitted\n```",
+ "tokenUsage": {
+ "total": 1691,
+ "prompt": 1477,
+ "completion": 214,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1962,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1433",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:16 GMT",
+ "modal-function-call-id": "fc-01KWC8HAA550G4G76F04BM9N0V",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Incident triage severity form (DSL)",
+ "vars": {
+ "request": "form#severity-form[title*:t, reporter-email*^:e, severity*:s{critical|high|medium|low}](action=triage-submitted)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 3
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/has-sensitive.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 8,
+ "vars": {
+ "request": "form#severity-form[title*:t, reporter-email*^:e, severity*:s{critical|high|medium|low}](action=triage-submitted)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1433",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:16 GMT",
+ "modal-function-call-id": "fc-01KWC8HAA550G4G76F04BM9N0V",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "3 unique kebab-case IDs",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/unique-kebab-ids.mjs"
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form, callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form",
+ "callout"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found 3 components (min: 2)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/component-count.mjs",
+ "config": {
+ "min": 2
+ }
+ }
+ }
+ ]
+ },
+ "id": "cc1501b0-6ffe-4981-ac56-76d1026c59ca",
+ "latencyMs": 1710,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#settings-form[display-name*:t, email*^:e](action=settings-saved)\\ncallout#settings-saved(variant=success)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n User settings form: captures display-name (required) and email (required, sensitive); submits via settings-saved. \n A success callout will display once settings are saved.\n```\n\n```mdma\ntype: form\nid: settings-form\nfields:\n - name: display-name\n type: text\n label: \"Display Name\"\n required: true\n - name: email\n type: email\n label: \"Email Address\"\n required: true\n sensitive: true\nonSubmit: settings-saved\n```\n\n```mdma\ntype: callout\nid: settings-saved\nvariant: success\ncontent: \"Your preferences and contact information have been successfully updated.\"\n```",
+ "tokenUsage": {
+ "total": 1657,
+ "prompt": 1475,
+ "completion": 182,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1710,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1326",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:18 GMT",
+ "modal-function-call-id": "fc-01KWC8HC7SFXAPKG1BFA006HR9",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Settings form + saved callout (DSL)",
+ "vars": {
+ "request": "form#settings-form[display-name*:t, email*^:e](action=settings-saved)\ncallout#settings-saved(variant=success)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/unique-kebab-ids.mjs"
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form",
+ "callout"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/component-count.mjs",
+ "config": {
+ "min": 2
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 9,
+ "vars": {
+ "request": "form#settings-form[display-name*:t, email*^:e](action=settings-saved)\ncallout#settings-saved(variant=success)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1326",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:18 GMT",
+ "modal-function-call-id": "fc-01KWC8HC7SFXAPKG1BFA006HR9",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Callout with variant: warning found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "warning"
+ }
+ }
+ }
+ ]
+ },
+ "id": "dd73a400-e033-43c4-abc4-ac17a7aa85a3",
+ "latencyMs": 980,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"callout#password-expiry(variant=warning)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Generate a single warning callout informing the user about upcoming password expiration.\n```\n\n```mdma\ntype: callout\nid: password-expiry\nvariant: warning\ncontent: |\n Your password is set to expire in 7 days. Please update it soon to avoid any disruption to your account access.\n```",
+ "tokenUsage": {
+ "total": 1543,
+ "prompt": 1450,
+ "completion": 93,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 980,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1005",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:20 GMT",
+ "modal-function-call-id": "fc-01KWC8HDXJASCHD30ED7YDGR97",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Password-expiry warning callout (DSL)",
+ "vars": {
+ "request": "callout#password-expiry(variant=warning)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "warning"
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 10,
+ "vars": {
+ "request": "callout#password-expiry(variant=warning)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1005",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:20 GMT",
+ "modal-function-call-id": "fc-01KWC8HDXJASCHD30ED7YDGR97",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Callout with variant: error found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "error"
+ }
+ }
+ }
+ ]
+ },
+ "id": "4a9d46dc-bd18-4514-854c-b545a903d0cd",
+ "latencyMs": 1005,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"callout#payment-error(variant=error)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Creating an error callout to notify the user about a failed payment attempt.\n```\n\n```mdma\ntype: callout\nid: payment-error\nvariant: error\ncontent: \"The payment was declined. Please check your card details and try again, or use a different payment method.\"\n```",
+ "tokenUsage": {
+ "total": 1539,
+ "prompt": 1450,
+ "completion": 89,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1005,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "983",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:21 GMT",
+ "modal-function-call-id": "fc-01KWC8HEZ308BMSD07R69AK0TV",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Payment-error callout (DSL)",
+ "vars": {
+ "request": "callout#payment-error(variant=error)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "error"
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 11,
+ "vars": {
+ "request": "callout#payment-error(variant=error)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "983",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:21 GMT",
+ "modal-function-call-id": "fc-01KWC8HEZ308BMSD07R69AK0TV",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Callout with variant: success found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "success"
+ }
+ }
+ }
+ ]
+ },
+ "id": "85cc6e85-0bb8-45a0-bc85-d439cafee002",
+ "latencyMs": 953,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"callout#account-verified(variant=success)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Verified account callout; displays a success message to the user.\n```\n\n```mdma\ntype: callout\nid: account-verified\nvariant: success\ncontent: \"Your account has been successfully verified. You now have full access to all features.\"\n```",
+ "tokenUsage": {
+ "total": 1532,
+ "prompt": 1450,
+ "completion": 82,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 953,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "957",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:22 GMT",
+ "modal-function-call-id": "fc-01KWC8HFY1P03EB2EA7W2WNGV1",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Account-verified callout (DSL)",
+ "vars": {
+ "request": "callout#account-verified(variant=success)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "callout"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/callout-variant.mjs",
+ "config": {
+ "variant": "success"
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 12,
+ "vars": {
+ "request": "callout#account-verified(variant=success)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "957",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:22 GMT",
+ "modal-function-call-id": "fc-01KWC8HFY1P03EB2EA7W2WNGV1",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: chart",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "chart"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Pie chart variant found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/pie-chart.mjs"
+ }
+ }
+ ]
+ },
+ "id": "3f040def-5226-4fdd-b7e8-9e5c8120a485",
+ "latencyMs": 1265,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"chart#browser-share(variant=pie)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Creating a pie chart for browser market share with realistic dummy data.\n```\n\n```mdma\ntype: chart\nid: browser-share\nvariant: pie\nlabel: \"Browser Market Share\"\ndata: |\n Browser, Share\n Chrome, 65\n Safari, 18\n Firefox, 4\n Edge, 4\n Opera, 2\n Other, 7\n```",
+ "tokenUsage": {
+ "total": 1565,
+ "prompt": 1449,
+ "completion": 116,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1265,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "992",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:23 GMT",
+ "modal-function-call-id": "fc-01KWC8HGXF3CGYK45GVNGWAMAQ",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Browser-share pie chart (DSL)",
+ "vars": {
+ "request": "chart#browser-share(variant=pie)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "chart"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/pie-chart.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 13,
+ "vars": {
+ "request": "chart#browser-share(variant=pie)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "992",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:23 GMT",
+ "modal-function-call-id": "fc-01KWC8HGXF3CGYK45GVNGWAMAQ",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: button",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "button"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Assertion passed",
+ "assertion": {
+ "type": "contains",
+ "value": "variant: danger"
+ }
+ }
+ ]
+ },
+ "id": "6aab8160-5d69-4d80-8a9d-151ab9612d38",
+ "latencyMs": 595,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"button#delete-account-btn(text=\\\"Delete Account\\\", action=delete-account, variant=danger)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: button\nid: delete-account-btn\ntext: \"Delete Account\"\nonAction: delete-account\nvariant: danger\n```",
+ "tokenUsage": {
+ "total": 1498,
+ "prompt": 1462,
+ "completion": 36,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 595,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "750",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:24 GMT",
+ "modal-function-call-id": "fc-01KWC8HJ3YV7FCSPNMARG9D1E0",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Delete-account danger button (DSL)",
+ "vars": {
+ "request": "button#delete-account-btn(text=\"Delete Account\", action=delete-account, variant=danger)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "button"
+ ]
+ }
+ },
+ {
+ "type": "contains",
+ "value": "variant: danger"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 14,
+ "vars": {
+ "request": "button#delete-account-btn(text=\"Delete Account\", action=delete-account, variant=danger)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "750",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:24 GMT",
+ "modal-function-call-id": "fc-01KWC8HJ3YV7FCSPNMARG9D1E0",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form, callout",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form",
+ "callout"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found 3 components (min: 2)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/component-count.mjs",
+ "config": {
+ "min": 2
+ }
+ }
+ }
+ ]
+ },
+ "id": "4e289f7f-af15-431d-a8b5-c0e69013055e",
+ "latencyMs": 1630,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#ticket-form[subject*:t, description*:ta](action=ticket-submitted)\\ncallout#ticket-submitted(variant=success)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Support ticket creation: a form with a required subject and a required description; upon submission, it triggers ticket-submitted, followed by a success callout.\n```\n\n```mdma\ntype: form\nid: ticket-form\nfields:\n - name: subject\n type: text\n label: \"Subject\"\n required: true\n - name: description\n type: textarea\n label: \"Description\"\n required: true\nonSubmit: ticket-submitted\n```\n\n```mdma\ntype: callout\nid: ticket-submitted\nvariant: success\ncontent: \"Your support ticket has been submitted successfully. A representative will be with you shortly.\"\n```",
+ "tokenUsage": {
+ "total": 1643,
+ "prompt": 1471,
+ "completion": 172,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 1630,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1315",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:25 GMT",
+ "modal-function-call-id": "fc-01KWC8HJRRSJ2MZR7ZK510T39G",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Ticket form + submitted callout (DSL)",
+ "vars": {
+ "request": "form#ticket-form[subject*:t, description*:ta](action=ticket-submitted)\ncallout#ticket-submitted(variant=success)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form",
+ "callout"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/component-count.mjs",
+ "config": {
+ "min": 2
+ }
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 15,
+ "vars": {
+ "request": "form#ticket-form[subject*:t, description*:ta](action=ticket-submitted)\ncallout#ticket-submitted(variant=success)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1315",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:25 GMT",
+ "modal-function-call-id": "fc-01KWC8HJRRSJ2MZR7ZK510T39G",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: table",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "table"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Table with sortable and filterable found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/table-features.mjs"
+ }
+ }
+ ]
+ },
+ "id": "510ef276-c9c6-4cc9-baaf-2f1d7d418d9d",
+ "latencyMs": 2466,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"table#employee-directory\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Employee Directory table: a searchable/sortable list displaying name, department, and status.\n```\n\n```mdma\ntype: table\nid: employee-directory\nsortable: true\nfilterable: true\ncolumns:\n - key: name\n header: \"Employee Name\"\n sortable: true\n - key: department\n header: \"Department\"\n sortable: true\n - key: role\n header: \"Role\"\n - key: status\n header: \"Status\"\ndata:\n - { name: \"Alice Johnson\", department: \"Engineering\", role: \"Frontend Lead\", status: \"Active\" }\n - { name: \"Bob Smith\", department: \"Sales\", role: \"Account Executive\", status: \"Active\" }\n - { name: \"Charlie Davis\", department: \"HR\", role: \"Generalist\", status: \"On Leave\" }\n - { name: \"Diana Prince\", department: \"Engineering\", role: \"DevOps Engineer\", status: \"Active\" }\n - { name: \"Edward Norton\", department: \"Marketing\", role: \"SEO Specialist\", status: \"Inactive\" }\n```",
+ "tokenUsage": {
+ "total": 1723,
+ "prompt": 1444,
+ "completion": 279,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2466,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1652",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:26 GMT",
+ "modal-function-call-id": "fc-01KWC8HMC9R4F20RG0CFWH9ATF",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Employee directory table (DSL)",
+ "vars": {
+ "request": "table#employee-directory\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "table"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/table-features.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 16,
+ "vars": {
+ "request": "table#employee-directory\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1652",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:26 GMT",
+ "modal-function-call-id": "fc-01KWC8HMC9R4F20RG0CFWH9ATF",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Exactly 5 form fields found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 5
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found 4 sensitive flags",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/pii-sensitive.mjs"
+ }
+ }
+ ]
+ },
+ "id": "9b86a2f9-2c05-428d-90e4-ebe95feca0f1",
+ "latencyMs": 2187,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n#[, , ...](, , ...) # one component per line\\nfield = [*][^]:[{opt1|opt2|...}]\\n * = required ^ = sensitive (PII: name, email, phone, address, SSN, date-of-birth, β¦)\\n typecode: t=text n=number e=email d=date s=select c=checkbox ta=textarea f=file\\n {a|b|c} = options for a select field\\nprops = text=\\\"...\\\" | action= | variant=\\ntypes: form Β· button Β· tasklist Β· table Β· callout Β· approval-gate Β· webhook Β· chart\\n```\\n\\n## Authoring rules\\n- Build EXACTLY the components in the DSL intent β no more, no fewer. The DSL is the complete, final spec. Never invent a component it did not list (e.g. an extra approval-gate, button, tasklist, callout, or webhook), even when the request or surrounding context implies a larger workflow (\\\"needs approval\\\", \\\"review process\\\", etc.). Describe any such follow-up in prose only; do not emit it.\\n- Each ```mdma block is exactly ONE component as top-level YAML keys (type, id, β¦). Never wrap a component in a \\\"components:\\\" array.\\n- Every component has \\\"id\\\" and \\\"type\\\" (one of: form, button, tasklist, table, callout, approval-gate, webhook, chart).\\n- AT MOST ONE interactive component (form, button, tasklist, approval-gate, webhook) per response; non-interactive components (callout, table, chart) may accompany it. 
Define a referenced component before anything that references it.\\n- form: top-level \\\"onSubmit: \\\"; \\\"fields\\\" list (each name/type/label); field type β text|number|email|date|select|checkbox|textarea|file; select fields need \\\"options\\\" (list of {label, value}); mark every PII field \\\"sensitive: true\\\".\\n- button: \\\"text\\\" + \\\"onAction: \\\". tasklist: \\\"items\\\" list of {id, text}. table: \\\"columns\\\" (key/header) + \\\"data\\\" rows. callout: \\\"content\\\" + variant β info|warning|error|success. approval-gate: \\\"title\\\". webhook: \\\"url\\\" + \\\"trigger: \\\". chart: \\\"label\\\" (never \\\"title\\\") + \\\"data: |\\\" CSV (header line then rows) + variant β line|bar|area|pie.\\n- Forms use \\\"onSubmit\\\", buttons \\\"onAction\\\", webhooks \\\"trigger\\\" β never a bare \\\"action\\\" key.\\n- Fill in realistic values the DSL omits (table rows, chart CSV, callout content, approval-gate title).\\n\\n## Turn and reasoning discipline\\n- Each message is a complete instruction. Emit exactly one document for the current DSL intent, then stop. Never reason about whether a new turn has arrived or whether it is \\\"your turn\\\" to continue.\\n- Do not re-verify or re-emit components from earlier turns. Keep any thinking block to at most a few sentences. 
Never repeat a token or phrase.\\n\\n## Examples\\n\\nIntent: `form#contact[full-name*:t, email*^:e](action=contact-submitted)`\\n\\n```mdma\\ntype: thinking\\nid: planning\\nstatus: done\\ncollapsed: true\\ncontent: |\\n Contact form: a required name and a required, sensitive email; submits via contact-submitted.\\n```\\n\\n```mdma\\ntype: form\\nid: contact\\nfields:\\n - name: full-name\\n type: text\\n label: \\\"Full Name\\\"\\n required: true\\n - name: email\\n type: email\\n label: \\\"Email\\\"\\n required: true\\n sensitive: true\\nonSubmit: contact-submitted\\n```\\n\\nIntent: `table#orders` β invent realistic columns and rows; default to sortable/filterable tables.\\n\\n```mdma\\ntype: table\\nid: orders\\nsortable: true\\nfilterable: true\\ncolumns:\\n - key: order-id\\n header: \\\"Order ID\\\"\\n sortable: true\\n - key: customer\\n header: \\\"Customer\\\"\\n sortable: true\\n - key: total\\n header: \\\"Total ($)\\\"\\n sortable: true\\n - key: status\\n header: \\\"Status\\\"\\ndata:\\n - { order-id: \\\"ORD-1001\\\", customer: \\\"Acme Inc\\\", total: 1240.50, status: \\\"Shipped\\\" }\\n - { order-id: \\\"ORD-1002\\\", customer: \\\"Globex\\\", total: 880.00, status: \\\"Pending\\\" }\\n - { order-id: \\\"ORD-1003\\\", customer: \\\"Initech\\\", total: 2310.75, status: \\\"Delivered\\\" }\\n```\\n\\nIntent: `chart#revenue(variant=bar)` β invent a realistic CSV `data` block and a `label`.\\n\\n```mdma\\ntype: chart\\nid: revenue\\nvariant: bar\\nlabel: \\\"Monthly Revenue\\\"\\ndata: |\\n Month, Revenue\\n Jan, 42000\\n Feb, 51000\\n Mar, 47500\\nxAxis: Month\\n```\"},{\"role\":\"user\",\"content\":\"form#payment-form[cardholder-name*:t, card-number*^:t, expiry*^:t, cvv*^:t, billing-address*^:ta](action=payment-submitted)\\n\"}]",
+ "label": "own-model/prompt-author.mjs",
+ "config": {}
+ },
+ "promptId": "a215030c5bb1ab12c4729b3195affd78a5fff95e4c88201c6183e640304b1358",
+ "promptIdx": 0,
+ "provider": {
+ "id": "openai:chat:mdma-26b",
+ "label": ""
+ },
+ "response": {
+ "output": "```mdma\ntype: thinking\nid: planning\nstatus: done\ncollapsed: true\ncontent: |\n Payment form: collects cardholder name, card number, expiry, CVV, and billing address; uses action payment-submitted. PII sensitivity applied to card details and address.\n```\n\n```mdma\ntype: form\nid: payment-form\nfields:\n - name: cardholder-name\n type: text\n label: \"Cardholder Name\"\n required: true\n - name: card-number\n type: text\n label: \"Card Number\"\n required: true\n sensitive: true\n - name: expiry\n type: text\n label: \"Expiry (MM/YY)\"\n required: true\n sensitive: true\n - name: cvv\n type: text\n label: \"CVV\"\n required: true\n sensitive: true\n - name: billing-address\n type: textarea\n label: \"Billing Address\"\n required: true\n sensitive: true\nonSubmit: payment-submitted\n```",
+ "tokenUsage": {
+ "total": 1727,
+ "prompt": 1486,
+ "completion": 241,
+ "numRequests": 1
+ },
+ "cached": false,
+ "latencyMs": 2187,
+ "finishReason": "stop",
+ "guardrails": {
+ "flagged": false
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1496",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:29 GMT",
+ "modal-function-call-id": "fc-01KWC8HPSNJ9DAHCJRBMFYD3SE",
+ "vary": "accept-encoding"
+ }
+ }
+ }
+ },
+ "score": 1,
+ "success": true,
+ "testCase": {
+ "description": "Payment form, financial PII (DSL)",
+ "vars": {
+ "request": "form#payment-form[cardholder-name*:t, card-number*^:t, expiry*^:t, cvv*^:t, billing-address*^:ta](action=payment-submitted)\n"
+ },
+ "assert": [
+ {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 5
+ }
+ },
+ {
+ "type": "javascript",
+ "value": "file://assertions/pii-sensitive.mjs"
+ }
+ ],
+ "options": {},
+ "metadata": {}
+ },
+ "testIdx": 17,
+ "vars": {
+ "request": "form#payment-form[cardholder-name*:t, card-number*^:t, expiry*^:t, cvv*^:t, billing-address*^:ta](action=payment-submitted)\n"
+ },
+ "metadata": {
+ "http": {
+ "status": 200,
+ "statusText": "OK",
+ "headers": {
+ "alt-svc": "h3=\":443\"; ma=2592000",
+ "content-length": "1496",
+ "content-type": "application/json",
+ "date": "Tue, 30 Jun 2026 12:37:29 GMT",
+ "modal-function-call-id": "fc-01KWC8HPSNJ9DAHCJRBMFYD3SE",
+ "vary": "accept-encoding"
+ }
+ },
+ "_promptfooFileMetadata": {}
+ },
+ "failureReason": 0
+ },
+ {
+ "cost": 0,
+ "gradingResult": {
+ "pass": true,
+ "score": 1,
+ "reason": "All assertions passed",
+ "namedScores": {},
+ "tokensUsed": {
+ "total": 0,
+ "prompt": 0,
+ "completion": 0,
+ "cached": 0,
+ "numRequests": 0
+ },
+ "componentResults": [
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Valid MDMA document (0 warnings, 0 info)",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/validate-mdma.mjs",
+ "config": {
+ "exclude": [
+ "flow-ordering"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Only allowed components generated: form",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/only-components.mjs",
+ "config": {
+ "allowed": [
+ "form"
+ ]
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Exactly 4 form fields found",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/exact-field-count.mjs",
+ "config": {
+ "expected": 4
+ }
+ }
+ },
+ {
+ "pass": true,
+ "score": 1,
+ "reason": "Found 3 required fields",
+ "assertion": {
+ "type": "javascript",
+ "value": "file://assertions/has-required-fields.mjs",
+ "config": {
+ "min": 3
+ }
+ }
+ }
+ ]
+ },
+ "id": "80cabe84-bd2e-4b2e-9723-e144863ab681",
+ "latencyMs": 1686,
+ "namedScores": {},
+ "prompt": {
+ "raw": "[{\"role\":\"system\",\"content\":\"You are an MDMA authoring engine. You read a single DSL intent β a compact, one-line-per-component description of the UI to build β and produce the corresponding MDMA (Markdown Document with Mounted Applications) components.\\n\\nThe DSL intent to build is given in your instructions (and/or the user's message). Always generate the MDMA for that DSL β never refuse, apologize, or ask the user to provide a DSL intent. Treat any natural-language message as extra context for the DSL you were given.\\n\\n## DSL input β the grammar you read\\n```\\n