diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index eb8c16d1f..edd0d898b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -84,7 +84,5 @@ jobs:
enable-cache: true
- name: Install dependencies
run: make sync
- - name: Install Python 3.9 dependencies
- run: UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --all-extras --all-packages --group dev
- name: Run tests
run: make old_version_tests
diff --git a/.gitignore b/.gitignore
index 2e9b92379..c0c4b3254 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,7 +100,8 @@ celerybeat.pid
*.sage.py
# Environments
-.env
+.python-version
+.env*
.venv
env/
venv/
diff --git a/Makefile b/Makefile
index 470d97c14..506f198a9 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,8 @@ snapshots-create:
uv run pytest --inline-snapshot=create
.PHONY: old_version_tests
-old_version_tests:
+old_version_tests:
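+	# Create/refresh a dedicated Python 3.9 environment (.venv_39), then run the tests against it.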
+ UV_PROJECT_ENVIRONMENT=.venv_39 uv sync --python 3.9 --all-extras --all-packages --group dev
UV_PROJECT_ENVIRONMENT=.venv_39 uv run --python 3.9 -m pytest
.PHONY: build-docs
diff --git a/examples/realtime/app/README.md b/examples/realtime/app/README.md
index cb5519a79..420134bba 100644
--- a/examples/realtime/app/README.md
+++ b/examples/realtime/app/README.md
@@ -29,14 +29,19 @@ To use the same UI with your own agents, edit `agent.py` and ensure get_starting
1. Click **Connect** to establish a realtime session
2. Audio capture starts automatically - just speak naturally
3. Click the **Mic On/Off** button to mute/unmute your microphone
-4. Watch the conversation unfold in the left pane
-5. Monitor raw events in the right pane (click to expand/collapse)
-6. Click **Disconnect** when done
+4. To send an image, enter an optional prompt, click **🖼️ Send Image**, and select a file
+5. Watch the conversation unfold in the left pane (image thumbnails are shown)
+6. Monitor raw events in the right pane (click to expand/collapse)
+7. Click **Disconnect** when done
## Architecture
- **Backend**: FastAPI server with WebSocket connections for real-time communication
- **Session Management**: Each connection gets a unique session with the OpenAI Realtime API
+- **Image Inputs**: The UI uploads images and the server forwards a
+ `conversation.item.create` event with `input_image` (plus optional `input_text`),
+ followed by `response.create` to start the model response. The messages pane
+ renders image bubbles for `input_image` content (see the payload sketch below this list).
- **Audio Processing**: 24kHz mono audio capture and playback
- **Event Handling**: Full event stream processing with transcript generation
- **Frontend**: Vanilla JavaScript with clean, responsive CSS
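+
+For reference, the structured user message the server builds for an uploaded image looks
+roughly like this (a sketch based on `server.py`; the values below are illustrative, and
+`prompt_text` falls back to "Please describe this image." when no prompt is entered):
+
+```python
+data_url = "data:image/jpeg;base64,..."      # produced and resized client-side
+prompt_text = "Please describe this image."  # default when no prompt is entered
+
+user_msg = {
+    "type": "message",
+    "role": "user",
+    "content": [
+        {"type": "input_image", "image_url": data_url, "detail": "high"},
+        {"type": "input_text", "text": prompt_text},
+    ],
+}
+# The server then calls: await manager.send_user_message(session_id, user_msg)
+```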
diff --git a/examples/realtime/app/server.py b/examples/realtime/app/server.py
index 26c544dd2..d4ff47e80 100644
--- a/examples/realtime/app/server.py
+++ b/examples/realtime/app/server.py
@@ -12,6 +12,8 @@
from typing_extensions import assert_never
from agents.realtime import RealtimeRunner, RealtimeSession, RealtimeSessionEvent
+from agents.realtime.config import RealtimeUserInputMessage
+from agents.realtime.model_inputs import RealtimeModelSendRawMessage
# Import TwilioHandler class - handle both module and package use cases
if TYPE_CHECKING:
@@ -64,6 +66,34 @@ async def send_audio(self, session_id: str, audio_bytes: bytes):
if session_id in self.active_sessions:
await self.active_sessions[session_id].send_audio(audio_bytes)
+ async def send_client_event(self, session_id: str, event: dict[str, Any]):
+ """Send a raw client event to the underlying realtime model."""
+ session = self.active_sessions.get(session_id)
+ if not session:
+ return
+ await session.model.send_event(
+ RealtimeModelSendRawMessage(
+ message={
+ "type": event["type"],
+ "other_data": {k: v for k, v in event.items() if k != "type"},
+ }
+ )
+ )
+
+ async def send_user_message(self, session_id: str, message: RealtimeUserInputMessage):
+ """Send a structured user message via the higher-level API (supports input_image)."""
+ session = self.active_sessions.get(session_id)
+ if not session:
+ return
+ await session.send_message(message) # delegates to RealtimeModelSendUserInput path
+
+ async def interrupt(self, session_id: str) -> None:
+ """Interrupt current model playback/response for a session."""
+ session = self.active_sessions.get(session_id)
+ if not session:
+ return
+ await session.interrupt()
+
async def _process_events(self, session_id: str):
try:
session = self.active_sessions[session_id]
@@ -101,7 +131,11 @@ async def _serialize_event(self, event: RealtimeSessionEvent) -> dict[str, Any]:
elif event.type == "history_updated":
base_event["history"] = [item.model_dump(mode="json") for item in event.history]
elif event.type == "history_added":
- pass
+ # Provide the added item so the UI can render incrementally.
+ try:
+ base_event["item"] = event.item.model_dump(mode="json")
+ except Exception:
+ base_event["item"] = None
elif event.type == "guardrail_tripped":
base_event["guardrail_results"] = [
{"name": result.guardrail.name} for result in event.guardrail_results
@@ -134,6 +168,7 @@ async def lifespan(app: FastAPI):
@app.websocket("/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str):
await manager.connect(websocket, session_id)
+ image_buffers: dict[str, dict[str, Any]] = {}
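+ # Buffers for chunked image uploads, keyed by image id: {"text": prompt, "chunks": [data-URL pieces]}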
try:
while True:
data = await websocket.receive_text()
@@ -144,6 +179,124 @@ async def websocket_endpoint(websocket: WebSocket, session_id: str):
int16_data = message["data"]
audio_bytes = struct.pack(f"{len(int16_data)}h", *int16_data)
await manager.send_audio(session_id, audio_bytes)
+ elif message["type"] == "image":
+ logger.info("Received image message from client (session %s).", session_id)
+ # Build a conversation.item.create with input_image (and optional input_text)
+ data_url = message.get("data_url")
+ prompt_text = message.get("text") or "Please describe this image."
+ if data_url:
+ logger.info(
+ "Forwarding image (structured message) to Realtime API (len=%d).",
+ len(data_url),
+ )
+ user_msg: RealtimeUserInputMessage = {
+ "type": "message",
+ "role": "user",
+ "content": (
+ [
+ {"type": "input_image", "image_url": data_url, "detail": "high"},
+ {"type": "input_text", "text": prompt_text},
+ ]
+ if prompt_text
+ else [
+ {"type": "input_image", "image_url": data_url, "detail": "high"}
+ ]
+ ),
+ }
+ await manager.send_user_message(session_id, user_msg)
+ # Acknowledge to client UI
+ await websocket.send_text(
+ json.dumps(
+ {
+ "type": "client_info",
+ "info": "image_enqueued",
+ "size": len(data_url),
+ }
+ )
+ )
+ else:
+ await websocket.send_text(
+ json.dumps(
+ {
+ "type": "error",
+ "error": "No data_url for image message.",
+ }
+ )
+ )
+ elif message["type"] == "commit_audio":
+ # Force close the current input audio turn
+ await manager.send_client_event(session_id, {"type": "input_audio_buffer.commit"})
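+ # Chunked image upload: the client sends image_start (id + optional prompt),
+ # then a series of image_chunk frames, then image_end; the server reassembles
+ # the data URL below and forwards it as a single structured user message.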
+ elif message["type"] == "image_start":
+ img_id = str(message.get("id"))
+ image_buffers[img_id] = {
+ "text": message.get("text") or "Please describe this image.",
+ "chunks": [],
+ }
+ await websocket.send_text(
+ json.dumps({"type": "client_info", "info": "image_start_ack", "id": img_id})
+ )
+ elif message["type"] == "image_chunk":
+ img_id = str(message.get("id"))
+ chunk = message.get("chunk", "")
+ if img_id in image_buffers:
+ image_buffers[img_id]["chunks"].append(chunk)
+ if len(image_buffers[img_id]["chunks"]) % 10 == 0:
+ await websocket.send_text(
+ json.dumps(
+ {
+ "type": "client_info",
+ "info": "image_chunk_ack",
+ "id": img_id,
+ "count": len(image_buffers[img_id]["chunks"]),
+ }
+ )
+ )
+ elif message["type"] == "image_end":
+ img_id = str(message.get("id"))
+ buf = image_buffers.pop(img_id, None)
+ if buf is None:
+ await websocket.send_text(
+ json.dumps({"type": "error", "error": "Unknown image id for image_end."})
+ )
+ else:
+ data_url = "".join(buf["chunks"]) if buf["chunks"] else None
+ prompt_text = buf["text"]
+ if data_url:
+ logger.info(
+ "Forwarding chunked image (structured message) to Realtime API (len=%d).",
+ len(data_url),
+ )
+ user_msg2: RealtimeUserInputMessage = {
+ "type": "message",
+ "role": "user",
+ "content": (
+ [
+ {"type": "input_image", "image_url": data_url, "detail": "high"},
+ {"type": "input_text", "text": prompt_text},
+ ]
+ if prompt_text
+ else [
+ {"type": "input_image", "image_url": data_url, "detail": "high"}
+ ]
+ ),
+ }
+ await manager.send_user_message(session_id, user_msg2)
+ await websocket.send_text(
+ json.dumps(
+ {
+ "type": "client_info",
+ "info": "image_enqueued",
+ "id": img_id,
+ "size": len(data_url),
+ }
+ )
+ )
+ else:
+ await websocket.send_text(
+ json.dumps({"type": "error", "error": "Empty image."})
+ )
+ elif message["type"] == "interrupt":
+ await manager.interrupt(session_id)
except WebSocketDisconnect:
await manager.disconnect(session_id)
@@ -160,4 +313,10 @@ async def read_index():
if __name__ == "__main__":
import uvicorn
- uvicorn.run(app, host="0.0.0.0", port=8000)
+ uvicorn.run(
+ app,
+ host="0.0.0.0",
+ port=8000,
+ # Increased WebSocket frame size to comfortably handle image data URLs.
+ ws_max_size=16 * 1024 * 1024,
+ )
diff --git a/examples/realtime/app/static/app.js b/examples/realtime/app/static/app.js
index 3ec8fcc99..6858428c6 100644
--- a/examples/realtime/app/static/app.js
+++ b/examples/realtime/app/static/app.js
@@ -8,26 +8,33 @@ class RealtimeDemo {
this.processor = null;
this.stream = null;
this.sessionId = this.generateSessionId();
-
+
// Audio playback queue
this.audioQueue = [];
this.isPlayingAudio = false;
this.playbackAudioContext = null;
this.currentAudioSource = null;
-
+ this.currentAudioGain = null; // per-chunk gain for smooth fades
+ this.playbackFadeSec = 0.02; // ~20ms fade to reduce clicks
+ this.messageNodes = new Map(); // item_id -> DOM node
+ this.seenItemIds = new Set(); // item_id set for append-only syncing
+
this.initializeElements();
this.setupEventListeners();
}
-
+
initializeElements() {
this.connectBtn = document.getElementById('connectBtn');
this.muteBtn = document.getElementById('muteBtn');
+ this.imageBtn = document.getElementById('imageBtn');
+ this.imageInput = document.getElementById('imageInput');
+ this.imagePrompt = document.getElementById('imagePrompt');
this.status = document.getElementById('status');
this.messagesContent = document.getElementById('messagesContent');
this.eventsContent = document.getElementById('eventsContent');
this.toolsContent = document.getElementById('toolsContent');
}
-
+
setupEventListeners() {
this.connectBtn.addEventListener('click', () => {
if (this.isConnected) {
@@ -36,52 +43,99 @@ class RealtimeDemo {
this.connect();
}
});
-
+
this.muteBtn.addEventListener('click', () => {
this.toggleMute();
});
+
+ // Image upload
+ this.imageBtn.addEventListener('click', (e) => {
+ e.preventDefault();
+ e.stopPropagation();
+ console.log('Send Image clicked');
+ // Programmatically open the hidden file input
+ this.imageInput.click();
+ });
+
+ this.imageInput.addEventListener('change', async (e) => {
+ console.log('Image input change fired');
+ const file = e.target.files && e.target.files[0];
+ if (!file) return;
+ await this._handlePickedFile(file);
+ this.imageInput.value = '';
+ });
+
+ this._handlePickedFile = async (file) => {
+ try {
+ const dataUrl = await this.prepareDataURL(file);
+ const promptText = (this.imagePrompt && this.imagePrompt.value) || '';
+ // Send to server; server forwards to Realtime API.
+ // Use chunked frames to avoid WS frame limits.
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+ console.log('Interrupting and sending image (chunked) to server WebSocket');
+ // Stop any current audio locally and tell model to interrupt
+ this.stopAudioPlayback();
+ this.ws.send(JSON.stringify({ type: 'interrupt' }));
+ const id = 'img_' + Math.random().toString(36).slice(2);
+ const CHUNK = 60_000; // ~60KB per frame
+ this.ws.send(JSON.stringify({ type: 'image_start', id, text: promptText }));
+ for (let i = 0; i < dataUrl.length; i += CHUNK) {
+ const chunk = dataUrl.slice(i, i + CHUNK);
+ this.ws.send(JSON.stringify({ type: 'image_chunk', id, chunk }));
+ }
+ this.ws.send(JSON.stringify({ type: 'image_end', id }));
+ } else {
+ console.warn('Not connected; image will not be sent. Click Connect first.');
+ }
+ // Add to UI immediately for better feedback
+ console.log('Adding local user image bubble');
+ this.addUserImageMessage(dataUrl, promptText);
+ } catch (err) {
+ console.error('Failed to process image:', err);
+ }
+ };
}
-
+
generateSessionId() {
return 'session_' + Math.random().toString(36).substr(2, 9);
}
-
+
async connect() {
try {
this.ws = new WebSocket(`ws://localhost:8000/ws/${this.sessionId}`);
-
+
this.ws.onopen = () => {
this.isConnected = true;
this.updateConnectionUI();
this.startContinuousCapture();
};
-
+
this.ws.onmessage = (event) => {
const data = JSON.parse(event.data);
this.handleRealtimeEvent(data);
};
-
+
this.ws.onclose = () => {
this.isConnected = false;
this.updateConnectionUI();
};
-
+
this.ws.onerror = (error) => {
console.error('WebSocket error:', error);
};
-
+
} catch (error) {
console.error('Failed to connect:', error);
}
}
-
+
disconnect() {
if (this.ws) {
this.ws.close();
}
this.stopContinuousCapture();
}
-
+
updateConnectionUI() {
if (this.isConnected) {
this.connectBtn.textContent = 'Disconnect';
@@ -97,12 +151,12 @@ class RealtimeDemo {
this.muteBtn.disabled = true;
}
}
-
+
toggleMute() {
this.isMuted = !this.isMuted;
this.updateMuteUI();
}
-
+
updateMuteUI() {
if (this.isMuted) {
this.muteBtn.textContent = '🔇 Mic Off';
@@ -115,90 +169,128 @@ class RealtimeDemo {
}
}
}
-
+
+ readFileAsDataURL(file) {
+ return new Promise((resolve, reject) => {
+ const reader = new FileReader();
+ reader.onload = () => resolve(reader.result);
+ reader.onerror = reject;
+ reader.readAsDataURL(file);
+ });
+ }
+
+ async prepareDataURL(file) {
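+ // Downscale large images to a max side of 1024px and re-encode as JPEG (quality 0.85)
+ // so the resulting data URL stays small enough for the chunked WebSocket upload.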
+ const original = await this.readFileAsDataURL(file);
+ try {
+ const img = new Image();
+ img.decoding = 'async';
+ const loaded = new Promise((res, rej) => {
+ img.onload = () => res();
+ img.onerror = rej;
+ });
+ img.src = original;
+ await loaded;
+
+ const maxDim = 1024;
+ const maxSide = Math.max(img.width, img.height);
+ const scale = maxSide > maxDim ? (maxDim / maxSide) : 1;
+ const w = Math.max(1, Math.round(img.width * scale));
+ const h = Math.max(1, Math.round(img.height * scale));
+
+ const canvas = document.createElement('canvas');
+ canvas.width = w; canvas.height = h;
+ const ctx = canvas.getContext('2d');
+ ctx.drawImage(img, 0, 0, w, h);
+ return canvas.toDataURL('image/jpeg', 0.85);
+ } catch (e) {
+ console.warn('Image resize failed; sending original', e);
+ return original;
+ }
+ }
+
async startContinuousCapture() {
if (!this.isConnected || this.isCapturing) return;
-
+
// Check if getUserMedia is available
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
throw new Error('getUserMedia not available. Please use HTTPS or localhost.');
}
-
+
try {
- this.stream = await navigator.mediaDevices.getUserMedia({
+ this.stream = await navigator.mediaDevices.getUserMedia({
audio: {
sampleRate: 24000,
channelCount: 1,
echoCancellation: true,
noiseSuppression: true
- }
+ }
});
-
- this.audioContext = new AudioContext({ sampleRate: 24000 });
+
+ this.audioContext = new AudioContext({ sampleRate: 24000, latencyHint: 'interactive' });
const source = this.audioContext.createMediaStreamSource(this.stream);
-
+
// Create a script processor to capture audio data
this.processor = this.audioContext.createScriptProcessor(4096, 1, 1);
source.connect(this.processor);
this.processor.connect(this.audioContext.destination);
-
+
this.processor.onaudioprocess = (event) => {
if (!this.isMuted && this.ws && this.ws.readyState === WebSocket.OPEN) {
const inputBuffer = event.inputBuffer.getChannelData(0);
const int16Buffer = new Int16Array(inputBuffer.length);
-
+
// Convert float32 to int16
for (let i = 0; i < inputBuffer.length; i++) {
int16Buffer[i] = Math.max(-32768, Math.min(32767, inputBuffer[i] * 32768));
}
-
+
this.ws.send(JSON.stringify({
type: 'audio',
data: Array.from(int16Buffer)
}));
}
};
-
+
this.isCapturing = true;
this.updateMuteUI();
-
+
} catch (error) {
console.error('Failed to start audio capture:', error);
}
}
-
+
stopContinuousCapture() {
if (!this.isCapturing) return;
-
+
this.isCapturing = false;
-
+
if (this.processor) {
this.processor.disconnect();
this.processor = null;
}
-
+
if (this.audioContext) {
this.audioContext.close();
this.audioContext = null;
}
-
+
if (this.stream) {
this.stream.getTracks().forEach(track => track.stop());
this.stream = null;
}
-
+
this.updateMuteUI();
}
-
+
handleRealtimeEvent(event) {
// Add to raw events pane
this.addRawEvent(event);
-
+
// Add to tools panel if it's a tool or handoff event
if (event.type === 'tool_start' || event.type === 'tool_end' || event.type === 'handoff') {
this.addToolEvent(event);
}
-
+
// Handle specific event types
switch (event.type) {
case 'audio':
@@ -207,115 +299,214 @@ class RealtimeDemo {
case 'audio_interrupted':
this.stopAudioPlayback();
break;
+ case 'input_audio_timeout_triggered':
+ // Ask server to commit the input buffer to expedite model response
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
+ this.ws.send(JSON.stringify({ type: 'commit_audio' }));
+ }
+ break;
case 'history_updated':
- this.updateMessagesFromHistory(event.history);
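+ // On history snapshots, append any items not yet rendered, then refresh the
+ // text of the most recent message (e.g. a streaming transcript update).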
+ this.syncMissingFromHistory(event.history);
+ this.updateLastMessageFromHistory(event.history);
+ break;
+ case 'history_added':
+ // Append just the new item without clearing the thread.
+ if (event.item) {
+ this.addMessageFromItem(event.item);
+ }
break;
}
}
-
-
- updateMessagesFromHistory(history) {
- console.log('updateMessagesFromHistory called with:', history);
-
- // Clear all existing messages
- this.messagesContent.innerHTML = '';
-
- // Add messages from history
- if (history && Array.isArray(history)) {
- console.log('Processing history array with', history.length, 'items');
- history.forEach((item, index) => {
- console.log(`History item ${index}:`, item);
- if (item.type === 'message') {
- const role = item.role;
- let content = '';
-
- console.log(`Message item - role: ${role}, content:`, item.content);
-
- if (item.content && Array.isArray(item.content)) {
- // Extract text from content array
- item.content.forEach(contentPart => {
- console.log('Content part:', contentPart);
- if (contentPart.type === 'text' && contentPart.text) {
- content += contentPart.text;
- } else if (contentPart.type === 'input_text' && contentPart.text) {
- content += contentPart.text;
- } else if (contentPart.type === 'input_audio' && contentPart.transcript) {
- content += contentPart.transcript;
- } else if (contentPart.type === 'audio' && contentPart.transcript) {
- content += contentPart.transcript;
- }
- });
- }
-
- console.log(`Final content for ${role}:`, content);
-
- if (content.trim()) {
- this.addMessage(role, content.trim());
- console.log(`Added message: ${role} - ${content.trim()}`);
+ updateLastMessageFromHistory(history) {
+ if (!history || !Array.isArray(history) || history.length === 0) return;
+ // Find the last message item in history
+ let last = null;
+ for (let i = history.length - 1; i >= 0; i--) {
+ const it = history[i];
+ if (it && it.type === 'message') { last = it; break; }
+ }
+ if (!last) return;
+ const itemId = last.item_id;
+
+ // Extract a text representation (for assistant transcript updates)
+ let text = '';
+ if (Array.isArray(last.content)) {
+ for (const part of last.content) {
+ if (!part || typeof part !== 'object') continue;
+ if (part.type === 'text' && part.text) text += part.text;
+ else if (part.type === 'input_text' && part.text) text += part.text;
+ else if ((part.type === 'input_audio' || part.type === 'audio') && part.transcript) text += part.transcript;
+ }
+ }
+
+ const node = this.messageNodes.get(itemId);
+ if (!node) {
+ // If we haven't rendered this item yet, append it now.
+ this.addMessageFromItem(last);
+ return;
+ }
+
+ // Update only the text content of the bubble, preserving any images already present.
+ const bubble = node.querySelector('.message-bubble');
+ if (bubble && text && text.trim()) {
+ // If there's an image in the bubble, keep it and only update the trailing caption/text node.
+ const hasImg = !!bubble.querySelector('img');
+ if (hasImg) {
+ // Ensure there is a caption div after the image
+ let cap = bubble.querySelector('.image-caption');
+ if (!cap) {
+ cap = document.createElement('div');
+ cap.className = 'image-caption';
+ cap.style.marginTop = '0.5rem';
+ bubble.appendChild(cap);
+ }
+ cap.textContent = text.trim();
+ } else {
+ bubble.textContent = text.trim();
+ }
+ this.scrollToBottom();
+ }
+ }
+
+ syncMissingFromHistory(history) {
+ if (!history || !Array.isArray(history)) return;
+ for (const item of history) {
+ if (!item || item.type !== 'message') continue;
+ const id = item.item_id;
+ if (!id) continue;
+ if (!this.seenItemIds.has(id)) {
+ this.addMessageFromItem(item);
+ }
+ }
+ }
+
+ addMessageFromItem(item) {
+ try {
+ if (!item || item.type !== 'message') return;
+ const role = item.role;
+ let content = '';
+ let imageUrls = [];
+
+ if (Array.isArray(item.content)) {
+ for (const contentPart of item.content) {
+ if (!contentPart || typeof contentPart !== 'object') continue;
+ if (contentPart.type === 'text' && contentPart.text) {
+ content += contentPart.text;
+ } else if (contentPart.type === 'input_text' && contentPart.text) {
+ content += contentPart.text;
+ } else if (contentPart.type === 'input_audio' && contentPart.transcript) {
+ content += contentPart.transcript;
+ } else if (contentPart.type === 'audio' && contentPart.transcript) {
+ content += contentPart.transcript;
+ } else if (contentPart.type === 'input_image') {
+ const url = contentPart.image_url || contentPart.url;
+ if (typeof url === 'string' && url) imageUrls.push(url);
}
- } else {
- console.log(`Skipping non-message item of type: ${item.type}`);
}
- });
- } else {
- console.log('History is not an array or is null/undefined');
+ }
+
+ let node = null;
+ if (imageUrls.length > 0) {
+ for (const url of imageUrls) {
+ node = this.addImageMessage(role, url, content.trim());
+ }
+ } else if (content && content.trim()) {
+ node = this.addMessage(role, content.trim());
+ }
+ if (node && item.item_id) {
+ this.messageNodes.set(item.item_id, node);
+ this.seenItemIds.add(item.item_id);
+ }
+ } catch (e) {
+ console.error('Failed to add message from item:', e, item);
}
-
- this.scrollToBottom();
}
-
+
addMessage(type, content) {
const messageDiv = document.createElement('div');
messageDiv.className = `message ${type}`;
-
+
const bubbleDiv = document.createElement('div');
bubbleDiv.className = 'message-bubble';
bubbleDiv.textContent = content;
-
+
messageDiv.appendChild(bubbleDiv);
this.messagesContent.appendChild(messageDiv);
this.scrollToBottom();
-
+
return messageDiv;
}
-
+
+ addImageMessage(role, imageUrl, caption = '') {
+ const messageDiv = document.createElement('div');
+ messageDiv.className = `message ${role}`;
+
+ const bubbleDiv = document.createElement('div');
+ bubbleDiv.className = 'message-bubble';
+
+ const img = document.createElement('img');
+ img.src = imageUrl;
+ img.alt = 'Uploaded image';
+ img.style.maxWidth = '220px';
+ img.style.borderRadius = '8px';
+ img.style.display = 'block';
+
+ bubbleDiv.appendChild(img);
+ if (caption) {
+ const cap = document.createElement('div');
+ cap.textContent = caption;
+ cap.style.marginTop = '0.5rem';
+ bubbleDiv.appendChild(cap);
+ }
+
+ messageDiv.appendChild(bubbleDiv);
+ this.messagesContent.appendChild(messageDiv);
+ this.scrollToBottom();
+
+ return messageDiv;
+ }
+
+ addUserImageMessage(imageUrl, caption = '') {
+ return this.addImageMessage('user', imageUrl, caption);
+ }
+
addRawEvent(event) {
const eventDiv = document.createElement('div');
eventDiv.className = 'event';
-
+
const headerDiv = document.createElement('div');
headerDiv.className = 'event-header';
headerDiv.innerHTML = `
<span>${event.type}</span>
<span>▼</span>
`;
-
+
const contentDiv = document.createElement('div');
contentDiv.className = 'event-content collapsed';
contentDiv.textContent = JSON.stringify(event, null, 2);
-
+
headerDiv.addEventListener('click', () => {
const isCollapsed = contentDiv.classList.contains('collapsed');
contentDiv.classList.toggle('collapsed');
headerDiv.querySelector('span:last-child').textContent = isCollapsed ? '▲' : '▼';
});
-
+
eventDiv.appendChild(headerDiv);
eventDiv.appendChild(contentDiv);
this.eventsContent.appendChild(eventDiv);
-
+
// Auto-scroll events pane
this.eventsContent.scrollTop = this.eventsContent.scrollHeight;
}
-
+
addToolEvent(event) {
const eventDiv = document.createElement('div');
eventDiv.className = 'event';
-
+
let title = '';
let description = '';
let eventClass = '';
-
+
if (event.type === 'handoff') {
title = `🔄 Handoff`;
description = `From ${event.from} to ${event.to}`;
@@ -329,7 +520,7 @@ class RealtimeDemo {
description = `${event.tool}: ${event.output || 'No output'}`;
eventClass = 'tool';
}
-
+
eventDiv.innerHTML = `