Skip to content

Commit f02b096

Browse files
committed
Migrate to gpt-realtime model
1 parent 0a7bb1b commit f02b096

File tree

13 files changed

+512
-166
lines changed

13 files changed

+512
-166
lines changed

examples/realtime/app/server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,6 @@ async def read_index():
160160
if __name__ == "__main__":
161161
import uvicorn
162162

163-
uvicorn.run(app, host="0.0.0.0", port=8000)
163+
# log_level = "debug"
164+
log_level = "info"
165+
uvicorn.run(app, host="0.0.0.0", port=8000, log_level=log_level)

examples/realtime/cli/demo.py

Lines changed: 58 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,17 @@
88
import sounddevice as sd
99

1010
from agents import function_tool
11-
from agents.realtime import RealtimeAgent, RealtimeRunner, RealtimeSession, RealtimeSessionEvent
11+
from agents.realtime import (
12+
RealtimeAgent,
13+
RealtimePlaybackTracker,
14+
RealtimeRunner,
15+
RealtimeSession,
16+
RealtimeSessionEvent,
17+
)
18+
from agents.realtime.model import RealtimeModelConfig
1219

1320
# Audio configuration
14-
CHUNK_LENGTH_S = 0.05 # 50ms
21+
CHUNK_LENGTH_S = 0.04 # 40ms aligns with realtime defaults
1522
SAMPLE_RATE = 24000
1623
FORMAT = np.int16
1724
CHANNELS = 1
@@ -49,11 +56,16 @@ def __init__(self) -> None:
4956
self.audio_player: sd.OutputStream | None = None
5057
self.recording = False
5158

59+
# Playback tracker lets the model know our real playback progress
60+
self.playback_tracker = RealtimePlaybackTracker()
61+
5262
# Audio output state for callback system
53-
self.output_queue: queue.Queue[Any] = queue.Queue(maxsize=10) # Buffer more chunks
63+
# Store tuples: (samples_np, item_id, content_index)
64+
self.output_queue: queue.Queue[Any] = queue.Queue(maxsize=100)
5465
self.interrupt_event = threading.Event()
55-
self.current_audio_chunk: np.ndarray[Any, np.dtype[Any]] | None = None
66+
self.current_audio_chunk: tuple[np.ndarray[Any, np.dtype[Any]], str, int] | None = None
5667
self.chunk_position = 0
68+
self.bytes_per_sample = np.dtype(FORMAT).itemsize
5769

5870
def _output_callback(self, outdata, frames: int, time, status) -> None:
5971
"""Callback for audio output - handles continuous audio stream from server."""
@@ -92,20 +104,29 @@ def _output_callback(self, outdata, frames: int, time, status) -> None:
92104

93105
# Copy data from current chunk to output buffer
94106
remaining_output = len(outdata) - samples_filled
95-
remaining_chunk = len(self.current_audio_chunk) - self.chunk_position
107+
samples, item_id, content_index = self.current_audio_chunk
108+
remaining_chunk = len(samples) - self.chunk_position
96109
samples_to_copy = min(remaining_output, remaining_chunk)
97110

98111
if samples_to_copy > 0:
99-
chunk_data = self.current_audio_chunk[
100-
self.chunk_position : self.chunk_position + samples_to_copy
101-
]
112+
chunk_data = samples[self.chunk_position : self.chunk_position + samples_to_copy]
102113
# More efficient: direct assignment for mono audio instead of reshape
103114
outdata[samples_filled : samples_filled + samples_to_copy, 0] = chunk_data
104115
samples_filled += samples_to_copy
105116
self.chunk_position += samples_to_copy
106117

118+
# Inform playback tracker about played bytes
119+
try:
120+
self.playback_tracker.on_play_bytes(
121+
item_id=item_id,
122+
item_content_index=content_index,
123+
bytes=chunk_data.tobytes(),
124+
)
125+
except Exception:
126+
pass
127+
107128
# If we've used up the entire chunk, reset for next iteration
108-
if self.chunk_position >= len(self.current_audio_chunk):
129+
if self.chunk_position >= len(samples):
109130
self.current_audio_chunk = None
110131
self.chunk_position = 0
111132

@@ -125,7 +146,15 @@ async def run(self) -> None:
125146

126147
try:
127148
runner = RealtimeRunner(agent)
128-
async with await runner.run() as session:
149+
# Attach playback tracker and disable server-side response interruption,
150+
# which can truncate assistant audio when mic picks up speaker output.
151+
model_config: RealtimeModelConfig = {
152+
"playback_tracker": self.playback_tracker,
153+
"initial_model_settings": {
154+
"turn_detection": {"type": "semantic_vad", "interrupt_response": False},
155+
},
156+
}
157+
async with await runner.run(model_config=model_config) as session:
129158
self.session = session
130159
print("Connected. Starting audio recording...")
131160

@@ -170,6 +199,14 @@ async def capture_audio(self) -> None:
170199
read_size = int(SAMPLE_RATE * CHUNK_LENGTH_S)
171200

172201
try:
202+
# Simple energy-based barge-in: if user speaks while audio is playing, interrupt.
203+
def rms_energy(samples: np.ndarray[Any, np.dtype[Any]]) -> float:
204+
if samples.size == 0:
205+
return 0.0
206+
# Normalize int16 to [-1, 1]
207+
x = samples.astype(np.float32) / 32768.0
208+
return float(np.sqrt(np.mean(x * x)))
209+
173210
while self.recording:
174211
# Check if there's enough data to read
175212
if self.audio_stream.read_available < read_size:
@@ -182,8 +219,12 @@ async def capture_audio(self) -> None:
182219
# Convert numpy array to bytes
183220
audio_bytes = data.tobytes()
184221

185-
# Send audio to session
186-
await self.session.send_audio(audio_bytes)
222+
# Half-duplex gating: do not send mic while assistant audio is playing
223+
assistant_playing = (
224+
self.current_audio_chunk is not None or not self.output_queue.empty()
225+
)
226+
if not assistant_playing:
227+
await self.session.send_audio(audio_bytes)
187228

188229
# Yield control back to event loop
189230
await asyncio.sleep(0)
@@ -212,17 +253,19 @@ async def _on_event(self, event: RealtimeSessionEvent) -> None:
212253
elif event.type == "audio_end":
213254
print("Audio ended")
214255
elif event.type == "audio":
215-
# Enqueue audio for callback-based playback
256+
# Enqueue audio for callback-based playback with metadata
216257
np_audio = np.frombuffer(event.audio.data, dtype=np.int16)
217258
try:
218-
self.output_queue.put_nowait(np_audio)
259+
self.output_queue.put_nowait((np_audio, event.item_id, event.content_index))
219260
except queue.Full:
220261
# Queue is full - only drop if we have significant backlog
221262
# This prevents aggressive dropping that could cause choppiness
222263
if self.output_queue.qsize() > 8: # Keep some buffer
223264
try:
224265
self.output_queue.get_nowait()
225-
self.output_queue.put_nowait(np_audio)
266+
self.output_queue.put_nowait(
267+
(np_audio, event.item_id, event.content_index)
268+
)
226269
except queue.Empty:
227270
pass
228271
# If queue isn't too full, just skip this chunk to avoid blocking

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ requires-python = ">=3.9"
77
license = "MIT"
88
authors = [{ name = "OpenAI", email = "[email protected]" }]
99
dependencies = [
10-
"openai>=1.104.1,<2",
10+
"openai>=1.105,<2",
1111
"pydantic>=2.10, <3",
1212
"griffe>=1.5.6, <2",
1313
"typing-extensions>=4.12.2, <5",

src/agents/realtime/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
RealtimeModelName: TypeAlias = Union[
1717
Literal[
18+
"gpt-realtime",
1819
"gpt-4o-realtime-preview",
1920
"gpt-4o-mini-realtime-preview",
2021
"gpt-4o-realtime-preview-2025-06-03",

0 commit comments

Comments (0)