10
10
11
11
import pydantic
12
12
import websockets
13
+ from openai .types .realtime import realtime_audio_config as _rt_audio_config
13
14
from openai .types .realtime .conversation_item import (
14
15
ConversationItem ,
15
16
ConversationItem as OpenAIConversationItem ,
29
30
from openai .types .realtime .input_audio_buffer_commit_event import (
30
31
InputAudioBufferCommitEvent as OpenAIInputAudioBufferCommitEvent ,
31
32
)
32
- from openai .types .realtime .realtime_audio_config import (
33
- RealtimeAudioConfig as OpenAIRealtimeAudioConfig ,
34
- RealtimeAudioConfigInput as OpenAIRealtimeAudioInput ,
35
- RealtimeAudioConfigOutput as OpenAIRealtimeAudioOutput ,
36
- )
37
33
from openai .types .realtime .realtime_client_event import (
38
34
RealtimeClientEvent as OpenAIRealtimeClientEvent ,
39
35
)
62
58
from openai .types .realtime .realtime_tracing_config import (
63
59
TracingConfiguration as OpenAITracingConfiguration ,
64
60
)
61
+ from openai .types .realtime .realtime_transcription_session_create_request import (
62
+ RealtimeTranscriptionSessionCreateRequest as OpenAIRealtimeTranscriptionSessionCreateRequest ,
63
+ )
65
64
from openai .types .realtime .response_audio_delta_event import ResponseAudioDeltaEvent
66
65
from openai .types .realtime .response_cancel_event import (
67
66
ResponseCancelEvent as OpenAIResponseCancelEvent ,
@@ -535,7 +534,8 @@ async def _handle_ws_event(self, event: dict[str, Any]):
535
534
if status not in ("in_progress" , "completed" , "incomplete" ):
536
535
is_done = event .get ("type" ) == "response.output_item.done"
537
536
status = "completed" if is_done else "in_progress"
538
- type_adapter = TypeAdapter (RealtimeMessageItem )
537
+ # Explicitly type the adapter for mypy
538
+ type_adapter : TypeAdapter [RealtimeMessageItem ] = TypeAdapter (RealtimeMessageItem )
539
539
message_item : RealtimeMessageItem = type_adapter .validate_python (
540
540
{
541
541
"item_id" : item .get ("id" , "" ),
@@ -559,21 +559,21 @@ async def _handle_ws_event(self, event: dict[str, Any]):
559
559
except Exception as e :
560
560
event_type = event .get ("type" , "unknown" ) if isinstance (event , dict ) else "unknown"
561
561
logger .error (f"Failed to validate server event: { event } " , exc_info = True )
562
- event = RealtimeModelExceptionEvent (
562
+ exception_event = RealtimeModelExceptionEvent (
563
563
exception = e ,
564
564
context = f"Failed to validate server event: { event_type } " ,
565
565
)
566
- await self ._emit_event (event )
566
+ await self ._emit_event (exception_event )
567
567
return
568
568
569
569
if parsed .type == "response.output_audio.delta" :
570
570
await self ._handle_audio_delta (parsed )
571
571
elif parsed .type == "response.output_audio.done" :
572
- event = RealtimeModelAudioDoneEvent (
572
+ audio_done_event = RealtimeModelAudioDoneEvent (
573
573
item_id = parsed .item_id ,
574
574
content_index = parsed .content_index ,
575
575
)
576
- await self ._emit_event (event )
576
+ await self ._emit_event (audio_done_event )
577
577
elif parsed .type == "input_audio_buffer.speech_started" :
578
578
# On VAD speech start, immediately stop local playback so the user can
579
579
# barge‑in without overlapping assistant audio.
@@ -673,17 +673,39 @@ async def _handle_ws_event(self, event: dict[str, Any]):
673
673
)
674
674
)
675
675
676
- def _update_created_session (self , session : OpenAISessionCreateRequest ) -> None :
677
- self ._created_session = session
678
- if (
679
- session .audio is not None
680
- and session .audio .output is not None
681
- and session .audio .output .format is not None
682
- ):
683
- audio_format = session .audio .output .format
684
- self ._audio_state_tracker .set_audio_format (audio_format )
685
- if self ._playback_tracker :
686
- self ._playback_tracker .set_audio_format (audio_format )
676
def _update_created_session(
    self,
    session: OpenAISessionCreateRequest | OpenAIRealtimeTranscriptionSessionCreateRequest,
) -> None:
    """Remember the created session and forward its output audio format.

    Only ``OpenAISessionCreateRequest`` instances are handled; any other
    session object (e.g. a transcription session request) is ignored and
    leaves the stored session and the trackers untouched.

    Args:
        session: The session object to record.
    """
    if not isinstance(session, OpenAISessionCreateRequest):
        return

    self._created_session = session

    # Walk audio -> output -> format, bailing out as soon as a level is unset.
    audio_config = session.audio
    if audio_config is None:
        return
    output_config = audio_config.output
    if output_config is None:
        return
    output_format = output_config.format
    if output_format is None:
        return

    # Imported lazily, only once a format actually needs to be translated.
    from openai.types.realtime.realtime_audio_formats import (
        AudioPCM,
        AudioPCMA,
        AudioPCMU,
    )

    # OpenAI format classes paired with the string handed to the trackers.
    # Checked in this order; the first isinstance match wins.
    format_names = (
        (AudioPCM, "pcm16"),
        (AudioPCMU, "g711_ulaw"),
        (AudioPCMA, "g711_alaw"),
    )

    normalized: str
    for format_cls, tracker_name in format_names:
        if isinstance(output_format, format_cls):
            normalized = tracker_name
            break
    else:
        # Unrecognized object: use its ``type`` attribute when present,
        # otherwise its string representation.
        normalized = cast("str", getattr(output_format, "type", str(output_format)))

    self._audio_state_tracker.set_audio_format(normalized)
    playback_tracker = self._playback_tracker
    if playback_tracker:
        playback_tracker.set_audio_format(normalized)
687
709
688
710
async def _update_session_config (self , model_settings : RealtimeSessionModelSettings ) -> None :
689
711
session_config = self ._get_session_config (model_settings )
@@ -718,6 +740,11 @@ def _get_session_config(
718
740
DEFAULT_MODEL_SETTINGS .get ("output_audio_format" ),
719
741
)
720
742
743
+ # Avoid direct imports of non-exported names by referencing via module
744
+ OpenAIRealtimeAudioConfig = _rt_audio_config .RealtimeAudioConfig
745
+ OpenAIRealtimeAudioInput = _rt_audio_config .RealtimeAudioConfigInput # type: ignore[attr-defined]
746
+ OpenAIRealtimeAudioOutput = _rt_audio_config .RealtimeAudioConfigOutput # type: ignore[attr-defined]
747
+
721
748
input_audio_config = None
722
749
if any (
723
750
value is not None
@@ -816,7 +843,7 @@ def conversation_item_to_realtime_message_item(
816
843
),
817
844
):
818
845
raise ValueError ("Unsupported conversation item type for message conversion." )
819
- content : list [dict ] = []
846
+ content : list [dict [ str , Any ] ] = []
820
847
for each in item .content :
821
848
c = each .model_dump ()
822
849
if each .type == "output_text" :
0 commit comments