@@ -298,6 +298,7 @@ class AudioStreamer {
298298 } else if (jsType && strcmp (jsType, " response.audio.delta" ) == 0 ) {
299299 const char * jsonAudio = cJSON_GetObjectCstr (json, " delta" );
300300 playback_clear_requested = false ;
301+ m_response_audio_done = false ;
301302
302303 if (jsonAudio && strlen (jsonAudio) > 0 ) {
303304 std::string rawAudio;
@@ -336,7 +337,10 @@ class AudioStreamer {
336337 } else {
337338 switch_log_printf (SWITCH_CHANNEL_SESSION_LOG (session), SWITCH_LOG_ERROR, " (%s) processMessage - response.audio.delta no audio data\n " , m_sessionId.c_str ());
338339 }
339- }
340+ } else if (jsType && strcmp (jsType, " response.audio.done" ) == 0 ) {
341+ switch_log_printf (SWITCH_CHANNEL_SESSION_LOG (session), SWITCH_LOG_DEBUG, " (%s) processMessage - audio done\n " , m_sessionId.c_str ());
342+ m_response_audio_done = true ;
343+ }
340344 cJSON_Delete (json);
341345 return status;
342346 }
@@ -351,6 +355,7 @@ class AudioStreamer {
351355 void push_audio_queue (const std::vector<int16_t >& audio_data) {
352356 std::lock_guard<std::mutex> lock (m_audio_queue_mutex);
353357 m_audio_queue.push (audio_data);
358+ switch_log_printf (SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, " (%s) audio queue size: %zu\n " , m_sessionId.c_str (), m_audio_queue.size ());
354359 }
355360
356361 std::vector<int16_t > pop_audio_queue () {
@@ -429,6 +434,33 @@ class AudioStreamer {
429434 return playback_clear_requested;
430435 }
431436
437+ bool is_openai_speaking () {
438+ return m_openai_speaking;
439+ }
440+
441+ bool is_response_audio_done () {
442+ return m_response_audio_done;
443+ }
444+
445+ void openai_speech_started () {
446+ m_openai_speaking = true ;
447+ switch_core_session_t * psession = switch_core_session_locate (m_sessionId.c_str ());
448+ switch_log_printf (SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, " (%s) Openai started speaking\n " , m_sessionId.c_str ());
449+ const char *payload = " {\" status\" :\" started\" }" ;
450+ m_notify (psession, EVENT_OPENAI_SPEECH_STARTED, payload);
451+ switch_core_session_rwunlock (psession);
452+ }
453+
454+ void openai_speech_stopped () {
455+ m_openai_speaking = false ;
456+ switch_core_session_t * psession = switch_core_session_locate (m_sessionId.c_str ());
457+ switch_log_printf (SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, " (%s) Openai stopped speaking\n " , m_sessionId.c_str ());
458+
459+ const char *payload = " {\" status\" :\" stopped\" }" ;
460+ m_notify (psession, EVENT_OPENAI_SPEECH_STOPPED, payload);
461+ switch_core_session_rwunlock (psession);
462+ }
463+
432464
433465private:
434466 std::string m_sessionId;
@@ -446,6 +478,8 @@ class AudioStreamer {
446478 std::mutex m_audio_queue_mutex;
447479 bool playback_clear_requested = false ;
448480 bool m_disable_audiofiles = false ; // disable saving audio files if true
481+ bool m_openai_speaking = false ;
482+ bool m_response_audio_done = false ;
449483};
450484
451485
@@ -983,11 +1017,16 @@ extern "C" {
9831017
9841018 if (as->clear_requested ()) {
9851019 switch_buffer_zero (tech_pvt->playback_buffer );
1020+ inuse = 0 ;
9861021 }
9871022 if (inuse < bytes_needed * 2 && !as->is_audio_queue_empty ()) {
9881023 auto chunk = as->pop_audio_queue ();
9891024 switch_buffer_write (tech_pvt->playback_buffer , chunk.data (), chunk.size () * sizeof (int16_t ));
9901025 } else if (inuse == 0 ) {
1026+ // Openai just finished speaking for interruption or end of response
1027+ if (as->is_openai_speaking () && as->is_response_audio_done ()) {
1028+ as->openai_speech_stopped ();
1029+ }
9911030 return SWITCH_TRUE;
9921031 }
9931032
@@ -999,6 +1038,10 @@ extern "C" {
9991038 switch_buffer_read (tech_pvt->playback_buffer , data, inuse);
10001039 }
10011040
1041+ if (!as->is_openai_speaking ()) {
1042+ as->openai_speech_started ();
1043+ }
1044+
10021045 frame->datalen = inuse > bytes_needed ? bytes_needed : inuse;
10031046 frame->samples = frame->datalen / bytes_per_sample;
10041047
@@ -1043,4 +1086,3 @@ extern "C" {
10431086 return SWITCH_STATUS_FALSE;
10441087 }
10451088}
1046-
0 commit comments