Skip to content

Commit e7bb6c4

Browse files
Add OpenAI speech start and stop events
2 parents 1f55caf + 60fbeec commit e7bb6c4

File tree

3 files changed

+56
-8
lines changed

3 files changed

+56
-8
lines changed

mod_openai_audio_stream.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,9 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
259259
if (switch_event_reserve_subclass(EVENT_JSON) != SWITCH_STATUS_SUCCESS ||
260260
switch_event_reserve_subclass(EVENT_CONNECT) != SWITCH_STATUS_SUCCESS ||
261261
switch_event_reserve_subclass(EVENT_ERROR) != SWITCH_STATUS_SUCCESS ||
262-
switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
262+
switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS ||
263+
switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STARTED) != SWITCH_STATUS_SUCCESS ||
264+
switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STOPPED) != SWITCH_STATUS_SUCCESS) {
263265
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register an event subclass for mod_openai_audio_stream API.\n");
264266
return SWITCH_STATUS_TERM;
265267
}
@@ -285,6 +287,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_audio_stream_shutdown)
285287
switch_event_free_subclass(EVENT_CONNECT);
286288
switch_event_free_subclass(EVENT_DISCONNECT);
287289
switch_event_free_subclass(EVENT_ERROR);
290+
switch_event_free_subclass(EVENT_OPENAI_SPEECH_STARTED);
291+
switch_event_free_subclass(EVENT_OPENAI_SPEECH_STOPPED);
288292

289293
return SWITCH_STATUS_SUCCESS;
290294
}

mod_openai_audio_stream.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010
#define MAX_SESSION_ID (256)
1111
#define MAX_WS_URI (4096)
1212

13-
#define EVENT_CONNECT "mod_openai_audio_stream::connect"
14-
#define EVENT_DISCONNECT "mod_openai_audio_stream::disconnect"
15-
#define EVENT_ERROR "mod_openai_audio_stream::error"
16-
#define EVENT_JSON "mod_openai_audio_stream::json"
17-
#define EVENT_PLAY "mod_openai_audio_stream::play"
13+
#define EVENT_CONNECT "mod_openai_audio_stream::connect"
14+
#define EVENT_DISCONNECT "mod_openai_audio_stream::disconnect"
15+
#define EVENT_ERROR "mod_openai_audio_stream::error"
16+
#define EVENT_JSON "mod_openai_audio_stream::json"
17+
#define EVENT_PLAY "mod_openai_audio_stream::play"
18+
#define EVENT_OPENAI_SPEECH_STARTED "mod_openai_audio_stream::openai_speech_start"
19+
#define EVENT_OPENAI_SPEECH_STOPPED "mod_openai_audio_stream::openai_speech_stop"
1820

1921
typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json);
2022

openai_audio_streamer_glue.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ class AudioStreamer {
298298
} else if(jsType && strcmp(jsType, "response.audio.delta") == 0) {
299299
const char* jsonAudio = cJSON_GetObjectCstr(json, "delta");
300300
playback_clear_requested = false;
301+
m_response_audio_done = false;
301302

302303
if(jsonAudio && strlen(jsonAudio) > 0) {
303304
std::string rawAudio;
@@ -336,7 +337,10 @@ class AudioStreamer {
336337
} else {
337338
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%s) processMessage - response.audio.delta no audio data\n", m_sessionId.c_str());
338339
}
339-
}
340+
} else if(jsType && strcmp(jsType, "response.audio.done") == 0) {
341+
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) processMessage - audio done\n", m_sessionId.c_str());
342+
m_response_audio_done = true;
343+
}
340344
cJSON_Delete(json);
341345
return status;
342346
}
@@ -351,6 +355,7 @@ class AudioStreamer {
351355
void push_audio_queue(const std::vector<int16_t>& audio_data) {
352356
std::lock_guard<std::mutex> lock(m_audio_queue_mutex);
353357
m_audio_queue.push(audio_data);
358+
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) audio queue size: %zu\n", m_sessionId.c_str(), m_audio_queue.size());
354359
}
355360

356361
std::vector<int16_t> pop_audio_queue() {
@@ -429,6 +434,33 @@ class AudioStreamer {
429434
return playback_clear_requested;
430435
}
431436

437+
bool is_openai_speaking() {
438+
return m_openai_speaking;
439+
}
440+
441+
bool is_response_audio_done() {
442+
return m_response_audio_done;
443+
}
444+
445+
void openai_speech_started() {
446+
m_openai_speaking = true;
447+
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
448+
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai started speaking\n", m_sessionId.c_str());
449+
const char *payload = "{\"status\":\"started\"}";
450+
m_notify(psession, EVENT_OPENAI_SPEECH_STARTED, payload);
451+
switch_core_session_rwunlock(psession);
452+
}
453+
454+
void openai_speech_stopped() {
455+
m_openai_speaking = false;
456+
switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
457+
switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai stopped speaking\n", m_sessionId.c_str());
458+
459+
const char *payload = "{\"status\":\"stopped\"}";
460+
m_notify(psession, EVENT_OPENAI_SPEECH_STOPPED, payload);
461+
switch_core_session_rwunlock(psession);
462+
}
463+
432464

433465
private:
434466
std::string m_sessionId;
@@ -446,6 +478,8 @@ class AudioStreamer {
446478
std::mutex m_audio_queue_mutex;
447479
bool playback_clear_requested = false;
448480
bool m_disable_audiofiles = false; // disable saving audio files if true
481+
bool m_openai_speaking = false;
482+
bool m_response_audio_done = false;
449483
};
450484

451485

@@ -983,11 +1017,16 @@ extern "C" {
9831017

9841018
if (as->clear_requested()) {
9851019
switch_buffer_zero(tech_pvt->playback_buffer);
1020+
inuse = 0;
9861021
}
9871022
if (inuse < bytes_needed * 2 && !as->is_audio_queue_empty()) {
9881023
auto chunk = as->pop_audio_queue();
9891024
switch_buffer_write(tech_pvt->playback_buffer, chunk.data(), chunk.size() * sizeof(int16_t));
9901025
} else if (inuse == 0) {
1026+
// OpenAI has just finished speaking, either because it was interrupted or because the response ended
1027+
if(as->is_openai_speaking() && as->is_response_audio_done()) {
1028+
as->openai_speech_stopped();
1029+
}
9911030
return SWITCH_TRUE;
9921031
}
9931032

@@ -999,6 +1038,10 @@ extern "C" {
9991038
switch_buffer_read(tech_pvt->playback_buffer, data, inuse);
10001039
}
10011040

1041+
if (!as->is_openai_speaking()) {
1042+
as->openai_speech_started();
1043+
}
1044+
10021045
frame->datalen = inuse > bytes_needed ? bytes_needed : inuse;
10031046
frame->samples = frame->datalen / bytes_per_sample;
10041047

@@ -1043,4 +1086,3 @@ extern "C" {
10431086
return SWITCH_STATUS_FALSE;
10441087
}
10451088
}
1046-

0 commit comments

Comments
 (0)