add openai speech start and stop events

dariopellegrino00 · dariopellegrino00 · commit e7bb6c4a30f4 · 2025-10-08T12:30:31.000+02:00
diff --git a/mod_openai_audio_stream.c b/mod_openai_audio_stream.c
@@ -259,7 +259,9 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_openai_audio_stream_load)
     if (switch_event_reserve_subclass(EVENT_JSON) != SWITCH_STATUS_SUCCESS ||
         switch_event_reserve_subclass(EVENT_CONNECT) != SWITCH_STATUS_SUCCESS ||
         switch_event_reserve_subclass(EVENT_ERROR) != SWITCH_STATUS_SUCCESS ||
-        switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS) {
+        switch_event_reserve_subclass(EVENT_DISCONNECT) != SWITCH_STATUS_SUCCESS ||
+        switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STARTED) != SWITCH_STATUS_SUCCESS ||
+        switch_event_reserve_subclass(EVENT_OPENAI_SPEECH_STOPPED) != SWITCH_STATUS_SUCCESS) {
         switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register an event subclass for mod_openai_audio_stream API.\n");
         return SWITCH_STATUS_TERM;
     }
@@ -285,6 +287,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_openai_audio_stream_shutdown)
     switch_event_free_subclass(EVENT_CONNECT);
     switch_event_free_subclass(EVENT_DISCONNECT);
     switch_event_free_subclass(EVENT_ERROR);
+    switch_event_free_subclass(EVENT_OPENAI_SPEECH_STARTED);
+    switch_event_free_subclass(EVENT_OPENAI_SPEECH_STOPPED);
 
     return SWITCH_STATUS_SUCCESS;
 }
diff --git a/mod_openai_audio_stream.h b/mod_openai_audio_stream.h
@@ -10,11 +10,13 @@
 #define MAX_SESSION_ID (256)
 #define MAX_WS_URI (4096)
 
-#define EVENT_CONNECT           "mod_openai_audio_stream::connect"
-#define EVENT_DISCONNECT        "mod_openai_audio_stream::disconnect"
-#define EVENT_ERROR             "mod_openai_audio_stream::error"
-#define EVENT_JSON              "mod_openai_audio_stream::json"
-#define EVENT_PLAY              "mod_openai_audio_stream::play"
+#define EVENT_CONNECT                 "mod_openai_audio_stream::connect"
+#define EVENT_DISCONNECT              "mod_openai_audio_stream::disconnect"
+#define EVENT_ERROR                   "mod_openai_audio_stream::error"
+#define EVENT_JSON                    "mod_openai_audio_stream::json"
+#define EVENT_PLAY                    "mod_openai_audio_stream::play"
+#define EVENT_OPENAI_SPEECH_STARTED   "mod_openai_audio_stream::openai_speech_start"
+#define EVENT_OPENAI_SPEECH_STOPPED   "mod_openai_audio_stream::openai_speech_stop"
 
 typedef void (*responseHandler_t)(switch_core_session_t* session, const char* eventName, const char* json);
 
diff --git a/openai_audio_streamer_glue.cpp b/openai_audio_streamer_glue.cpp
@@ -298,6 +298,7 @@ class AudioStreamer {
         } else if(jsType && strcmp(jsType, "response.audio.delta") == 0) {
             const char* jsonAudio = cJSON_GetObjectCstr(json, "delta");
             playback_clear_requested = false;
+            m_response_audio_done = false;
             
             if(jsonAudio && strlen(jsonAudio) > 0) {
                 std::string rawAudio;
@@ -336,7 +337,10 @@ class AudioStreamer {
             } else {
                 switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "(%s) processMessage - response.audio.delta no audio data\n", m_sessionId.c_str());
             }
-        }
+        } else if(jsType && strcmp(jsType, "response.audio.done") == 0) {
+            switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "(%s) processMessage - audio done\n", m_sessionId.c_str());
+            m_response_audio_done = true;
+        } 
         cJSON_Delete(json);
         return status;
     }
@@ -351,6 +355,7 @@ class AudioStreamer {
     void push_audio_queue(const std::vector<int16_t>& audio_data) {
         std::lock_guard<std::mutex> lock(m_audio_queue_mutex);
         m_audio_queue.push(audio_data);
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) audio queue size: %zu\n", m_sessionId.c_str(), m_audio_queue.size());
     }
 
     std::vector<int16_t> pop_audio_queue() {
@@ -429,6 +434,33 @@ class AudioStreamer {
         return playback_clear_requested;
     }
 
+    bool is_openai_speaking() {
+        return m_openai_speaking;
+    }
+
+    bool is_response_audio_done() {
+        return m_response_audio_done;
+    }
+
+    void openai_speech_started() {
+        m_openai_speaking = true;
+        switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai started speaking\n", m_sessionId.c_str());
+        const char *payload = "{\"status\":\"started\"}";
+        m_notify(psession, EVENT_OPENAI_SPEECH_STARTED, payload);
+        switch_core_session_rwunlock(psession);
+     }
+
+    void openai_speech_stopped() {
+        m_openai_speaking = false;
+        switch_core_session_t* psession = switch_core_session_locate(m_sessionId.c_str());
+        switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "(%s) Openai stopped speaking\n", m_sessionId.c_str());
+
+        const char *payload = "{\"status\":\"stopped\"}";
+        m_notify(psession, EVENT_OPENAI_SPEECH_STOPPED, payload);
+        switch_core_session_rwunlock(psession);
+     }
+
 
 private:
     std::string m_sessionId;
@@ -446,6 +478,8 @@ class AudioStreamer {
     std::mutex m_audio_queue_mutex;
     bool playback_clear_requested = false; 
     bool m_disable_audiofiles = false; // disable saving audio files if true
+    bool m_openai_speaking = false;
+    bool m_response_audio_done = false;
 };
 
 
@@ -983,11 +1017,16 @@ extern "C" {
 
         if (as->clear_requested()) {
             switch_buffer_zero(tech_pvt->playback_buffer);
+            inuse = 0;
         }
         if (inuse < bytes_needed * 2 && !as->is_audio_queue_empty()) {
             auto chunk = as->pop_audio_queue();
             switch_buffer_write(tech_pvt->playback_buffer, chunk.data(), chunk.size() * sizeof(int16_t));
         } else if (inuse == 0) {
+            // Openai just finished speaking for interruption or end of response
+            if(as->is_openai_speaking() && as->is_response_audio_done()) { 
+                as->openai_speech_stopped();
+            }
             return SWITCH_TRUE;
         }
 
@@ -999,6 +1038,10 @@ extern "C" {
             switch_buffer_read(tech_pvt->playback_buffer, data, inuse);
         }
 
+        if (!as->is_openai_speaking()) {
+            as->openai_speech_started();
+        }
+
         frame->datalen = inuse > bytes_needed ? bytes_needed : inuse;
         frame->samples = frame->datalen / bytes_per_sample;
 
@@ -1043,4 +1086,3 @@ extern "C" {
         return SWITCH_STATUS_FALSE;
     }
 }
-