From 462dd1bd4bd213aacb2fb8a94c6b3866568bddeb Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 11 Jul 2025 13:41:58 -0700 Subject: [PATCH 01/48] Initial pass --- .../ai/data-plane/VoiceLive/custom.tsp | 164 ++ .../VoiceLive/custom/content_parts.tsp | 36 + .../ai/data-plane/VoiceLive/custom/events.tsp | 57 + .../ai/data-plane/VoiceLive/custom/items.tsp | 109 + .../ai/data-plane/VoiceLive/custom/tools.tsp | 73 + .../ai/data-plane/VoiceLive/main.tsp | 1 + .../ai/data-plane/VoiceLive/models.tsp | 1971 +++++++++++++++++ .../ai/data-plane/VoiceLive/operations.tsp | 45 + .../ai/data-plane/VoiceLive/tspconfig.yaml | 53 + 9 files changed, 2509 insertions(+) create mode 100644 specification/ai/data-plane/VoiceLive/custom.tsp create mode 100644 specification/ai/data-plane/VoiceLive/custom/content_parts.tsp create mode 100644 specification/ai/data-plane/VoiceLive/custom/events.tsp create mode 100644 specification/ai/data-plane/VoiceLive/custom/items.tsp create mode 100644 specification/ai/data-plane/VoiceLive/custom/tools.tsp create mode 100644 specification/ai/data-plane/VoiceLive/main.tsp create mode 100644 specification/ai/data-plane/VoiceLive/models.tsp create mode 100644 specification/ai/data-plane/VoiceLive/operations.tsp create mode 100644 specification/ai/data-plane/VoiceLive/tspconfig.yaml diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp new file mode 100644 index 000000000000..faa8785ccd5b --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -0,0 +1,164 @@ +import "./custom/events.tsp"; +import "./custom/items.tsp"; +import "./custom/tools.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +model VoiceLiveRequestSession { + ...VoiceLiveSessionBase; + modalities?: VoiceLiveModality[]; + instructions?: string; + `model`?: + | "gpt-4o-realtime-preview" + | "gpt-4o-realtime-preview-2024-10-01" + | "gpt-4o-realtime-preview-2024-12-17" + | "gpt-4o-mini-realtime-preview" + | "gpt-4o-mini-realtime-preview-2024-12-17"; + voice?: VoiceIdsShared; + input_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings | null; + turn_detection?: VoiceLiveTurnDetection | null; + input_audio_noise_reduction?: VoiceLiveAudioNoiseReduction; + tools?: VoiceLiveTool[]; + tool_choice?: VoiceLiveToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf"; +} + +model VoiceLiveResponseSession { + ...VoiceLiveSessionBase; + object: "voicelive.session"; + id: string; + `model`: string; + modalities: VoiceLiveModality[]; + instructions: string; + voice: VoiceIdsShared; + input_audio_format: VoiceLiveAudioFormat; + output_audio_format: VoiceLiveAudioFormat; + input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; + turn_detection: VoiceLiveTurnDetection; + input_audio_noise_reduction: VoiceLiveAudioNoiseReduction; + tools: VoiceLiveTool[]; + tool_choice: VoiceLiveToolChoice; + temperature: float32; + max_response_output_tokens: int32 | "inf" | null; +} + +union VoiceLiveAudioFormat { + string, + pcm16: "pcm16", + g711_ulaw: "g711_ulaw", + g711_alaw: "g711_alaw", +} + +union VoiceLiveAudioInputTranscriptionModel { + string, + whisper_1: "whisper-1", +} + +model VoiceLiveAudioInputTranscriptionSettings { + `model`?: VoiceLiveAudioInputTranscriptionModel = VoiceLiveAudioInputTranscriptionModel.whisper_1; + language?: string; + prompt?: 
string; +} + +union VoiceLiveModality { + string, + text: "text", + audio: "audio", +} + +union VoiceLiveTurnDetectionType { + string, + + /** + * Indicates that server-side voice activity detection (VAD) should be enabled, allowing the server to determine when + * add_user_audio commands present ends of speech and should be automatically committed. + * + * The API will also detect when the user begins talking, sending a generation_canceled command. + */ + server_vad: "server_vad", + + semantic_vad: "semantic_vad", +} + +@discriminator("type") +model VoiceLiveTurnDetection { + type: VoiceLiveTurnDetectionType; + + /** + * Whether or not to automatically generate a response when VAD is enabled. true by default. + */ + create_response?: boolean = true; + + /** + * Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. + */ + interrupt_response?: boolean = true; +} + +model VoiceLiveServerVadTurnDetection extends VoiceLiveTurnDetection { + type: VoiceLiveTurnDetectionType.server_vad; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. + */ + threshold?: float32 = 0.5; + + // @encode("milliseconds", int32) + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: duration; // = 300ms + + // @encode("milliseconds", int32) + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. With shorter values the model will respond more quickly, but may jump in on short pauses from the user. + */ + silence_duration_ms?: duration; // = 500ms +} + +model VoiceLiveSemanticVadTurnDetection extends VoiceLiveTurnDetection { + type: VoiceLiveTurnDetectionType.semantic_vad; + + /** + * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. + */ + eagerness?: "low" | "medium" | "high" | "auto" = "auto"; +} + +model VoiceLiveServerEventRateLimitsUpdatedRateLimitsItem { + /** The rate limit property name that this item includes information about. */ + name: string; + + /** The maximum configured limit for this rate limit property. */ + limit: int32; + + /** The remaining quota available against the configured limit for this rate limit property. */ + remaining: int32; + + /** The remaining time, in seconds, until this rate limit property is reset. 
*/ + @encode("seconds", float32) + reset_seconds: duration; +} + +union VoiceLiveAudioNoiseReductionType { + near_field: "near_field", + far_field: "far_field", +} + +@discriminator("type") +model VoiceLiveAudioNoiseReduction { + type: VoiceLiveAudioNoiseReductionType; +} + +model VoiceLiveAudioNearFieldNoiseReduction extends VoiceLiveAudioNoiseReduction { + type: VoiceLiveAudioNoiseReductionType.near_field; +} + +model VoiceLiveAudioFarFieldNoiseReduction extends VoiceLiveAudioNoiseReduction { + type: VoiceLiveAudioNoiseReductionType.far_field; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp new file mode 100644 index 000000000000..258f083003b3 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -0,0 +1,36 @@ +using TypeSpec.OpenAPI; + +namespace OpenAI; + +union VoiceLiveContentPartType { + string, + input_text: "input_text", + input_audio: "input_audio", + text: "text", + audio: "audio", +} + +@discriminator("type") +model VoiceLiveContentPart { + type: VoiceLiveContentPartType; +} + +model VoiceLiveRequestTextContentPart extends VoiceLiveContentPart { + type: VoiceLiveContentPartType.input_text; + text: string; +} + +model VoiceLiveRequestAudioContentPart extends VoiceLiveContentPart { + type: VoiceLiveContentPartType.input_audio; + transcript?: string; +} + +model VoiceLiveResponseTextContentPart extends VoiceLiveContentPart { + type: VoiceLiveContentPartType.text; + text: string; +} + +model VoiceLiveResponseAudioContentPart extends VoiceLiveContentPart { + type: VoiceLiveContentPartType.audio; + transcript: string | null; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp new file mode 100644 index 000000000000..0f7d5d5221f0 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -0,0 +1,57 @@ +using TypeSpec.OpenAPI; + +namespace OpenAI; + +union VoiceLiveClientEventType { + string, + session_update: "session.update", + input_audio_buffer_append: "input_audio_buffer.append", + input_audio_buffer_commit: "input_audio_buffer.commit", + input_audio_buffer_clear: "input_audio_buffer.clear", + output_audio_buffer_clear: "output_audio_buffer.clear", + conversation_item_create: "conversation.item.create", + conversation_item_retrieve: "conversation.item.retrieve", + conversation_item_truncate: "conversation.item.truncate", + conversation_item_delete: "conversation.item.delete", + response_create: "response.create", + response_cancel: "response.cancel", + transcription_session_update: "transcription_session.update", +} + +union VoiceLiveServerEventType { + string, + error: "error", + session_created: "session.created", + session_updated: "session.updated", + conversation_created: "conversation.created", + conversation_item_input_audio_transcription_completed: "conversation.item.input_audio_transcription.completed", + conversation_item_input_audio_transcription_delta: "conversation.item.input_audio_transcription.delta", + conversation_item_input_audio_transcription_failed: "conversation.item.input_audio_transcription.failed", + conversation_item_created: "conversation.item.created", + conversation_item_retrieved: "conversation.item.retrieved", + conversation_item_truncated: "conversation.item.truncated", + conversation_item_deleted: "conversation.item.deleted", + input_audio_buffer_committed: "input_audio_buffer.committed", + input_audio_buffer_cleared: 
"input_audio_buffer.cleared", + input_audio_buffer_speech_started: "input_audio_buffer.speech_started", + input_audio_buffer_speech_stopped: "input_audio_buffer.speech_stopped", + output_audio_buffer_cleared: "output_audio_buffer.cleared", + output_audio_buffer_started: "output_audio_buffer.started", + output_audio_buffer_stopped: "output_audio_buffer.stopped", + response_created: "response.created", + response_done: "response.done", + response_output_item_added: "response.output_item.added", + response_output_item_done: "response.output_item.done", + response_content_part_added: "response.content_part.added", + response_content_part_done: "response.content_part.done", + response_text_delta: "response.text.delta", + response_text_done: "response.text.done", + response_audio_transcript_delta: "response.audio_transcript.delta", + response_audio_transcript_done: "response.audio_transcript.done", + response_audio_delta: "response.audio.delta", + response_audio_done: "response.audio.done", + response_function_call_arguments_delta: "response.function_call_arguments.delta", + response_function_call_arguments_done: "response.function_call_arguments.done", + transcription_session_updated: "transcription_session.updated", + rate_limits_updated: "rate_limits.updated", +} diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp new file mode 100644 index 000000000000..32f6bb432b7d --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -0,0 +1,109 @@ +import "./content_parts.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +@discriminator("type") +model VoiceLiveConversationRequestItem { + ...VoiceLiveConversationItemBase; + type: VoiceLiveItemType; + id?: string; +} + +@discriminator("role") +model VoiceLiveRequestMessageItem extends VoiceLiveConversationRequestItem { + type: VoiceLiveItemType.message; + role: VoiceLiveMessageRole; + status?: VoiceLiveItemStatus; +} + +model VoiceLiveRequestSystemMessageItem extends VoiceLiveRequestMessageItem { + role: VoiceLiveMessageRole.system; + content: VoiceLiveRequestTextContentPart[]; +} + +model VoiceLiveRequestUserMessageItem extends VoiceLiveRequestMessageItem { + role: VoiceLiveMessageRole.user; + content: (VoiceLiveRequestTextContentPart | VoiceLiveRequestAudioContentPart)[]; +} + +model VoiceLiveRequestAssistantMessageItem extends VoiceLiveRequestMessageItem { + role: VoiceLiveMessageRole.assistant; + content: VoiceLiveRequestTextContentPart[]; +} + +model VoiceLiveRequestFunctionCallItem extends VoiceLiveConversationRequestItem { + type: VoiceLiveItemType.function_call; + name: string; + call_id: string; + arguments: string; + status?: VoiceLiveItemStatus; +} + +model VoiceLiveRequestFunctionCallOutputItem + extends VoiceLiveConversationRequestItem { + type: VoiceLiveItemType.function_call_output; + call_id: string; + output: string; +} + +// TODO: representation of a doubly-discriminated type with an absent second discriminator +// (first discriminator: type = message; second discriminator: no role present) + +model VoiceLiveRequestMessageReferenceItem { // extends VoiceLiveConversationRequestItem { + type: VoiceLiveItemType.message; + id: string; +} + +@discriminator("type") +model VoiceLiveConversationResponseItem { + ...VoiceLiveConversationItemBase; + object: "voicelive.item"; + type: VoiceLiveItemType; + id: string | null; +} + +model VoiceLiveResponseMessageItem extends VoiceLiveConversationResponseItem { + type: VoiceLiveItemType.message; + role: 
VoiceLiveMessageRole; + content: VoiceLiveContentPart[]; + status: VoiceLiveItemStatus; +} + +model VoiceLiveResponseFunctionCallItem + extends VoiceLiveConversationResponseItem { + type: VoiceLiveItemType.function_call; + name: string; + call_id: string; + arguments: string; + status: VoiceLiveItemStatus; +} + +model VoiceLiveResponseFunctionCallOutputItem + extends VoiceLiveConversationResponseItem { + type: VoiceLiveItemType.function_call_output; + call_id: string; + output: string; +} + +union VoiceLiveItemType { + string, + message: "message", + function_call: "function_call", + function_call_output: "function_call_output", +} + +union VoiceLiveItemStatus { + string, + in_progress: "in_progress", + completed: "completed", + incomplete: "incomplete", +} + +union VoiceLiveMessageRole { + string, + system: "system", + user: "user", + assistant: "assistant", +} diff --git a/specification/ai/data-plane/VoiceLive/custom/tools.tsp b/specification/ai/data-plane/VoiceLive/custom/tools.tsp new file mode 100644 index 000000000000..aa592aaf0c06 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/tools.tsp @@ -0,0 +1,73 @@ +using TypeSpec.OpenAPI; + +namespace OpenAI; + +/** + * The supported tool type discriminators for voicelive tools. + * Currently, only 'function' tools are supported. + */ +union VoiceLiveToolType { + string, + function: "function", +} + +/** + * The base representation of a voicelive tool definition. + */ +@discriminator("type") +model VoiceLiveTool { + type: VoiceLiveToolType; +} + +/** + * The definition of a function tool as used by the voicelive endpoint. + */ +model VoiceLiveFunctionTool extends VoiceLiveTool { + type: VoiceLiveToolType.function; + name: string; + description?: string; + parameters?: unknown; +} + +/** + * The combined set of available representations for a voicelive tool_choice parameter, encompassing both string + * literal options like 'auto' as well as structured references to defined tools. + */ +union VoiceLiveToolChoice { + VoiceLiveToolChoiceLiteral, + VoiceLiveToolChoiceObject, +} + +/** + * The available set of mode-level, string literal tool_choice options for the voicelive endpoint. + */ +union VoiceLiveToolChoiceLiteral { + string, + + /** Specifies that the model should freely determine which tool or tools, if any, to call. */ + auto: "auto", + + /** Specifies that the model should call no tools whatsoever. */ + none: "none", + + /** Specifies that the model should call at least one tool. */ + required: "required", +} + +/** + * A base representation for a voicelive tool_choice selecting a named tool. + */ +@discriminator("type") +model VoiceLiveToolChoiceObject { + type: VoiceLiveToolType; +} + +/** + * The representation of a voicelive tool_choice selecting a named function tool. + */ +model VoiceLiveToolChoiceFunctionObject extends VoiceLiveToolChoiceObject { + type: VoiceLiveToolType.function; + function: { + name: string; + }; +} diff --git a/specification/ai/data-plane/VoiceLive/main.tsp b/specification/ai/data-plane/VoiceLive/main.tsp new file mode 100644 index 000000000000..144c4aeaff10 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/main.tsp @@ -0,0 +1 @@ +import "./operations.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp new file mode 100644 index 000000000000..a4fe5f78b487 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -0,0 +1,1971 @@ +/* + * This file was automatically generated from an OpenAPI .yaml file. 
+ * Edits made directly to this file will be lost. + */ + +import "../audio"; +import "../common"; +import "./custom.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +// Tool generated type. Extracts from VoiceLiveConversationItemWithReference.content +alias VoiceLiveConversationItemWithReferenceContent = { + @doc(""" + The content type (`input_text`, `input_audio`, `item_reference`, `text`). + """) + type?: "input_audio" | "input_text" | "item_reference" | "text"; + + @doc(""" + The text content, used for `input_text` and `text` content types. + """) + text?: string; + + @doc(""" + ID of a previous conversation item to reference (for `item_reference` + content types in `response.create` events). These can reference both + client and server created items. + """) + id?: string; + + @doc(""" + Base64-encoded audio bytes, used for `input_audio` content type. + """) + audio?: string; + + @doc(""" + The transcript of the audio, used for `input_audio` content type. + """) + transcript?: string; +}; + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive client event. */ +@discriminator("type") +model VoiceLiveClientEvent { + /** The type of event. */ + type: VoiceLiveClientEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to update the session’s default configuration. + The client may send this event at any time to update any field, + except for `voice`. However, note that once a session has been + initialized with a particular `model`, it can’t be changed to + another model using `session.update`. + + When the server receives a `session.update`, it will respond + with a `session.updated` event showing the full, effective configuration. + Only the fields that are present are updated. To clear a field like + `instructions`, pass an empty string. + """) +model VoiceLiveClientEventSessionUpdate extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `session.update`. + """) + type: VoiceLiveClientEventType.session_update; + + // Tool customization: apply enriched request-specific model + session: VoiceLiveRequestSession; +} + +// Tool customization: establish custom, enriched discriminated type hierarchy +/** The item to add to the conversation. */ +model VoiceLiveConversationItemBase { + /** Customized to enriched VoiceLiveConversation{Request,Response}Item models */ +} + +/** The response resource. */ +model VoiceLiveResponse { + /** The unique ID of the response. */ + id?: string; + + @doc(""" + The object type, must be `voicelive.response`. + """) + object?: "voicelive.response"; + + @doc(""" + The final status of the response (`completed`, `cancelled`, `failed`, or + `incomplete`). + """) + status?: "completed" | "cancelled" | "failed" | "incomplete"; + + /** Additional details about the status. */ + status_details?: { + @doc(""" + The type of error that caused the response to fail, corresponding + with the `status` field (`completed`, `cancelled`, `incomplete`, + `failed`). + """) + type?: "completed" | "cancelled" | "failed" | "incomplete"; + + @doc(""" + The reason the Response did not complete. For a `cancelled` Response, + one of `turn_detected` (the server VAD detected a new start of speech) + or `client_cancelled` (the client sent a cancel event). For an + `incomplete` Response, one of `max_output_tokens` or `content_filter` + (the server-side safety filter activated and cut off the response). 
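+        For example (illustrative): a response that is cut off because server VAD detected new
+        user speech would be reported with `status` of `cancelled` and `reason` of `turn_detected`.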
+ """) + reason?: + | "turn_detected" + | "client_cancelled" + | "max_output_tokens" + | "content_filter"; + + @doc(""" + A description of the error that caused the response to fail, + populated when the `status` is `failed`. + """) + error?: { + /** The type of error. */ + type?: string; + + /** Error code, if any. */ + code?: string; + }; + }; + + // Tool customization: apply enriched response-specific type + /** The list of output items generated by the response. */ + output?: VoiceLiveConversationResponseItem[]; + + ...MetadataPropertyForResponse; + + /** + * Usage statistics for the Response, this will correspond to billing. A + * VoiceLive API session will maintain a conversation context and append new + * Items to the Conversation, thus output from previous turns (text and + * audio tokens) will become the input for later turns. + */ + usage?: { + /** + * The total number of tokens in the Response including input and output + * text and audio tokens. + */ + total_tokens?: int32; + + /** + * The number of input tokens used in the Response, including text and + * audio tokens. + */ + input_tokens?: int32; + + /** + * The number of output tokens sent in the Response, including text and + * audio tokens. + */ + output_tokens?: int32; + + /** Details about the input tokens used in the Response. */ + input_token_details?: { + /** The number of cached tokens used in the Response. */ + cached_tokens?: int32; + + /** The number of text tokens used in the Response. */ + text_tokens?: int32; + + /** The number of audio tokens used in the Response. */ + audio_tokens?: int32; + }; + + /** Details about the output tokens used in the Response. */ + output_token_details?: { + /** The number of text tokens used in the Response. */ + text_tokens?: int32; + + /** The number of audio tokens used in the Response. */ + audio_tokens?: int32; + }; + }; + + @doc(""" + Which conversation the response is added to, determined by the `conversation` + field in the `response.create` event. If `auto`, the response will be added to + the default conversation and the value of `conversation_id` will be an id like + `conv_1234`. If `none`, the response will not be added to any conversation and + the value of `conversation_id` will be `null`. If responses are being triggered + by server VAD, the response will be added to the default conversation, thus + the `conversation_id` will be an id like `conv_1234`. + """) + conversation_id?: string; + + @doc(""" + The voice the model used to respond. + Current voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, + `onyx`, `nova`, `sage`, `shimmer`, and `verse`. + """) + voice?: VoiceIdsShared; + + @doc(""" + The set of modalities the model used to respond. If there are multiple modalities, + the model will pick one, for example if `modalities` is `["text", "audio"]`, the model + could be responding in either text or audio. + """) + modalities?: ("text" | "audio")[]; + + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + output_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw"; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + /** + * Maximum number of output tokens for a single assistant response, + * inclusive of tool calls, that was used in this response. 
+ */ + max_output_tokens?: int32 | "inf"; +} + +// Tool customization (apply_discriminator): apply discriminated type base +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit. In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide + * when to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum + * of 15 MiB, for example streaming smaller chunks from the client may allow the + * VAD to be more responsive. Unlike made other client events, the server will + * not send a confirmation response to this event. + */ +model VoiceLiveClientEventInputAudioBufferAppend extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.append`. + """) + type: VoiceLiveClientEventType.input_audio_buffer_append; + + // Tool customization: use encoded type for audio data + @doc(""" + Base64-encoded audio bytes. This must be in the format specified by the + `input_audio_format` field in the session configuration. + """) + @encode("base64") + audio: bytes; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to commit the user input audio buffer, which will create a + new user message item in the conversation. This event will produce an error + if the input audio buffer is empty. When in Server VAD mode, the client does + not need to send this event, the server will commit the audio buffer + automatically. + + Committing the input audio buffer will trigger input audio transcription + (if enabled in session configuration), but it will not create a response + from the model. The server will respond with an `input_audio_buffer.committed` + event. + """) +model VoiceLiveClientEventInputAudioBufferCommit extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.commit`. + """) + type: VoiceLiveClientEventType.input_audio_buffer_commit; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to clear the audio bytes in the buffer. The server will + respond with an `input_audio_buffer.cleared` event. + """) +model VoiceLiveClientEventInputAudioBufferClear extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.clear`. + """) + type: VoiceLiveClientEventType.input_audio_buffer_clear; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + **WebRTC Only:** Emit to cut off the current audio response. This will trigger the server to + stop generating audio and emit a `output_audio_buffer.cleared` event. This + event should be preceded by a `response.cancel` client event to stop the + generation of the current response. + [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). + """) +model VoiceLiveClientEventOutputAudioBufferClear extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `output_audio_buffer.clear`. + """) + type: VoiceLiveClientEventType.output_audio_buffer_clear; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Add a new Item to the Conversation's context, including messages, function + calls, and function call responses. 
This event can be used both to populate a + "history" of the conversation and to add new items mid-stream, but has the + current limitation that it cannot populate assistant audio messages. + + If successful, the server will respond with a `conversation.item.created` + event, otherwise an `error` event will be sent. + """) +model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `conversation.item.create`. + """) + type: VoiceLiveClientEventType.conversation_item_create; + + @doc(""" + The ID of the preceding item after which the new item will be inserted. + If not set, the new item will be appended to the end of the conversation. + If set to `root`, the new item will be added to the beginning of the conversation. + If set to an existing ID, it allows an item to be inserted mid-conversation. If the + ID cannot be found, an error will be returned and the item will not be added. + """) + previous_item_id?: string; + + // Tool customization: apply enriched item definition hierarchy + item: VoiceLiveConversationRequestItem; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to truncate a previous assistant message’s audio. The server + will produce audio faster than voicelive, so this event is useful when the user + interrupts to truncate audio that has already been sent to the client but not + yet played. This will synchronize the server's understanding of the audio with + the client's playback. + + Truncating audio will delete the server-side text transcript to ensure there + is not text in the context that hasn't been heard by the user. + + If successful, the server will respond with a `conversation.item.truncated` + event. + """) +model VoiceLiveClientEventConversationItemTruncate extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `conversation.item.truncate`. + """) + type: VoiceLiveClientEventType.conversation_item_truncate; + + /** + * The ID of the assistant message item to truncate. Only assistant message + * items can be truncated. + */ + item_id: string; + + /** The index of the content part to truncate. Set this to 0. */ + content_index: int32; + + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If + * the audio_end_ms is greater than the actual audio duration, the server + * will respond with an error. + */ + audio_end_ms: int32; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to remove any item from the conversation + history. The server will respond with a `conversation.item.deleted` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model VoiceLiveClientEventConversationItemDelete extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `conversation.item.delete`. + """) + type: VoiceLiveClientEventType.conversation_item_delete; + + /** The ID of the item to delete. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + This event instructs the server to create a Response, which means triggering + model inference. When in Server VAD mode, the server will create Responses + automatically. + + A Response will include at least one Item, and may have two, in which case + the second will be a function call. These Items will be appended to the + conversation history. 
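+    For example (illustrative), `{"type": "response.create", "response": {"modalities": ["text"], "instructions": "Answer briefly."}}`
+    requests a single text-only response with per-response instructions.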
+ + The server will respond with a `response.created` event, events for Items + and content created, and finally a `response.done` event to indicate the + Response is complete. + + The `response.create` event includes inference configuration like + `instructions`, and `temperature`. These fields will override the Session's + configuration for this Response only. + """) +model VoiceLiveClientEventResponseCreate extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `response.create`. + """) + type: VoiceLiveClientEventType.response_create; + + response?: VoiceLiveResponseCreateParams; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to cancel an in-progress response. The server will respond + with a `response.cancelled` event or an error if there is no response to + cancel. + """) +model VoiceLiveClientEventResponseCancel extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `response.cancel`. + """) + type: VoiceLiveClientEventType.response_cancel; + + /** + * A specific response ID to cancel - if not provided, will cancel an + * in-progress response in the default conversation. + */ + response_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open, we + * recommend to implementors to monitor and log error messages by default. + */ +model VoiceLiveServerEventError extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `error`. + """) + type: VoiceLiveServerEventType.error; + + /** Details of the error. */ + error: { + /** The type of error (e.g., "invalid_request_error", "server_error"). */ + type: string; + + /** Error code, if any. */ + code?: string | null; + + /** A human-readable error message. */ + message: string; + + /** Parameter related to the error, if any. */ + param?: string | null; + + /** The event_id of the client event that caused the error, if applicable. */ + event_id?: string | null; + }; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a Session is created. Emitted automatically when a new + * connection is established as the first server event. This event will contain + * the default Session configuration. + */ +model VoiceLiveServerEventSessionCreated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `session.created`. + """) + type: VoiceLiveServerEventType.session_created; + + // Tool customization: apply enriched response-specific model + session: VoiceLiveResponseSession; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a session is updated with a `session.update` event, unless + there is an error. + """) +model VoiceLiveServerEventSessionUpdated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `session.updated`. + """) + type: VoiceLiveServerEventType.session_updated; + + // Tool customization: apply enriched response-specific model + session: VoiceLiveResponseSession; +} + +// Tool customization: establish base for enriched request/response split models +/** VoiceLive session object configuration. */ +model VoiceLiveSessionBase {} + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive server event. */ +@discriminator("type") +model VoiceLiveServerEvent { + /** The type of event. 
*/ + type: VoiceLiveServerEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when a conversation is created. Emitted right after session creation. */ +model VoiceLiveServerEventConversationCreated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.created`. + """) + type: VoiceLiveServerEventType.conversation_created; + + /** The conversation resource. */ + conversation: { + /** The unique ID of the conversation. */ + id?: string; + + @doc(""" + The object type, must be `voicelive.conversation`. + """) + object?: string; + }; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an input audio buffer is committed, either by the client or + automatically in server VAD mode. The `item_id` property is the ID of the user + message item that will be created, thus a `conversation.item.created` event + will also be sent to the client. + """) +model VoiceLiveServerEventInputAudioBufferCommitted extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.committed`. + """) + type: VoiceLiveServerEventType.input_audio_buffer_committed; + + /** The ID of the preceding item after which the new item will be inserted. */ + previous_item_id: string; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when the input audio buffer is cleared by the client with a + `input_audio_buffer.clear` event. + """) +model VoiceLiveServerEventInputAudioBufferCleared extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.cleared`. + """) + type: VoiceLiveServerEventType.input_audio_buffer_cleared; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Sent by the server when in `server_vad` mode to indicate that speech has been + detected in the audio buffer. This can happen any time audio is added to the + buffer (unless speech is already detected). The client may want to use this + event to interrupt audio playback or provide visual feedback to the user. + + The client should expect to receive a `input_audio_buffer.speech_stopped` event + when speech stops. The `item_id` property is the ID of the user message item + that will be created when speech stops and will also be included in the + `input_audio_buffer.speech_stopped` event (unless the client manually commits + the audio buffer during VAD activation). + """) +model VoiceLiveServerEventInputAudioBufferSpeechStarted + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_started`. + """) + type: VoiceLiveServerEventType.input_audio_buffer_speech_started; + + @doc(""" + Milliseconds from the start of all audio written to the buffer during the + session when speech was first detected. This will correspond to the + beginning of audio sent to the model, and thus includes the + `prefix_padding_ms` configured in the Session. + """) + audio_start_ms: int32; + + /** The ID of the user message item that will be created when speech stops. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned in `server_vad` mode when the server detects the end of speech in + the audio buffer. 
The server will also send an `conversation.item.created` + event with the user message item that is created from the audio buffer. + """) +model VoiceLiveServerEventInputAudioBufferSpeechStopped + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_stopped`. + """) + type: VoiceLiveServerEventType.input_audio_buffer_speech_stopped; + + @doc(""" + Milliseconds since the session started when speech stopped. This will + correspond to the end of audio sent to the model, and thus includes the + `min_silence_duration_ms` configured in the Session. + """) + audio_end_ms: int32; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + **WebRTC Only:** Emitted when the output audio buffer is cleared. This happens either in VAD + mode when the user has interrupted (`input_audio_buffer.speech_started`), + or when the client has emitted the `output_audio_buffer.clear` event to manually + cut off the current audio response. + [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). + """) +model VoiceLiveServerEventOutputAudioBufferCleared extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `output_audio_buffer.cleared`. + """) + type: VoiceLiveServerEventType.output_audio_buffer_cleared; + + /** The unique ID of the response that produced the audio. */ + response_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + **WebRTC Only:** Emitted when the server begins streaming audio to the client. This event is + emitted after an audio content part has been added (`response.content_part.added`) + to the response. + [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). + """) +model VoiceLiveServerEventOutputAudioBufferStarted extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `output_audio_buffer.started`. + """) + type: VoiceLiveServerEventType.output_audio_buffer_started; + + /** The unique ID of the response that produced the audio. */ + response_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + **WebRTC Only:** Emitted when the output audio buffer has been completely drained on the server, + and no more audio is forthcoming. This event is emitted after the full response + data has been sent to the client (`response.done`). + [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). + """) +model VoiceLiveServerEventOutputAudioBufferStopped extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `output_audio_buffer.stopped`. + """) + type: VoiceLiveServerEventType.output_audio_buffer_stopped; + + /** The unique ID of the response that produced the audio. */ + response_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is created. There are several scenarios that produce this event: + - The server is generating a Response, which if successful will produce + either one or two Items, which will be of type `message` + (role `assistant`) or type `function_call`. + - The input audio buffer has been committed, either by the client or the + server (in `server_vad` mode). The server will take the content of the + input audio buffer and add it to a new user message Item. 
+ - The client has sent a `conversation.item.create` event to add a new Item + to the Conversation. + """) +model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.item.created`. + """) + type: VoiceLiveServerEventType.conversation_item_created; + + /** + * The ID of the preceding item in the Conversation context, allows the + * client to understand the order of the conversation. + */ + previous_item_id: string; + + // Tool customization: apply enriched item definition hierarchy + item: VoiceLiveConversationResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + This event is the output of audio transcription for user audio written to the + user audio buffer. Transcription begins when the input audio buffer is + committed by the client or server (in `server_vad` mode). Transcription runs + asynchronously with Response creation, so this event may come before or after + the Response events. + + VoiceLive API models accept audio natively, and thus input transcription is a + separate process run on a separate ASR (Automatic Speech Recognition) model. + The transcript may diverge somewhat from the model's interpretation, and + should be treated as a rough guide. + """) +model VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.completed`. + """) + type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_completed; + + /** The ID of the user message item containing the audio. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** The transcribed text. */ + transcript: string; + + /** The log probabilities of the transcription. */ + logprobs?: LogProbProperties[] | null; + + // Tool customization: Substitute common discriminated type base + /** Usage statistics for the transcription. */ + usage: TranscriptTextUsage; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when input audio transcription is configured, and a transcription + request for a user message failed. These events are separate from other + `error` events so that the client can identify the related Item. + """) +model VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.failed`. + """) + type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_failed; + + /** The ID of the user message item. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** Details of the transcription error. */ + error: { + /** The type of error. */ + type?: string; + + /** Error code, if any. */ + code?: string; + + /** A human-readable error message. */ + message?: string; + + /** Parameter related to the error, if any. */ + param?: string; + }; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an earlier assistant audio message item is truncated by the + client with a `conversation.item.truncate` event. This event is used to + synchronize the server's understanding of the audio with the client's playback. 
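+    For example (values illustrative), if the client interrupts after playing only 1500 ms of a
+    longer assistant audio message, it can send `conversation.item.truncate` with `audio_end_ms` of
+    1500, and the server will acknowledge with this event.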
+ + This action will truncate the audio and remove the server-side text transcript + to ensure there is no text in the context that hasn't been heard by the user. + """) +model VoiceLiveServerEventConversationItemTruncated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.item.truncated`. + """) + type: VoiceLiveServerEventType.conversation_item_truncated; + + /** The ID of the assistant message item that was truncated. */ + item_id: string; + + /** The index of the content part that was truncated. */ + content_index: int32; + + /** The duration up to which the audio was truncated, in milliseconds. */ + audio_end_ms: int32; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an item in the conversation is deleted by the client with a + `conversation.item.delete` event. This event is used to synchronize the + server's understanding of the conversation history with the client's view. + """) +model VoiceLiveServerEventConversationItemDeleted extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.item.deleted`. + """) + type: VoiceLiveServerEventType.conversation_item_deleted; + + /** The ID of the item that was deleted. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a new Response is created. The first event of response creation, + where the response is in an initial state of `in_progress`. + """) +model VoiceLiveServerEventResponseCreated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.created`. + """) + type: VoiceLiveServerEventType.response_created; + + response: VoiceLiveResponse; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a Response is done streaming. Always emitted, no matter the + final state. The Response object included in the `response.done` event will + include all output Items in the Response but will omit the raw audio data. + """) +model VoiceLiveServerEventResponseDone extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.done`. + """) + type: VoiceLiveServerEventType.response_done; + + response: VoiceLiveResponse; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when a new Item is created during Response generation. */ +model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.output_item.added`. + """) + type: VoiceLiveServerEventType.response_output_item_added; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. */ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item: VoiceLiveConversationResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +model VoiceLiveServerEventResponseOutputItemDone extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.output_item.done`. + """) + type: VoiceLiveServerEventType.response_output_item_done; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. 
*/ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item: VoiceLiveConversationResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +model VoiceLiveServerEventResponseContentPartAdded extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.content_part.added`. + """) + type: VoiceLiveServerEventType.response_content_part_added; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item to which the content part was added. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that was added. */ + part: VoiceLiveContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model VoiceLiveServerEventResponseContentPartDone extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.content_part.done`. + """) + type: VoiceLiveServerEventType.response_content_part_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that is done. */ + part: VoiceLiveContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of a "text" content part is updated. */ +model VoiceLiveServerEventResponseTextDelta extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.text.delta`. + """) + type: VoiceLiveServerEventType.response_text_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The text delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the text value of a "text" content part is done streaming. Also + * emitted when a Response is interrupted, incomplete, or cancelled. + */ +model VoiceLiveServerEventResponseTextDone extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.text.done`. + """) + type: VoiceLiveServerEventType.response_text_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final text content. */ + text: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated transcription of audio output is updated. 
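+ Each event carries an incremental `delta`; concatenating the deltas in order yields the full
+ transcript reported by the corresponding `response.audio_transcript.done` event.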
*/ +model VoiceLiveServerEventResponseAudioTranscriptDelta + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.delta`. + """) + type: VoiceLiveServerEventType.response_audio_transcript_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The transcript delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated transcription of audio output is done + * streaming. Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +model VoiceLiveServerEventResponseAudioTranscriptDone + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.done`. + """) + type: VoiceLiveServerEventType.response_audio_transcript_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final transcript of the audio. */ + transcript: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated audio is updated. */ +model VoiceLiveServerEventResponseAudioDelta extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.audio.delta`. + """) + type: VoiceLiveServerEventType.response_audio_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: use encoded type for audio data + /** Base64-encoded audio data delta. */ + @encode("base64") + delta: bytes; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated audio is done. Also emitted when a Response + * is interrupted, incomplete, or cancelled. + */ +model VoiceLiveServerEventResponseAudioDone extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.audio.done`. + """) + type: VoiceLiveServerEventType.response_audio_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated function call arguments are updated. */ +model VoiceLiveServerEventResponseFunctionCallArgumentsDelta + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.delta`. + """) + type: VoiceLiveServerEventType.response_function_call_arguments_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. 
*/ + call_id: string; + + /** The arguments delta as a JSON string. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model VoiceLiveServerEventResponseFunctionCallArgumentsDone + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.done`. + """) + type: VoiceLiveServerEventType.response_function_call_arguments_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. */ + call_id: string; + + /** The final arguments as a JSON string. */ + arguments: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Emitted at the beginning of a Response to indicate the updated rate limits. + * When a Response is created some tokens will be "reserved" for the output + * tokens, the rate limits shown here reflect that reservation, which is then + * adjusted accordingly once the Response is completed. + */ +model VoiceLiveServerEventRateLimitsUpdated extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `rate_limits.updated`. + """) + type: VoiceLiveServerEventType.rate_limits_updated; + + // Tool customization: use custom type for rate limit items (applying encoded duration) + /** List of rate limit information. */ + rate_limits: VoiceLiveServerEventRateLimitsUpdatedRateLimitsItem[]; +} + +/** Create a new VoiceLive response with these parameters */ +model VoiceLiveResponseCreateParams { + // Tool customization: Apply reusable modality representation + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: VoiceLiveModality[]; + + @doc(""" + The default system instructions (i.e. system message) prepended to model + calls. This field allows the client to guide the model on desired + responses. The model can be instructed on response content and format, + (e.g. "be extremely succinct", "act friendly", "here are examples of good + responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed + to be followed by the model, but they provide guidance to the model on the + desired behavior. + + Note that the server sets default instructions which will be used if this + field is not set and are visible in the `session.created` event at the + start of the session. + """) + instructions?: string; + + @doc(""" + The voice the model uses to respond. Voice cannot be changed during the + session once the model has responded with audio at least once. Current + voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, + `onyx`, `nova`, `sage`, `shimmer`, and `verse`. + """) + voice?: VoiceIdsShared; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + + // Tool customization: use enriched tool definition + /** Tools (functions) available to the model. */ + tools?: VoiceLiveTool[]; + + @doc(""" + How the model chooses tools. 
Options are `auto`, `none`, `required`, or + specify a function, like `{"type": "function", "function": {"name": "my_function"}}`. + """) + tool_choice?: string; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + // Tool customization: Address (observed as of 2025-01-31) spec issue with 'max_response_output_tokens' + @doc(""" + Maximum number of output tokens for a single assistant response, + inclusive of tool calls. Provide an integer between 1 and 4096 to + limit output tokens, or `inf` for the maximum available tokens for a + given model. Defaults to `inf`. + """) + max_output_tokens?: int32 | "inf"; + + @doc(""" + Controls which conversation the response is added to. Currently supports + `auto` and `none`, with `auto` as the default value. The `auto` value + means that the contents of the response will be added to the default + conversation. Set this to `none` to create an out-of-band response which + will not add items to default conversation. + """) + conversation?: string | "auto" | "none"; + + ...MetadataPropertyForRequest; + + // Tool customization: apply a customized, more specific discriminated type hierarchy + @doc(""" + Input items to include in the prompt for the model. Using this field + creates a new context for this Response instead of using the default + conversation. An empty array `[]` will clear the context for this Response. + Note that this can include references to items from the default conversation. + """) + input?: VoiceLiveConversationRequestItem[]; +} + +/** VoiceLive session object configuration. */ +model VoiceLiveSessionCreateRequest { + // Tool customization: Apply reusable modality representation + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: VoiceLiveModality[]; + + /** The VoiceLive model used for this session. */ + `model`?: + | "gpt-4o-realtime-preview" + | "gpt-4o-realtime-preview-2024-10-01" + | "gpt-4o-realtime-preview-2024-12-17" + | "gpt-4o-realtime-preview-2025-06-03" + | "gpt-4o-mini-realtime-preview" + | "gpt-4o-mini-realtime-preview-2024-12-17"; + + @doc(""" + The default system instructions (i.e. system message) prepended to model calls. This field allows the client to guide the model on desired responses. The model can be instructed on response content and format, (e.g. "be extremely succinct", "act friendly", "here are examples of good responses") and on audio behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. + + Note that the server sets default instructions which will be used if this field is not set and are visible in the `session.created` event at the start of the session. + """) + instructions?: string; + + @doc(""" + The voice the model uses to respond. Voice cannot be changed during the + session once the model has responded with audio at least once. Current + voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, + `onyx`, `nova`, `sage`, `shimmer`, and `verse`. + """) + voice?: VoiceIdsShared; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + For `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, + single channel (mono), and little-endian byte order. 
+ """) + input_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + For `pcm16`, output audio is sampled at a rate of 24kHz. + """) + output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + + @doc(""" + Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription; these offer additional guidance to the transcription service. + """) + input_audio_transcription?: { + @doc(""" + The model to use for transcription, current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`. + """) + `model`?: string; + + @doc(""" + The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format + will improve accuracy and latency. + """) + language?: string; + + @doc(""" + An optional text to guide the model's style or continue a previous audio + segment. + For `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology". + """) + prompt?: string; + }; + + @doc(""" + Configuration for turn detection, either Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response. + Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency. + """) + turn_detection?: { + /** Type of turn detection. */ + type?: "server_vad" | "semantic_vad" = "server_vad"; + + @doc(""" + Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. + """) + eagerness?: "low" | "medium" | "high" | "auto" = "auto"; + + @doc(""" + Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A + higher threshold will require louder audio to activate the model, and + thus might perform better in noisy environments. + """) + threshold?: float32; + + @doc(""" + Used only for `server_vad` mode. Amount of audio to include before the VAD detected speech (in + milliseconds). Defaults to 300ms. + """) + prefix_padding_ms?: int32; + + @doc(""" + Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). Defaults + to 500ms.
With shorter values the model will respond more quickly, + but may jump in on short pauses from the user. + """) + silence_duration_ms?: int32; + + /** Whether or not to automatically generate a response when a VAD stop event occurs. */ + create_response?: boolean = true; + + @doc(""" + Whether or not to automatically interrupt any ongoing response with output to the default + conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. + """) + interrupt_response?: boolean = true; + }; + + @doc(""" + Configuration for input audio noise reduction. This can be set to `null` to turn off. + Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. + Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. + """) + input_audio_noise_reduction?: { + @doc(""" + Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones. + """) + type?: "near_field" | "far_field"; + } | null = null; + + /** + * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is + * the minimum speed. 1.5 is the maximum speed. This value can only be changed + * in between model turns, not while a response is in progress. + */ + @minValue(0.25) + @maxValue(1.5) + speed?: float32 = 1; + + @doc(""" + Configuration options for tracing. Set to null to disable tracing. Once + tracing is enabled for a session, the configuration cannot be modified. + + `auto` will create a trace for the session with default values for the + workflow name, group id, and metadata. + """) + tracing?: "auto" | { + /** + * The name of the workflow to attach to this trace. This is used to + * name the trace in the traces dashboard. + */ + workflow_name?: string; + + /** + * The group id to attach to this trace to enable filtering and + * grouping in the traces dashboard. + */ + group_id?: string; + + /** + * The arbitrary metadata to attach to this trace to enable + * filtering in the traces dashboard. + */ + metadata?: unknown; + }; + + // Tool customization: use enriched tool definition + /** Tools (functions) available to the model. */ + tools?: VoiceLiveTool[]; + + @doc(""" + How the model chooses tools. Options are `auto`, `none`, `required`, or + specify a function. + """) + tool_choice?: string = "auto"; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a temperature of 0.8 is highly recommended for best performance. */ + temperature?: float32 = 0.8; + + @doc(""" + Maximum number of output tokens for a single assistant response, + inclusive of tool calls. Provide an integer between 1 and 4096 to + limit output tokens, or `inf` for the maximum available tokens for a + given model. Defaults to `inf`. + """) + max_response_output_tokens?: int32 | "inf"; + + /** Configuration options for the generated client secret. */ + client_secret?: { + /** Configuration for the ephemeral token expiration. */ + expires_after?: { + @doc(""" + The anchor point for the ephemeral token expiration. Only `created_at` is currently supported. + """) + anchor: "created_at"; + + @doc(""" + The number of seconds from the anchor point to the expiration. Select a value between `10` and `7200`. + """) + seconds?: int32 = 600; + }; + }; +} + +/** + * A new VoiceLive session configuration, with an ephemeral key. Default TTL + * for keys is one minute.
+ */ +model VoiceLiveSessionCreateResponse { + /** Ephemeral key returned by the API. */ + client_secret: { + /** + * Ephemeral key usable in client environments to authenticate connections + * to the VoiceLive API. Use this in client-side environments rather than + * a standard API token, which should only be used server-side. + */ + value: string; + + // Tool customization: 'created' and fields ending in '_at' are Unix encoded utcDateTime + /** + * Timestamp for when the token expires. Currently, all tokens expire + * after one minute. + */ + @encode("unixTimestamp", int32) + expires_at: utcDateTime; + }; + + // Tool customization: Apply reusable modality representation + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: VoiceLiveModality[]; + + @doc(""" + The default system instructions (i.e. system message) prepended to model + calls. This field allows the client to guide the model on desired + responses. The model can be instructed on response content and format, + (e.g. "be extremely succinct", "act friendly", "here are examples of good + responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed + to be followed by the model, but they provide guidance to the model on the + desired behavior. + + Note that the server sets default instructions which will be used if this + field is not set and are visible in the `session.created` event at the + start of the session. + """) + instructions?: string; + + @doc(""" + The voice the model uses to respond. Voice cannot be changed during the + session once the model has responded with audio at least once. Current + voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, + `shimmer`, and `verse`. + """) + voice?: VoiceIdsShared; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + input_audio_format?: VoiceLiveAudioFormat; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + output_audio_format?: VoiceLiveAudioFormat; + + @doc(""" + Configuration for input audio transcription, defaults to off and can be + set to `null` to turn off once on. Input audio transcription is not native + to the model, since the model consumes audio directly. Transcription runs + asynchronously and should be treated as rough guidance + rather than the representation understood by the model. + """) + input_audio_transcription?: { + /** The model to use for transcription. */ + `model`?: string; + }; + + /** + * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is + * the minimum speed. 1.5 is the maximum speed. This value can only be changed + * in between model turns, not while a response is in progress. + */ + @minValue(0.25) + @maxValue(1.5) + speed?: float32 = 1; + + @doc(""" + Configuration options for tracing. Set to null to disable tracing. Once + tracing is enabled for a session, the configuration cannot be modified. + + `auto` will create a trace for the session with default values for the + workflow name, group id, and metadata. + """) + tracing?: "auto" | { + /** + * The name of the workflow to attach to this trace. This is used to + * name the trace in the traces dashboard.
+ */ + workflow_name?: string; + + /** + * The group id to attach to this trace to enable filtering and + * grouping in the traces dashboard. + */ + group_id?: string; + + /** + * The arbitrary metadata to attach to this trace to enable + * filtering in the traces dashboard. + */ + metadata?: unknown; + }; + + @doc(""" + Configuration for turn detection. Can be set to `null` to turn off. Server + VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + """) + turn_detection?: { + @doc(""" + Type of turn detection, only `server_vad` is currently supported. + """) + type?: string; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A + * higher threshold will require louder audio to activate the model, and + * thus might perform better in noisy environments. + */ + threshold?: float32; + + /** + * Amount of audio to include before the VAD detected speech (in + * milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: int32; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults + * to 500ms. With shorter values the model will respond more quickly, + * but may jump in on short pauses from the user. + */ + silence_duration_ms?: int32; + }; + + // Tool customization: use enriched tool definition + /** Tools (functions) available to the model. */ + tools?: VoiceLiveTool[]; + + @doc(""" + How the model chooses tools. Options are `auto`, `none`, `required`, or + specify a function. + """) + tool_choice?: string; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + @doc(""" + Maximum number of output tokens for a single assistant response, + inclusive of tool calls. Provide an integer between 1 and 4096 to + limit output tokens, or `inf` for the maximum available tokens for a + given model. Defaults to `inf`. + """) + max_response_output_tokens?: int32 | "inf"; +} + +/** The item to add to the conversation. */ +model VoiceLiveConversationItemWithReference { + @doc(""" + For an item of type (`message` | `function_call` | `function_call_output`) + this field allows the client to assign the unique ID of the item. It is + not required because the server will generate one if not provided. + + For an item of type `item_reference`, this field is required and is a + reference to any item that has previously existed in the conversation. + """) + id?: string; + + @doc(""" + The type of the item (`message`, `function_call`, `function_call_output`, `item_reference`). + """) + type?: "message" | "function_call" | "function_call_output"; + + @doc(""" + Identifier for the API object being returned - always `voicelive.item`. + """) + object?: "voicelive.item"; + + @doc(""" + The status of the item (`completed`, `incomplete`). These have no effect + on the conversation, but are accepted for consistency with the + `conversation.item.created` event. + """) + status?: "completed" | "incomplete"; + + @doc(""" + The role of the message sender (`user`, `assistant`, `system`), only + applicable for `message` items. + """) + role?: "user" | "assistant" | "system"; + + @doc(""" + The content of the message, applicable for `message` items. + - Message items of role `system` support only `input_text` content + - Message items of role `user` support `input_text` and `input_audio` + content + - Message items of role `assistant` support `text` content. 
+ """) + content?: VoiceLiveConversationItemWithReferenceContent[]; + + @doc(""" + The ID of the function call (for `function_call` and + `function_call_output` items). If passed on a `function_call_output` + item, the server will check that a `function_call` item with the same + ID exists in the conversation history. + """) + call_id?: string; + + @doc(""" + The name of the function being called (for `function_call` items). + """) + name?: string; + + @doc(""" + The arguments of the function call (for `function_call` items). + """) + arguments?: string; + + @doc(""" + The output of the function call (for `function_call_output` items). + """) + output?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. + The server will respond with a `conversation.item.retrieved` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model VoiceLiveClientEventConversationItemRetrieve extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `conversation.item.retrieve`. + """) + type: VoiceLiveClientEventType.conversation_item_retrieve; + + /** The ID of the item to retrieve. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +/** Send this event to update a transcription session. */ +model VoiceLiveClientEventTranscriptionSessionUpdate + extends VoiceLiveClientEvent { + @doc(""" + The event type, must be `transcription_session.update`. + """) + type: VoiceLiveClientEventType.transcription_session_update; + + session: VoiceLiveTranscriptionSessionCreateRequest; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of an input audio transcription content part is updated. */ +model VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.item.input_audio_transcription.delta`. + """) + type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_delta; + + /** The ID of the item. */ + item_id: string; + + /** The index of the content part in the item's content array. */ + content_index?: int32; + + /** The text delta. */ + delta?: string; + + /** The log probabilities of the transcription. */ + logprobs?: LogProbProperties[] | null; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is retrieved with `conversation.item.retrieve`. + """) +model VoiceLiveServerEventConversationItemRetrieved extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `conversation.item.retrieved`. + """) + type: VoiceLiveServerEventType.conversation_item_retrieved; + + // Tool customization: apply enriched item definition hierarchy + item: VoiceLiveConversationResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a transcription session is updated with a `transcription_session.update` event, unless + there is an error. + """) +model VoiceLiveServerEventTranscriptionSessionUpdated + extends VoiceLiveServerEvent { + @doc(""" + The event type, must be `transcription_session.updated`. 
+ """) + type: VoiceLiveServerEventType.transcription_session_updated; + + session: VoiceLiveTranscriptionSessionCreateResponse; +} + +/** VoiceLive transcription session object configuration. */ +model VoiceLiveTranscriptionSessionCreateRequest { + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: ("text" | "audio")[]; + + @doc(""" + The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + For `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, + single channel (mono), and little-endian byte order. + """) + input_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw" = "pcm16"; + + /** Configuration for input audio transcription. The client can optionally set the language and prompt for transcription; these offer additional guidance to the transcription service. */ + input_audio_transcription?: { + @doc(""" + The model to use for transcription, current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`. + """) + `model`?: "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" | "whisper-1"; + + @doc(""" + The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format + will improve accuracy and latency. + """) + language?: string; + + @doc(""" + An optional text to guide the model's style or continue a previous audio + segment. + For `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting). + For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology". + """) + prompt?: string; + }; + + @doc(""" + Configuration for turn detection, either Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response. + Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech. + Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency. + """) + turn_detection?: { + /** Type of turn detection. */ + type?: "server_vad" | "semantic_vad" = "server_vad"; + + @doc(""" + Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. + """) + eagerness?: "low" | "medium" | "high" | "auto" = "auto"; + + @doc(""" + Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A + higher threshold will require louder audio to activate the model, and + thus might perform better in noisy environments. + """) + threshold?: float32; + + @doc(""" + Used only for `server_vad` mode. Amount of audio to include before the VAD detected speech (in + milliseconds). Defaults to 300ms. + """) + prefix_padding_ms?: int32; + + @doc(""" + Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). Defaults + to 500ms.
With shorter values the model will respond more quickly, + but may jump in on short pauses from the user. + """) + silence_duration_ms?: int32; + + /** Whether or not to automatically generate a response when a VAD stop event occurs. Not available for transcription sessions. */ + create_response?: boolean = true; + + @doc(""" + Whether or not to automatically interrupt any ongoing response with output to the default + conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. Not available for transcription sessions. + """) + interrupt_response?: boolean = true; + }; + + @doc(""" + Configuration for input audio noise reduction. This can be set to `null` to turn off. + Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. + Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. + """) + input_audio_noise_reduction?: { + @doc(""" + Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones. + """) + type?: "near_field" | "far_field"; + } | null = null; + + @doc(""" + The set of items to include in the transcription. Current available items are: + - `item.input_audio_transcription.logprobs` + """) + include?: string[]; + + /** Configuration options for the generated client secret. */ + client_secret?: { + /** Configuration for the ephemeral token expiration. */ + expires_at?: { + @doc(""" + The anchor point for the ephemeral token expiration. Only `created_at` is currently supported. + """) + anchor?: "created_at" = "created_at"; + + @doc(""" + The number of seconds from the anchor point to the expiration. Select a value between `10` and `7200`. + """) + seconds?: int32 = 600; + }; + }; +} + +/** + * A new VoiceLive transcription session configuration. + * + * When a session is created on the server via REST API, the session object + * also contains an ephemeral key. Default TTL for keys is 10 minutes. This + * property is not present when a session is updated via the WebSocket API. + */ +model VoiceLiveTranscriptionSessionCreateResponse { + /** + * Ephemeral key returned by the API. Only present when the session is + * created on the server via REST API. + */ + client_secret: { + /** + * Ephemeral key usable in client environments to authenticate connections + * to the VoiceLive API. Use this in client-side environments rather than + * a standard API token, which should only be used server-side. + */ + value: string; + + // Tool customization: 'created' and fields ending in '_at' are Unix encoded utcDateTime + /** + * Timestamp for when the token expires. Currently, all tokens expire + * after one minute. + */ + @encode("unixTimestamp", int32) + expires_at: utcDateTime; + }; + + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: ("text" | "audio")[]; + + @doc(""" + The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + input_audio_format?: string; + + /** Configuration of the transcription model. */ + input_audio_transcription?: { + @doc(""" + The model to use for transcription. Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. + """) + `model`?: "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" | "whisper-1"; + + @doc(""" + The language of the input audio. 
Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format + will improve accuracy and latency. + """) + language?: string; + + /** + * An optional text to guide the model's style or continue a previous audio + * segment. The [prompt](/docs/guides/speech-to-text#prompting) should match + * the audio language. + */ + prompt?: string; + }; + + @doc(""" + Configuration for turn detection. Can be set to `null` to turn off. Server + VAD means that the model will detect the start and end of speech based on + audio volume and respond at the end of user speech. + """) + turn_detection?: { + @doc(""" + Type of turn detection, only `server_vad` is currently supported. + """) + type?: string; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A + * higher threshold will require louder audio to activate the model, and + * thus might perform better in noisy environments. + */ + threshold?: float32; + + /** + * Amount of audio to include before the VAD detected speech (in + * milliseconds). Defaults to 300ms. + */ + prefix_padding_ms?: int32; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults + * to 500ms. With shorter values the model will respond more quickly, + * but may jump in on short pauses from the user. + */ + silence_duration_ms?: int32; + }; +} diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp new file mode 100644 index 000000000000..00f424748b9b --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -0,0 +1,45 @@ +import "../common"; +import "./models.tsp"; + +using TypeSpec.Http; +using TypeSpec.OpenAPI; + +namespace OpenAI; + +@route("voicelive") +@tag("VoiceLive") +interface VoiceLive { + @summary("Starts a real-time session for conversation or transcription.") + startVoiceLiveSession( + ...VoiceLiveBetaHeader, + @body requestMessages: VoiceLiveClientEvent[], + ): VoiceLiveServerEvent[]; + + @post + @route("sessions") + @operationId("create-voicelive-session") + @summary(""" + Create an ephemeral API token for use in client-side applications with the VoiceLive API. Can be configured with the same session parameters as the session.update client event. + + It responds with a session object, plus a client_secret key which contains a usable ephemeral API token that can be used to authenticate browser clients for the VoiceLive API. + """) + createEphemeralToken( + @body request: VoiceLiveSessionCreateRequest, + ): VoiceLiveSessionCreateResponse | ErrorResponse; + + @post + @route("transcription_sessions") + @operationId("create-voicelive-transcription-session") + @summary(""" + Create an ephemeral API token for use in client-side applications with the VoiceLive API specifically for voicelive transcriptions. Can be configured with the same session parameters as the transcription_session.update client event. + + It responds with a session object, plus a client_secret key which contains a usable ephemeral API token that can be used to authenticate browser clients for the VoiceLive API. 
+ """) + createEphemeralTranscriptionToken( + @body request: VoiceLiveTranscriptionSessionCreateRequest, + ): VoiceLiveTranscriptionSessionCreateResponse | ErrorResponse; +} + +alias VoiceLiveBetaHeader = { + @header("OpenAI-Beta") openAIBeta: "voicelive=v1"; +}; diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml new file mode 100644 index 000000000000..b86f7f2f3ea9 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -0,0 +1,53 @@ +parameters: + "service-dir": + default: "sdk/ai" + "dependencies": + default: "" +emit: + - "@azure-tools/typespec-autorest" +linter: + extends: + - "@azure-tools/typespec-azure-rulesets/data-plane" +options: + "@azure-tools/typespec-autorest": + azure-resource-provider-folder: "data-plane" + emit-lro-options: "none" + emitter-output-dir: "{project-root}/.." + output-file: "{azure-resource-provider-folder}/{service-name}/{version-status}/{version}/widgets.json" + "@azure-tools/typespec-python": + package-dir: "azure-ai-voicelive" + namespace: "azure.ai.voicelive" + generate-test: true + generate-sample: true + flavor: azure + "@azure-tools/typespec-csharp": + package-dir: "Azure.AI.VoiceLive" + clear-output-folder: true + model-namespace: false + namespace: "{package-dir}" + flavor: azure + "@azure-typespec/http-client-csharp": + namespace: Azure.AI.VoiceLive + model-namespace: false + "@azure-tools/typespec-ts": + package-dir: "azure-ai-voicelive" + package-details: + name: "@azure-rest/ai-voicelive" + flavor: azure + "@azure-tools/typespec-java": + package-dir: "azure-ai-voicelive" + namespace: com.azure.ai.voicelive + flavor: azure + "@azure-tools/typespec-go": + module: "github.com/Azure/azure-sdk-for-go/{service-dir}/{package-dir}" + service-dir: "sdk/ai" + package-dir: "voicelive" + module-version: "0.0.1" + generate-fakes: true + inject-spans: true + single-client: true + slice-elements-byval: true + flavor: azure + "@azure-tools/typespec-client-generator-cli": + additionalDirectories: + - "specification/ai/ai/data-plane/VoiceLive/" \ No newline at end of file From 2e0558345d6c57e7e583cec80cd978c3d8808505 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Mon, 14 Jul 2025 11:11:10 -0700 Subject: [PATCH 02/48] Pass2 --- .../ai/data-plane/VoiceLive/audio/client.tsp | 10 + .../ai/data-plane/VoiceLive/audio/custom.tsp | 23 + .../ai/data-plane/VoiceLive/audio/main.tsp | 2 + .../ai/data-plane/VoiceLive/audio/models.tsp | 489 ++++++++++++++++++ .../data-plane/VoiceLive/audio/operations.tsp | 70 +++ .../ai/data-plane/VoiceLive/client.tsp | 4 + .../ai/data-plane/VoiceLive/common/custom.tsp | 70 +++ .../ai/data-plane/VoiceLive/common/main.tsp | 2 + .../ai/data-plane/VoiceLive/common/models.tsp | 475 +++++++++++++++++ .../ai/data-plane/VoiceLive/models.tsp | 5 +- .../ai/data-plane/VoiceLive/operations.tsp | 2 +- .../VoiceLive/servers/websocket.tsp | 20 + .../ai/data-plane/VoiceLive/tspconfig.yaml | 7 +- 13 files changed, 1173 insertions(+), 6 deletions(-) create mode 100644 specification/ai/data-plane/VoiceLive/audio/client.tsp create mode 100644 specification/ai/data-plane/VoiceLive/audio/custom.tsp create mode 100644 specification/ai/data-plane/VoiceLive/audio/main.tsp create mode 100644 specification/ai/data-plane/VoiceLive/audio/models.tsp create mode 100644 specification/ai/data-plane/VoiceLive/audio/operations.tsp create mode 100644 specification/ai/data-plane/VoiceLive/client.tsp create mode 100644 specification/ai/data-plane/VoiceLive/common/custom.tsp create mode 
100644 specification/ai/data-plane/VoiceLive/common/main.tsp create mode 100644 specification/ai/data-plane/VoiceLive/common/models.tsp create mode 100644 specification/ai/data-plane/VoiceLive/servers/websocket.tsp diff --git a/specification/ai/data-plane/VoiceLive/audio/client.tsp b/specification/ai/data-plane/VoiceLive/audio/client.tsp new file mode 100644 index 000000000000..47b17848e1a0 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/audio/client.tsp @@ -0,0 +1,10 @@ +import "@azure-tools/typespec-client-generator-core"; +import "./models.tsp"; + +using Azure.ClientGenerator.Core; +using OpenAI; + +@@visibility(CreateTranscriptionResponseVerboseJson.words, "read"); +@@visibility(CreateTranscriptionResponseVerboseJson.segments, "read"); + +@@visibility(CreateTranslationResponseVerboseJson.segments, "read"); diff --git a/specification/ai/data-plane/VoiceLive/audio/custom.tsp b/specification/ai/data-plane/VoiceLive/audio/custom.tsp new file mode 100644 index 000000000000..3dc9b2d92731 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/audio/custom.tsp @@ -0,0 +1,23 @@ +import "./models.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +union TranscriptionAudioResponseFormat { + AudioResponseFormat, +} + +union TranslationAudioResponseFormat { + AudioResponseFormat, +} + +union TranscriptTextUsageType { + tokens: "tokens", + duration: "duration", +} + +@discriminator("type") +model TranscriptTextUsage { + type: TranscriptTextUsageType; +} diff --git a/specification/ai/data-plane/VoiceLive/audio/main.tsp b/specification/ai/data-plane/VoiceLive/audio/main.tsp new file mode 100644 index 000000000000..e7af5325f311 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/audio/main.tsp @@ -0,0 +1,2 @@ +import "./client.tsp"; +import "./operations.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp new file mode 100644 index 000000000000..b6a3c9a4db02 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/audio/models.tsp @@ -0,0 +1,489 @@ +/* + * This file was automatically generated from an OpenAPI .yaml file. + * Edits made directly to this file will be lost. + */ + +import "../common"; +import "./custom.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +// Tool generated type. Extracts from CreateTranscriptionResponseJson.logprobs +alias CreateTranscriptionResponseJsonLogprob = { + /** The token in the transcription. */ + token?: string; + + /** The log probability of the token. */ + logprob?: float32; + + /** The bytes of the token. */ + bytes?: float32[]; +}; + +// Tool generated type. Extracts from TranscriptTextDeltaEvent.logprobs +alias TranscriptTextDeltaEventLogprob = { + /** The token that was used to generate the log probability. */ + token?: string; + + /** The log probability of the token. */ + logprob?: float32; + + /** The bytes that were used to generate the log probability. */ + bytes?: int32[]; +}; + +// Tool generated type. Extracts from TranscriptTextDoneEvent.logprobs +alias TranscriptTextDoneEventLogprob = { + /** The token that was used to generate the log probability. */ + token?: string; + + /** The log probability of the token. */ + logprob?: float32; + + /** The bytes that were used to generate the log probability. */ + bytes?: int32[]; +}; + +@doc(""" + The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, the only supported format is `json`. 
+ """) +union AudioResponseFormat { + "json", + "text", + "srt", + "verbose_json", + "vtt", +} + +model CreateSpeechRequest { + @doc(""" + One of the available [TTS models](/docs/models#tts): `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. + """) + @extension("x-oaiTypeLabel", "string") + `model`: string | "tts-1" | "tts-1-hd" | "gpt-4o-mini-tts"; + + /** The text to generate audio for. The maximum length is 4096 characters. */ + @maxLength(4096) + input: string; + + @doc(""" + Control the voice of your generated audio with additional instructions. Does not work with `tts-1` or `tts-1-hd`. + """) + @maxLength(4096) + instructions?: string; + + @doc(""" + The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`. Previews of the voices are available in the [Text to speech guide](/docs/guides/text-to-speech#voice-options). + """) + voice: VoiceIdsShared; + + @doc(""" + The format to return audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. + """) + response_format?: "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm" = "mp3"; + + @doc(""" + The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. + """) + @minValue(0.25) + @maxValue(4) + speed?: float32 = 1; + + @doc(""" + The format to stream the audio in. Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or `tts-1-hd`. + """) + stream_format?: "sse" | "audio" = "audio"; +} + +// Tool customization: Convert to discriminated type base +union CreateSpeechResponseStreamEventType { + speech_audio_delta: "speech.audio.delta", + speech_audio_done: "speech.audio.done", +} +@discriminator("type") +model CreateSpeechResponseStreamEvent { + type: CreateSpeechResponseStreamEventType; +} + +// Tool customization (apply_discriminator): Apply discriminated type base +/** Emitted for each chunk of audio data generated during speech synthesis. */ +model SpeechAudioDeltaEvent extends CreateSpeechResponseStreamEvent { + @doc(""" + The type of the event. Always `speech.audio.delta`. + """) + type: CreateSpeechResponseStreamEventType.speech_audio_delta; + + // Tool customization: base64 input uses an encoded bytes type + /** A chunk of Base64-encoded audio data. */ + @encode("base64", string) + audio: bytes; +} + +// Tool customization (apply_discriminator): Apply discriminated type base +/** Emitted when the speech synthesis is complete and all audio has been streamed. */ +model SpeechAudioDoneEvent extends CreateSpeechResponseStreamEvent { + @doc(""" + The type of the event. Always `speech.audio.done`. + """) + type: CreateSpeechResponseStreamEventType.speech_audio_done; + + /** Token usage statistics for the request. */ + usage: { + /** Number of input tokens in the prompt. */ + input_tokens: int32; + + /** Number of output tokens generated. */ + output_tokens: int32; + + /** Total number of tokens used (input + output). */ + total_tokens: int32; + }; +} + +model CreateTranscriptionRequest { + /** The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. */ + @extension("x-oaiTypeLabel", "file") + file: bytes; + + @doc(""" + ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
+ """) + @extension("x-oaiTypeLabel", "string") + `model`: + | string + | "whisper-1" + | "gpt-4o-transcribe" + | "gpt-4o-mini-transcribe"; + + @doc(""" + The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format will improve accuracy and latency. + """) + language?: string; + + /** An optional text to guide the model's style or continue a previous audio segment. The [prompt](/docs/guides/speech-to-text#prompting) should match the audio language. */ + prompt?: string; + + // Tool customization: use scenario-specific composed union + response_format?: TranscriptionAudioResponseFormat = "json"; + + // Tool customization: add missing but documented min/max for temperature + /** The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. */ + @minValue(0) + @maxValue(1) + temperature?: float32 = 0; + + @doc(""" + Additional information to include in the transcription response. + `logprobs` will return the log probabilities of the tokens in the + response to understand the model's confidence in the transcription. + `logprobs` only works with response_format set to `json` and only with + the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`. + """) + `include[]`?: TranscriptionInclude[]; + + @doc(""" + The timestamp granularities to populate for this transcription. `response_format` must be set to `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. + """) + `timestamp_granularities[]`?: ("word" | "segment")[] = #["segment"]; + + @doc(""" + If set to true, the model response data will be streamed to the client + as it is generated using [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the [Streaming section of the Speech-to-Text guide](/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + """) + stream?: boolean | null = false; + + @doc(""" + Controls how the audio is cut into chunks. When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + """) + @extension("x-oaiTypeLabel", "string") + chunking_strategy?: VadConfig | null; +} + +model CreateTranslationRequest { + /** The audio file object (not file name) to translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. */ + @extension("x-oaiTypeLabel", "file") + file: bytes; + + @doc(""" + ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available. + """) + @extension("x-oaiTypeLabel", "string") + `model`: string | "whisper-1"; + + /** An optional text to guide the model's style or continue a previous audio segment. The [prompt](/docs/guides/speech-to-text#prompting) should be in English.
*/ + prompt?: string; + + // Tool customization: use scenario-specific composed union + @doc(""" + The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. + """) + response_format?: TranslationAudioResponseFormat = "json"; + + // Tool customization: add missing but documented min/max for temperature + /** The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. */ + @minValue(0) + @maxValue(1) + temperature?: float32 = 0; +} + +/** Represents a transcription response returned by model, based on the provided input. */ +model CreateTranscriptionResponseJson { + /** The transcribed text. */ + text: string; + + @doc(""" + The log probabilities of the tokens in the transcription. Only returned with the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added to the `include` array. + """) + logprobs?: CreateTranscriptionResponseJsonLogprob[]; + + // Tool customization: Substitute common discriminated type base + /** Token usage statistics for the request. */ + usage?: TranscriptTextUsage; +} + +// Tool customization (apply_discriminator): Apply discriminated base +/** Usage statistics for models billed by audio input duration. */ +model TranscriptTextUsageDuration extends TranscriptTextUsage { + @doc(""" + The type of the usage object. Always `duration` for this variant. + """) + type: TranscriptTextUsageType.duration; + + // Tool customization: numeric timespans are encoded durations + /** Duration of the input audio in seconds. */ + @encode("seconds", float32) + seconds: duration; +} + +// Tool customization (apply_discriminator): Apply discriminated base +/** Usage statistics for models billed by token usage. */ +model TranscriptTextUsageTokens extends TranscriptTextUsage { + @doc(""" + The type of the usage object. Always `tokens` for this variant. + """) + type: TranscriptTextUsageType.tokens; + + /** Number of input tokens billed for this request. */ + input_tokens: int32; + + /** Details about the input tokens billed for this request. */ + input_token_details?: { + /** Number of text tokens billed for this request. */ + text_tokens?: int32; + + /** Number of audio tokens billed for this request. */ + audio_tokens?: int32; + }; + + /** Number of output tokens generated. */ + output_tokens: int32; + + /** Total number of tokens used (input + output). */ + total_tokens: int32; +} + +// Tool customization: Add a missing 'task' field, present on the wire but not in the spec +/** Represents a verbose json transcription response returned by model, based on the provided input. */ +model CreateTranscriptionResponseVerboseJson { + /** The task label. */ + task: "transcribe"; + + /** The language of the input audio. */ + language: string; + + // Tool customization: improve representation of float duration + /** The duration of the input audio. */ + @encode("seconds", float32) + duration: duration; + + /** The transcribed text. */ + text: string; + + /** Extracted words and their corresponding timestamps. */ + words?: TranscriptionWord[]; + + /** Segments of the transcribed text and their corresponding details. 
*/ + segments?: TranscriptionSegment[]; + + // Tool customization: Substitute common discriminated type base (underspecification of non-verbose union parity assumed) + usage?: TranscriptTextUsage; +} + +model CreateTranslationResponseJson { + text: string; +} + +// Tool customization: Add a missing 'task' field, present on the wire but not in the spec +model CreateTranslationResponseVerboseJson { + /** The task label. */ + task: "translate"; + + @doc(""" + The language of the output translation (always `english`). + """) + language: string; + + // Tool customization: improve representation of float duration + /** The duration of the input audio. */ + @encode("seconds", float32) + duration: duration; + + /** The translated text. */ + text: string; + + /** Segments of the translated text and their corresponding details. */ + segments?: TranscriptionSegment[]; +} + +// Tool customization: Establish discriminated type hierarchy for transcription stream events +union CreateTranscriptionResponseStreamEventType { + string, + transcript_text_delta: "transcript.text.delta", + transcript_text_done: "transcript.text.done", +} +@discriminator("type") +model CreateTranscriptionResponseStreamEvent { + type: CreateTranscriptionResponseStreamEventType; +} + +model TranscriptionSegment { + /** Unique identifier of the segment. */ + id: int32; + + /** Seek offset of the segment. */ + seek: int32; + + // Tool customization: numeric timespans are encoded durations + /** Start time of the segment in seconds. */ + @encode("seconds", float32) + start: duration; + + // Tool customization: numeric timespans are encoded durations + /** End time of the segment in seconds. */ + @encode("seconds", float32) + end: duration; + + /** Text content of the segment. */ + text: string; + + /** Array of token IDs for the text content. */ + tokens: int32[]; + + /** Temperature parameter used for generating the segment. */ + temperature: float32; + + /** Average logprob of the segment. If the value is lower than -1, consider the logprobs failed. */ + avg_logprob: float32; + + /** Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed. */ + compression_ratio: float32; + + @doc(""" + Probability of no speech in the segment. If the value is higher than 1.0 and the `avg_logprob` is below -1, consider this segment silent. + """) + no_speech_prob: float32; +} + +model TranscriptionWord { + /** The text content of the word. */ + word: string; + + // Tool customization: numeric timespans are encoded durations + /** Start time of the word in seconds. */ + @encode("seconds", float32) + start: duration; + + // Tool customization: numeric timespans are encoded durations + /** End time of the word in seconds. */ + @encode("seconds", float32) + end: duration; +} + +// Tool customization (apply_discriminator): Apply discriminated type base for transcription stream events +@doc(""" + Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`. + """) +model TranscriptTextDeltaEvent extends CreateTranscriptionResponseStreamEvent { + @doc(""" + The type of the event. Always `transcript.text.delta`. + """) + type: CreateTranscriptionResponseStreamEventType.transcript_text_delta; + + /** The text delta that was additionally transcribed. */ + delta: string; + + @doc(""" + The log probabilities of the delta. 
Only included if you [create a transcription](/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. + """) + logprobs?: TranscriptTextDeltaEventLogprob[]; +} + +// Tool customization (apply_discriminator): Apply discriminated type base for transcription stream events +@doc(""" + Emitted when the transcription is complete. Contains the complete transcription text. Only emitted when you [create a transcription](/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`. + """) +model TranscriptTextDoneEvent extends CreateTranscriptionResponseStreamEvent { + @doc(""" + The type of the event. Always `transcript.text.done`. + """) + type: CreateTranscriptionResponseStreamEventType.transcript_text_done; + + /** The text that was transcribed. */ + text: string; + + @doc(""" + The log probabilities of the individual tokens in the transcription. Only included if you [create a transcription](/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. + """) + logprobs?: TranscriptTextDoneEventLogprob[]; + + // Tool customization: Substitute common discriminated type base (underspecification of non-verbose/non-streaming union parity assumed) + usage?: TranscriptTextUsage; +} + +union TranscriptionInclude { + "logprobs", +} + +@doc(""" + Controls how the audio is cut into chunks. When set to `"auto"`, the + server first normalizes loudness and then uses voice activity detection (VAD) to + choose boundaries. `server_vad` object can be provided to tweak VAD detection + parameters manually. If unset, the audio is transcribed as a single block. + """) +union TranscriptionChunkingStrategy { + "auto", + VadConfig, +} + +model VadConfig { + @doc(""" + Must be set to `server_vad` to enable manual chunking using server side VAD. + """) + type: "server_vad"; + + /** + * Amount of audio to include before the VAD detected speech (in + * milliseconds). + */ + prefix_padding_ms?: int32 = 300; + + /** + * Duration of silence to detect speech stop (in milliseconds). + * With shorter values the model will respond more quickly, + * but may jump in on short pauses from the user. + */ + silence_duration_ms?: int32 = 200; + + /** + * Sensitivity threshold (0.0 to 1.0) for voice activity detection. A + * higher threshold will require louder audio to activate the model, and + * thus might perform better in noisy environments. 
+ */ + threshold?: float32 = 0.5; +} diff --git a/specification/ai/data-plane/VoiceLive/audio/operations.tsp b/specification/ai/data-plane/VoiceLive/audio/operations.tsp new file mode 100644 index 000000000000..95599c938eff --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/audio/operations.tsp @@ -0,0 +1,70 @@ +import "@typespec/http"; +import "@typespec/openapi"; + +import "../common"; +import "./models.tsp"; + +using TypeSpec.Http; +using TypeSpec.OpenAPI; + +namespace OpenAI; + +@route("/audio") +interface Audio { + @route("speech") + @post + @operationId("createSpeech") + @tag("Audio") + @summary("Generates audio from the input text.") + createSpeech( + @header accept: "application/octet-stream", + @body requestBody: CreateSpeechRequest, + ): { + /** chunked */ + @header("Transfer-Encoding") transferEncoding?: string; + + @header contentType: "application/octet-stream"; + @body responseBody: bytes; + } | SseResponseOf<CreateSpeechResponseStreamEvent> | ErrorResponse; + + @route("transcriptions") + @post + @operationId("createTranscription") + @tag("Audio") + @summary("Transcribes audio into the input language.") + createTranscription( + ...AcceptJsonOrEventStreamHeader, + @header contentType: "multipart/form-data", + @body requestBody: CreateTranscriptionRequest, + ): + | CreateTranscriptionResponseVerboseJson + | CreateTranscriptionResponseJson + | SseResponseOf<CreateTranscriptionResponseStreamEvent> + | { + // TODO: This response is not defined in the OpenAPI spec. + @header contentType: "text/plain"; + + @body responseBody: string; + } + | ErrorResponse; + + @route("translations") + @post + @operationId("createTranslation") + @tag("Audio") + @summary("Translates audio into English.") + createTranslation( + @header accept: "application/json", + @header contentType: "multipart/form-data", + @body requestBody: CreateTranslationRequest, + ): + | CreateTranslationResponseVerboseJson + | CreateTranslationResponseJson + | { + // TODO: This response is not defined in the OpenAPI spec. + @header contentType: "text/plain"; + + @body responseBody: string; + } + | ErrorResponse; +} diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp new file mode 100644 index 000000000000..16f81c764b21 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -0,0 +1,4 @@ +import "@azure-tools/typespec-client-generator-core"; +import "./servers/websocket.tsp"; + +using Azure.ClientGenerator.Core; diff --git a/specification/ai/data-plane/VoiceLive/common/custom.tsp b/specification/ai/data-plane/VoiceLive/common/custom.tsp new file mode 100644 index 000000000000..9ee4b1fdadb9 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/custom.tsp @@ -0,0 +1,70 @@ +import "@typespec/http"; +import "@typespec/openapi"; + +using TypeSpec.Http; +using TypeSpec.OpenAPI; + +namespace OpenAI; + +@discriminator("type") +model ResponseFormat { + type: "text" | "json_object" | "json_schema"; +} + +alias AcceptJsonHeader = { + @header accept: "application/json"; +}; + +alias AcceptJsonOrEventStreamHeader = { + @header accept: "application/json" | "text/event-stream"; +}; + +alias AssistantsBetaHeader = { + @header("OpenAI-Beta") openAIBeta: "assistants=v2"; +}; + +alias PageLimitQueryParameter = { + /** + * A limit on the number of objects to be returned. Limit can range between 1 and 100, and the + * default is 20. + */ + @query limit?: int32 = 20; +}; + +alias PageOrderQueryParameter = { + /** + * Sort order by the `created_at` timestamp of the objects.
`asc` for ascending order and`desc` + * for descending order. + */ + @query order?: "asc" | "desc"; +}; + +alias PageAfterQueryParameter = { + /** + * A cursor for use in pagination. `after` is an object ID that defines your place in the list. + * For instance, if you make a list request and receive 100 objects, ending with obj_foo, your + * subsequent call can include after=obj_foo in order to fetch the next page of the list. + */ + @query after?: string; +}; + +alias PageBeforeQueryParameter = { + /** + * A cursor for use in pagination. `before` is an object ID that defines your place in the list. + * For instance, if you make a list request and receive 100 objects, ending with obj_foo, your + * subsequent call can include before=obj_foo in order to fetch the previous page of the list. + */ + @query before?: string; +}; + +alias CommonPageQueryParameters = { + ...PageLimitQueryParameter; + ...PageOrderQueryParameter; + ...PageAfterQueryParameter; + ...PageBeforeQueryParameter; +}; + +alias SseResponseOf = { + @header("Content-Type") contentType: "text/event-stream"; + @body responseBody: T; +}; diff --git a/specification/ai/data-plane/VoiceLive/common/main.tsp b/specification/ai/data-plane/VoiceLive/common/main.tsp new file mode 100644 index 000000000000..223114fb0ebc --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/main.tsp @@ -0,0 +1,2 @@ +import "./custom.tsp"; +import "./models.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp new file mode 100644 index 000000000000..60b18b5a2ef4 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -0,0 +1,475 @@ +/* + * This file was automatically generated from an OpenAPI .yaml file. + * Edits made directly to this file will be lost. + */ + +import "./custom.tsp"; + +using TypeSpec.OpenAPI; + +namespace OpenAI; + +model Error { + code: string | null; + message: string; + param: string | null; + type: string; +} + +// Tool customization: apply error decorator +@error +model ErrorResponse { + error: Error; +} + +// Tool customization: Wrap for reuse +@doc(""" + The parameters the functions accepts, described as a JSON Schema object. See the [guide](/docs/guides/function-calling) for examples, and the [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for documentation about the format. + + Omitting `parameters` defines a function with an empty parameter list. + """) +model FunctionParametersCommon { + /** + * The parameters the functions accepts, described as a JSON Schema object. See the [guide](/docs/guides/function-calling) for examples, and the [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for documentation about the format. + * + * Omitting `parameters` defines a function with an empty parameter list. + */ + parameters?: unknown; +} + +model FunctionObject { + /** A description of what the function does, used by the model to choose when and how to call the function. */ + description?: string; + + /** The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64. */ + name: string; + + ...FunctionParametersCommon; + + @doc(""" + Whether to enable strict schema adherence when generating the function call. If set to true, the model will follow the exact schema defined in the `parameters` field. Only a subset of JSON Schema is supported when `strict` is `true`. 
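As a hedged sketch of how the FunctionObject model above might be populated, the get_weather name, its description, and the schema below are invented for illustration and are not part of this spec.

const weatherTool = {
  name: "get_weather",                  // hypothetical function name
  description: "Look up the current weather for a city.",
  parameters: {                         // JSON Schema object, per FunctionParametersCommon
    type: "object",
    properties: { city: { type: "string" } },
    required: ["city"],
  },
  strict: true,                         // opt in to exact schema adherence
};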
Learn more about Structured Outputs in the [function calling guide](docs/guides/function-calling). + """) + strict?: boolean | null = false; +} + +// Tool customization: Wrap for reuse +/** + * Set of 16 key-value pairs that can be attached to an object. This can be + * useful for storing additional information about the object in a structured + * format, and querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings + * with a maximum length of 512 characters. + */ +model MetadataPropertyForRequest { + /** + * Set of 16 key-value pairs that can be attached to an object. This can be + * useful for storing additional information about the object in a structured + * format, and querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings + * with a maximum length of 512 characters. + */ + @extension("x-oaiTypeLabel", "map") + metadata?: Record; +} +model MetadataPropertyForResponse { + /** + * Set of 16 key-value pairs that can be attached to an object. This can be + * useful for storing additional information about the object in a structured + * format, and querying for objects via API or the dashboard. + * + * Keys are strings with a maximum length of 64 characters. Values are strings + * with a maximum length of 512 characters. + */ + @extension("x-oaiTypeLabel", "map") + metadata: Record | null; +} + +// Tool customization (apply_discriminator): establish a common, discriminated union +/** Default response format. Used to generate text responses. */ +model ResponseFormatText extends ResponseFormat { + @doc(""" + The type of response format being defined. Always `text`. + """) + type: "text"; +} + +// Tool customization (apply_discriminator): establish a common, discriminated union +@doc(""" + JSON object response format. An older method of generating JSON responses. + Using `json_schema` is recommended for models that support it. Note that the + model will not generate JSON without a system or user message instructing it + to do so. + """) +model ResponseFormatJsonObject extends ResponseFormat { + @doc(""" + The type of response format being defined. Always `json_object`. + """) + type: "json_object"; +} + +/** + * The schema for the response format, described as a JSON Schema object. + * Learn how to build JSON schemas [here](https://json-schema.org/). + */ +model ResponseFormatJsonSchemaSchema is Record; + +// Tool customization (apply_discriminator): establish a common, discriminated union +/** + * JSON Schema response format. Used to generate structured JSON responses. + * Learn more about [Structured Outputs](/docs/guides/structured-outputs). + */ +model ResponseFormatJsonSchema extends ResponseFormat { + @doc(""" + The type of response format being defined. Always `json_schema`. + """) + type: "json_schema"; + + /** Structured Outputs configuration options, including a JSON Schema. */ + json_schema: { + /** + * A description of what the response format is for, used by the model to + * determine how to respond in the format. + */ + description?: string; + + /** + * The name of the response format. Must be a-z, A-Z, 0-9, or contain + * underscores and dashes, with a maximum length of 64. + */ + name: string; + + schema?: ResponseFormatJsonSchemaSchema; + + @doc(""" + Whether to enable strict schema adherence when generating the output. + If set to true, the model will always follow the exact schema defined + in the `schema` field. 
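For illustration only, a value conforming to the ResponseFormatJsonSchema model above might look like this; the name and schema contents are placeholders rather than anything defined by this spec.

const responseFormat = {
  type: "json_schema",
  json_schema: {
    name: "city_summary",               // placeholder response-format name
    description: "Structured summary of a city.",
    schema: {                           // any JSON Schema object is allowed here
      type: "object",
      properties: { name: { type: "string" }, population: { type: "number" } },
    },
    strict: true,
  },
};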
Only a subset of JSON Schema is supported when + `strict` is `true`. To learn more, read the [Structured Outputs + guide](/docs/guides/structured-outputs). + """) + strict?: boolean | null = false; + }; +} + +/** Whether to enable [parallel function calling](/docs/guides/function-calling#configuring-parallel-function-calling) during tool use. */ +scalar ParallelToolCalls extends boolean; + +/** Usage statistics for the completion request. */ +model CompletionUsage { + /** Number of tokens in the generated completion. */ + completion_tokens: int32 = 0; + + /** Number of tokens in the prompt. */ + prompt_tokens: int32 = 0; + + /** Total number of tokens used in the request (prompt + completion). */ + total_tokens: int32 = 0; + + /** Breakdown of tokens used in a completion. */ + completion_tokens_details?: { + /** + * When using Predicted Outputs, the number of tokens in the + * prediction that appeared in the completion. + */ + accepted_prediction_tokens?: int32 = 0; + + /** Audio input tokens generated by the model. */ + audio_tokens?: int32 = 0; + + /** Tokens generated by the model for reasoning. */ + reasoning_tokens?: int32 = 0; + + /** + * When using Predicted Outputs, the number of tokens in the + * prediction that did not appear in the completion. However, like + * reasoning tokens, these tokens are still counted in the total + * completion tokens for purposes of billing, output, and context window + * limits. + */ + rejected_prediction_tokens?: int32 = 0; + }; + + /** Breakdown of tokens used in the prompt. */ + prompt_tokens_details?: { + /** Audio input tokens present in the prompt. */ + audio_tokens?: int32 = 0; + + /** Cached tokens present in the prompt. */ + cached_tokens?: int32 = 0; + }; +} + +@doc(""" + Options for streaming response. Only set this when you set `stream: true`. + """) +model ChatCompletionStreamOptions { + @doc(""" + If set, an additional chunk will be streamed before the `data: [DONE]` + message. The `usage` field on this chunk shows the token usage statistics + for the entire request, and the `choices` field will always be an empty + array. + + All other chunks will also include a `usage` field, but with a null + value. **NOTE:** If the stream is interrupted, you may not receive the + final usage chunk which contains the total token usage for the request. + """) + include_usage?: boolean; +} + +@doc(""" + **o-series models only** + + Constrains effort on reasoning for + [reasoning models](https://platform.openai.com/docs/guides/reasoning). + Currently supported values are `low`, `medium`, and `high`. Reducing + reasoning effort can result in faster responses and fewer tokens used + on reasoning in a response. + """) +union ReasoningEffort { + "low", + "medium", + "high", +} + +// Tool customization: Stub for separated type +alias CreateModelResponseProperties = ModelResponsePropertiesForRequest; + +// Tool customization: Replace imprecise common spread source with split request/response models honoring optionality and nullability +model ModelResponsePropertiesForRequest { + ...MetadataPropertyForRequest; + + @doc(""" + What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. + We generally recommend altering this or `top_p` but not both. 
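A minimal, non-normative example of the request-side properties defined by ModelResponsePropertiesForRequest; the values are arbitrary but stay within the documented ranges.

const requestProperties = {
  metadata: { project: "voicelive-demo" },  // up to 16 short string key/value pairs
  temperature: 0.7,                         // 0 to 2; alter this or top_p, not both
  top_p: 1,
  user: "user_1234",                        // illustrative end-user identifier
  service_tier: "auto",
};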
+ """) + @minValue(0) + @maxValue(2) + temperature?: float32 | null = 1; + + @doc(""" + An alternative to sampling with temperature, called nucleus sampling, + where the model considers the results of the tokens with top_p probability + mass. So 0.1 means only the tokens comprising the top 10% probability mass + are considered. + + We generally recommend altering this or `temperature` but not both. + """) + @minValue(0) + @maxValue(1) + top_p?: float32 | null = 1; + + /** A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices#end-user-ids). */ + user?: string; + + service_tier?: ServiceTier; + + /** An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. */ + @minValue(0) + @maxValue(20) + top_logprobs?: int32; +} +model ModelResponsePropertiesForResponse { + ...MetadataPropertyForResponse; + + @doc(""" + What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. + We generally recommend altering this or `top_p` but not both. + """) + @minValue(0) + @maxValue(2) + temperature: float32 | null; + + @doc(""" + An alternative to sampling with temperature, called nucleus sampling, + where the model considers the results of the tokens with top_p probability + mass. So 0.1 means only the tokens comprising the top 10% probability mass + are considered. + + We generally recommend altering this or `temperature` but not both. + """) + @minValue(0) + @maxValue(1) + top_p: float32 | null; + + /** A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices#end-user-ids). */ + user: string | null; + + service_tier?: ServiceTier; + + /** An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. */ + top_logprobs?: int32 | null; +} + +@doc(""" + Output types that you would like the model to generate. + Most models are capable of generating text, which is the default: + + `["text"]` + + The `gpt-4o-audio-preview` model can also be used to + [generate audio](/docs/guides/audio). To request that this model generate + both text and audio responses, you can use: + + `["text", "audio"]` + """) +model ResponseModalities is ("text" | "audio")[]; + +@doc(""" + Not supported with latest reasoning models `o3` and `o4-mini`. + + Up to 4 sequences where the API will stop generating further tokens. The + returned text will not contain the stop sequence. + """) +union StopConfiguration { + string, + string[], +} + +@doc(""" + High level guidance for the amount of context window space to use for the + search. One of `low`, `medium`, or `high`. `medium` is the default. + """) +union WebSearchContextSize { + "low", + "medium", + "high", +} + +/** Approximate location parameters for the search. */ +model WebSearchLocation { + @doc(""" + The two-letter + [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of the user, + e.g. `US`. + """) + country?: string; + + @doc(""" + Free text input for the region of the user, e.g. `California`. + """) + region?: string; + + @doc(""" + Free text input for the city of the user, e.g. `San Francisco`. 
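Putting the documented examples together, an approximate WebSearchLocation could be expressed as the following object; every value is taken directly from the field descriptions above.

const location = {
  country: "US",
  region: "California",
  city: "San Francisco",
  timezone: "America/Los_Angeles",
};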
+ """) + city?: string; + + @doc(""" + The [IANA timezone](https://timeapi.io/documentation/iana-timezones) + of the user, e.g. `America/Los_Angeles`. + """) + timezone?: string; +} + +@doc(""" + The ranker to use for the file search. If not specified will use the `auto` ranker. + """) +union FileSearchRanker { + "auto", + "default_2024_08_21", +} + +union ModelIdsShared { + string, + "gpt-4.1", + "gpt-4.1-mini", + "gpt-4.1-nano", + "gpt-4.1-2025-04-14", + "gpt-4.1-mini-2025-04-14", + "gpt-4.1-nano-2025-04-14", + "o4-mini", + "o4-mini-2025-04-16", + "o3", + "o3-2025-04-16", + "o3-mini", + "o3-mini-2025-01-31", + "o1", + "o1-2024-12-17", + "o1-preview", + "o1-preview-2024-09-12", + "o1-mini", + "o1-mini-2024-09-12", + "gpt-4o", + "gpt-4o-2024-11-20", + "gpt-4o-2024-08-06", + "gpt-4o-2024-05-13", + "gpt-4o-audio-preview", + "gpt-4o-audio-preview-2024-10-01", + "gpt-4o-audio-preview-2024-12-17", + "gpt-4o-audio-preview-2025-06-03", + "gpt-4o-mini-audio-preview", + "gpt-4o-mini-audio-preview-2024-12-17", + "gpt-4o-search-preview", + "gpt-4o-mini-search-preview", + "gpt-4o-search-preview-2025-03-11", + "gpt-4o-mini-search-preview-2025-03-11", + "chatgpt-4o-latest", + "codex-mini-latest", + "gpt-4o-mini", + "gpt-4o-mini-2024-07-18", + "gpt-4-turbo", + "gpt-4-turbo-2024-04-09", + "gpt-4-0125-preview", + "gpt-4-turbo-preview", + "gpt-4-1106-preview", + "gpt-4-vision-preview", + "gpt-4", + "gpt-4-0314", + "gpt-4-0613", + "gpt-4-32k", + "gpt-4-32k-0314", + "gpt-4-32k-0613", + "gpt-3.5-turbo", + "gpt-3.5-turbo-16k", + "gpt-3.5-turbo-0301", + "gpt-3.5-turbo-0613", + "gpt-3.5-turbo-1106", + "gpt-3.5-turbo-0125", + "gpt-3.5-turbo-16k-0613", +} + +/** A log probability object. */ +model LogProbProperties { + /** The token that was used to generate the log probability. */ + token: string; + + /** The log probability of the token. */ + logprob: float32; + + /** The bytes that were used to generate the log probability. */ + bytes: int32[]; +} + +union VoiceIdsShared { + string, + "alloy", + "ash", + "ballad", + "coral", + "echo", + "fable", + "onyx", + "nova", + "sage", + "shimmer", + "verse", +} + +@doc(""" + Specifies the processing type used for serving the request. + - If set to 'auto', then the request will be processed with the service tier configured in the Project settings. Unless otherwise configured, the Project will use 'default'. + - If set to 'default', then the requset will be processed with the standard pricing and performance for the selected model. + - If set to '[flex](/docs/guides/flex-processing)' or 'priority', then the request will be processed with the corresponding service tier. [Contact sales](https://openai.com/contact-sales) to learn more about Priority processing. + - When not set, the default behavior is 'auto'. + + When the `service_tier` parameter is set, the response body will include the `service_tier` value based on the processing mode actually used to serve the request. This response value may be different from the value set in the parameter. + """) +union ServiceTier { + "auto", + "default", + "flex", + "scale", + "priority", +} diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index a4fe5f78b487..ad7dbf830433 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -3,8 +3,9 @@ * Edits made directly to this file will be lost. 
*/ -import "../audio"; -import "../common"; +import "./client.tsp"; +import "./audio"; +import "./common"; import "./custom.tsp"; using TypeSpec.OpenAPI; diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp index 00f424748b9b..b3d73612cfdc 100644 --- a/specification/ai/data-plane/VoiceLive/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -1,4 +1,4 @@ -import "../common"; +import "./common"; import "./models.tsp"; using TypeSpec.Http; diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp new file mode 100644 index 000000000000..a31276e9099d --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -0,0 +1,20 @@ +import "@typespec/http"; + +using TypeSpec.Http; + +/** The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details. */ +@service({ + title: "OpenAI API", + termsOfService: "https://openai.com/policies/terms-of-use", + contact: { + name: "OpenAI Support", + url: "https://help.openai.com", + }, + license: { + name: "MIT", + url: "https://github.com/openai/openai-openapi/blob/master/LICENSE", + }, +}) +@server("wss://api.openai.com/v1", "OpenAI Endpoint") +@useAuth(BearerAuth) +namespace OpenAI; diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml index b86f7f2f3ea9..d40eca7fd621 100644 --- a/specification/ai/data-plane/VoiceLive/tspconfig.yaml +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -17,9 +17,10 @@ options: "@azure-tools/typespec-python": package-dir: "azure-ai-voicelive" namespace: "azure.ai.voicelive" - generate-test: true - generate-sample: true + generate-test: false + generate-sample: false flavor: azure + package-name: "azure-ai-voicelive" "@azure-tools/typespec-csharp": package-dir: "Azure.AI.VoiceLive" clear-output-folder: true @@ -50,4 +51,4 @@ options: flavor: azure "@azure-tools/typespec-client-generator-cli": additionalDirectories: - - "specification/ai/ai/data-plane/VoiceLive/" \ No newline at end of file + - "specification/ai/data-plane/VoiceLive/" \ No newline at end of file From 9bbe59fedd8714894d9176d5ae3745bfd9b873a0 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 16 Jul 2025 09:56:47 -0700 Subject: [PATCH 03/48] Some updates --- .../ai/data-plane/VoiceLive/audio/client.tsp | 6 +-- .../data-plane/VoiceLive/audio/operations.tsp | 41 ------------------- .../ai/data-plane/VoiceLive/operations.tsp | 12 ------ .../VoiceLive/servers/websocket.tsp | 20 ++++----- 4 files changed, 10 insertions(+), 69 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/audio/client.tsp b/specification/ai/data-plane/VoiceLive/audio/client.tsp index 47b17848e1a0..a8a7a0e40e69 100644 --- a/specification/ai/data-plane/VoiceLive/audio/client.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/client.tsp @@ -4,7 +4,7 @@ import "./models.tsp"; using Azure.ClientGenerator.Core; using OpenAI; -@@visibility(CreateTranscriptionResponseVerboseJson.words, "read"); -@@visibility(CreateTranscriptionResponseVerboseJson.segments, "read"); +// @@visibility(CreateTranscriptionResponseVerboseJson.words, "read"); +// @@visibility(CreateTranscriptionResponseVerboseJson.segments, "read"); -@@visibility(CreateTranslationResponseVerboseJson.segments, "read"); +// @@visibility(CreateTranslationResponseVerboseJson.segments, "read"); diff --git 
a/specification/ai/data-plane/VoiceLive/audio/operations.tsp b/specification/ai/data-plane/VoiceLive/audio/operations.tsp index 95599c938eff..12d9e1058015 100644 --- a/specification/ai/data-plane/VoiceLive/audio/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/operations.tsp @@ -26,45 +26,4 @@ interface Audio { @header contentType: "application/octet-stream"; @body responseBody: bytes; } | SseResponseOf | ErrorResponse; - - @route("transcriptions") - @post - @operationId("createTranscription") - @tag("Audio") - @summary("Transcribes audio into the input language.") - createTranscription( - ...AcceptJsonOrEventStreamHeader, - @header contentType: "multipart/form-data", - @body requestBody: CreateTranscriptionRequest, - ): - | CreateTranscriptionResponseVerboseJson - | CreateTranscriptionResponseJson - | SseResponseOf - | { - // TODO: This response is not defined in the OpenAPI spec. - @header contentType: "text/plain"; - - @body responseBody: string; - } - | ErrorResponse; - - @route("translations") - @post - @operationId("createTranslation") - @tag("Audio") - @summary("Translates audio into English..") - createTranslation( - @header accept: "application/json", - @header contentType: "multipart/form-data", - @body requestBody: CreateTranslationRequest, - ): - | CreateTranslationResponseVerboseJson - | CreateTranslationResponseJson - | { - // TODO: This response is not defined in the OpenAPI spec. - @header contentType: "text/plain"; - - @body responseBody: string; - } - | ErrorResponse; } diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp index b3d73612cfdc..bb9d0edc3bcf 100644 --- a/specification/ai/data-plane/VoiceLive/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -27,18 +27,6 @@ interface VoiceLive { @body request: VoiceLiveSessionCreateRequest, ): VoiceLiveSessionCreateResponse | ErrorResponse; - @post - @route("transcription_sessions") - @operationId("create-voicelive-transcription-session") - @summary(""" - Create an ephemeral API token for use in client-side applications with the VoiceLive API specifically for voicelive transcriptions. Can be configured with the same session parameters as the transcription_session.update client event. - - It responds with a session object, plus a client_secret key which contains a usable ephemeral API token that can be used to authenticate browser clients for the VoiceLive API. - """) - createEphemeralTranscriptionToken( - @body request: VoiceLiveTranscriptionSessionCreateRequest, - ): VoiceLiveTranscriptionSessionCreateResponse | ErrorResponse; -} alias VoiceLiveBetaHeader = { @header("OpenAI-Beta") openAIBeta: "voicelive=v1"; diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index a31276e9099d..c73cfeafbeff 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -1,20 +1,14 @@ import "@typespec/http"; - +import "../models.tsp"; using TypeSpec.Http; -/** The OpenAI REST API. Please see https://platform.openai.com/docs/api-reference for more details. 
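Because this file exposes the service over a WebSocket server with bearer authentication (see the @server and @useAuth declarations in this file), a minimal client sketch might look like the following; it assumes the Node ws package and an OPENAI_API_KEY environment variable, neither of which is defined by this spec.

import WebSocket from "ws";

// Endpoint and auth scheme mirror the @server("wss://api.openai.com/v1", ...) and
// @useAuth(BearerAuth) declarations; the key handling here is illustrative only.
const socket = new WebSocket("wss://api.openai.com/v1", {
  headers: { Authorization: `Bearer ${process.env.OPENAI_API_KEY}` },
});

socket.on("open", () => {
  // e.g. send a session.update payload once the connection is established
});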
*/ -@service({ - title: "OpenAI API", - termsOfService: "https://openai.com/policies/terms-of-use", - contact: { - name: "OpenAI Support", - url: "https://help.openai.com", - }, - license: { - name: "MIT", - url: "https://github.com/openai/openai-openapi/blob/master/LICENSE", - }, +@service(#{ + title: "OpenAI API" + }) @server("wss://api.openai.com/v1", "OpenAI Endpoint") + + @useAuth(BearerAuth) namespace OpenAI; +op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventSessionUpdated ; From 78cdb2f606fa3eded5542ec83250ecdfb0836b8e Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 16 Jul 2025 11:14:55 -0700 Subject: [PATCH 04/48] remove some redundant models --- .../ai/data-plane/VoiceLive/audio/custom.tsp | 22 - .../ai/data-plane/VoiceLive/audio/models.tsp | 470 +----------------- .../ai/data-plane/VoiceLive/common/models.tsp | 431 +--------------- 3 files changed, 18 insertions(+), 905 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/audio/custom.tsp b/specification/ai/data-plane/VoiceLive/audio/custom.tsp index 3dc9b2d92731..5ad1d3a2bec6 100644 --- a/specification/ai/data-plane/VoiceLive/audio/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/custom.tsp @@ -1,23 +1 @@ import "./models.tsp"; - -using TypeSpec.OpenAPI; - -namespace OpenAI; - -union TranscriptionAudioResponseFormat { - AudioResponseFormat, -} - -union TranslationAudioResponseFormat { - AudioResponseFormat, -} - -union TranscriptTextUsageType { - tokens: "tokens", - duration: "duration", -} - -@discriminator("type") -model TranscriptTextUsage { - type: TranscriptTextUsageType; -} diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp index b6a3c9a4db02..5c5e281751e1 100644 --- a/specification/ai/data-plane/VoiceLive/audio/models.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/models.tsp @@ -1,7 +1,5 @@ -/* - * This file was automatically generated from an OpenAPI .yaml file. - * Edits made directly to this file will be lost. - */ +// Cleaned TypeSpec file aligned with Python model definitions +// Removed models not found in Python code and adjusted field shapes to match Python baseline import "../common"; import "./custom.tsp"; @@ -10,45 +8,9 @@ using TypeSpec.OpenAPI; namespace OpenAI; -// Tool generated type. Extracts from CreateTranscriptionResponseJson.logprobs -alias CreateTranscriptionResponseJsonLogprob = { - /** The token in the transcription. */ - token?: string; - - /** The log probability of the token. */ - logprob?: float32; - - /** The bytes of the token. */ - bytes?: float32[]; -}; - -// Tool generated type. Extracts from TranscriptTextDeltaEvent.logprobs -alias TranscriptTextDeltaEventLogprob = { - /** The token that was used to generate the log probability. */ - token?: string; - - /** The log probability of the token. */ - logprob?: float32; - - /** The bytes that were used to generate the log probability. */ - bytes?: int32[]; -}; - -// Tool generated type. Extracts from TranscriptTextDoneEvent.logprobs -alias TranscriptTextDoneEventLogprob = { - /** The token that was used to generate the log probability. */ - token?: string; - - /** The log probability of the token. */ - logprob?: float32; - - /** The bytes that were used to generate the log probability. */ - bytes?: int32[]; -}; - @doc(""" The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. 
For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, the only supported format is `json`. - """) +""") union AudioResponseFormat { "json", "text", @@ -57,397 +19,15 @@ union AudioResponseFormat { "vtt", } -model CreateSpeechRequest { - @doc(""" - One of the available [TTS models](/docs/models#tts): `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. - """) - @extension("x-oaiTypeLabel", "string") - `model`: string | "tts-1" | "tts-1-hd" | "gpt-4o-mini-tts"; - - /** The text to generate audio for. The maximum length is 4096 characters. */ - @maxLength(4096) - input: string; - - @doc(""" - Control the voice of your generated audio with additional instructions. Does not work with `tts-1` or `tts-1-hd`. - """) - @maxLength(4096) - instructions?: string; - - @doc(""" - The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`. Previews of the voices are available in the [Text to speech guide](/docs/guides/text-to-speech#voice-options). - """) - voice: VoiceIdsShared; - - @doc(""" - The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. - """) - response_format?: "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm" = "mp3"; - - @doc(""" - The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. - """) - @minValue(0.25) - @maxValue(4) - speed?: float32 = 1; - - @doc(""" - The format to stream the audio in. Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or `tts-1-hd`. - """) - stream_format?: "sse" | "audio" = "audio"; -} - -// Tool customization: Convert to discriminated type base -union CreateSpeechResponseStreamEventType { - speech_audio_delta: "speech.audio.delta", - speech_audio_done: "speech.audio.done", -} -@discriminator("type") -model CreateSpeechResponseStreamEvent { - type: CreateSpeechResponseStreamEventType; -} - -// Tool customization (apply_discriminator): Apply discriminated type base -/** Emitted for each chunk of audio data generated during speech synthesis. */ -model SpeechAudioDeltaEvent extends CreateSpeechResponseStreamEvent { - @doc(""" - The type of the event. Always `speech.audio.delta`. - """) - type: CreateSpeechResponseStreamEventType.speech_audio_delta; - - // Tool customization: base64 input uses an encoded bytes type - /** A chunk of Base64-encoded audio data. */ - @encode("base64", string) - audio: bytes; -} - -// Tool customization (apply_discriminator): Apply discriminated type base -/** Emitted when the speech synthesis is complete and all audio has been streamed. */ -model SpeechAudioDoneEvent extends CreateSpeechResponseStreamEvent { - @doc(""" - The type of the event. Always `speech.audio.done`. - """) - type: CreateSpeechResponseStreamEventType.speech_audio_done; - - /** Token usage statistics for the request. */ - usage: { - /** Number of input tokens in the prompt. */ - input_tokens: int32; - - /** Number of output tokens generated. */ - output_tokens: int32; - - /** Total number of tokens used (input + output). */ - total_tokens: int32; - }; -} - -model CreateTranscriptionRequest { - /** The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. */ - @extension("x-oaiTypeLabel", "file") - file: bytes; - - @doc(""" - ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model). 
- """) - @extension("x-oaiTypeLabel", "string") - `model`: - | string - | "whisper-1" - | "gpt-4o-transcribe" - | "gpt-4o-mini-transcribe"; - - @doc(""" - The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format will improve accuracy and latency. - """) - language?: string; - - /** An optional text to guide the model's style or continue a previous audio segment. The [prompt](/docs/guides/speech-to-text#prompting) should match the audio language. */ - prompt?: string; - - // Tool customization: use scenario-specific composed union - response_format?: TranscriptionAudioResponseFormat = "json"; - - // Tool customization: add missing but documented min/max for temperature - /** The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. */ - @minValue(0) - @maxValue(1) - temperature?: float32 = 0; - - @doc(""" - Additional information to include in the transcription response. - `logprobs` will return the log probabilities of the tokens in the - response to understand the model's confidence in the transcription. - `logprobs` only works with response_format set to `json` and only with - the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`. - """) - `include[]`?: TranscriptionInclude[]; - - @doc(""" - The timestamp granularities to populate for this transcription. `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency. - """) - `timestamp_granularities[]`?: ("word" | "segment")[] = #["segment"]; - - @doc(""" - If set to true, the model response data will be streamed to the client - as it is generated using [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). - See the [Streaming section of the Speech-to-Text guide](/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) - for more information. - - Note: Streaming is not supported for the `whisper-1` model and will be ignored. - """) - stream?: boolean | null = false; - - @doc(""" - Controls how the audio is cut into chunks. When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. - """) - @extension("x-oaiTypeLabel", "string") - chunking_strategy?: VadConfig | null; -} - -model CreateTranslationRequest { - /** The audio file object (not file name) translate, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. */ - @extension("x-oaiTypeLabel", "file") - file: bytes; - - @doc(""" - ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available. - """) - @extension("x-oaiTypeLabel", "string") - `model`: string | "whisper-1"; - - /** An optional text to guide the model's style or continue a previous audio segment. The [prompt](/docs/guides/speech-to-text#prompting) should be in English. 
*/ - prompt?: string; - - // Tool customization: use scenario-specific composed union - @doc(""" - The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. - """) - response_format?: TranslationAudioResponseFormat = "json"; - - // Tool customization: add missing but documented min/max for temperature - /** The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. If set to 0, the model will use [log probability](https://en.wikipedia.org/wiki/Log_probability) to automatically increase the temperature until certain thresholds are hit. */ - @minValue(0) - @maxValue(1) - temperature?: float32 = 0; -} - -/** Represents a transcription response returned by model, based on the provided input. */ -model CreateTranscriptionResponseJson { - /** The transcribed text. */ - text: string; - - @doc(""" - The log probabilities of the tokens in the transcription. Only returned with the models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added to the `include` array. - """) - logprobs?: CreateTranscriptionResponseJsonLogprob[]; - - // Tool customization: Substitute common discriminated type base - /** Token usage statistics for the request. */ - usage?: TranscriptTextUsage; -} - -// Tool customization (apply_discriminator): Apply discriminated base -/** Usage statistics for models billed by audio input duration. */ -model TranscriptTextUsageDuration extends TranscriptTextUsage { - @doc(""" - The type of the usage object. Always `duration` for this variant. - """) - type: TranscriptTextUsageType.duration; - - // Tool customization: numeric timespans are encoded durations - /** Duration of the input audio in seconds. */ - @encode("seconds", float32) - seconds: duration; -} - -// Tool customization (apply_discriminator): Apply discriminated base -/** Usage statistics for models billed by token usage. */ -model TranscriptTextUsageTokens extends TranscriptTextUsage { - @doc(""" - The type of the usage object. Always `tokens` for this variant. - """) - type: TranscriptTextUsageType.tokens; - - /** Number of input tokens billed for this request. */ - input_tokens: int32; - - /** Details about the input tokens billed for this request. */ - input_token_details?: { - /** Number of text tokens billed for this request. */ - text_tokens?: int32; - - /** Number of audio tokens billed for this request. */ - audio_tokens?: int32; - }; - - /** Number of output tokens generated. */ - output_tokens: int32; - - /** Total number of tokens used (input + output). */ - total_tokens: int32; -} - -// Tool customization: Add a missing 'task' field, present on the wire but not in the spec -/** Represents a verbose json transcription response returned by model, based on the provided input. */ -model CreateTranscriptionResponseVerboseJson { - /** The task label. */ - task: "transcribe"; - - /** The language of the input audio. */ - language: string; - - // Tool customization: improve representation of float duration - /** The duration of the input audio. */ - @encode("seconds", float32) - duration: duration; - - /** The transcribed text. */ - text: string; - - /** Extracted words and their corresponding timestamps. */ - words?: TranscriptionWord[]; - - /** Segments of the transcribed text and their corresponding details. 
*/ - segments?: TranscriptionSegment[]; - - // Tool customization: Substitute common discriminated type base (underspecification of non-verbose union parity assumed) - usage?: TranscriptTextUsage; -} - -model CreateTranslationResponseJson { - text: string; -} - -// Tool customization: Add a missing 'task' field, present on the wire but not in the spec -model CreateTranslationResponseVerboseJson { - /** The task label. */ - task: "translate"; - - @doc(""" - The language of the output translation (always `english`). - """) - language: string; - - // Tool customization: improve representation of float duration - /** The duration of the input audio. */ - @encode("seconds", float32) - duration: duration; - - /** The translated text. */ - text: string; - - /** Segments of the translated text and their corresponding details. */ - segments?: TranscriptionSegment[]; -} - -// Tool customization: Establish discriminated type hierarchy for transcription stream events -union CreateTranscriptionResponseStreamEventType { - string, - transcript_text_delta: "transcript.text.delta", - transcript_text_done: "transcript.text.done", -} -@discriminator("type") -model CreateTranscriptionResponseStreamEvent { - type: CreateTranscriptionResponseStreamEventType; -} - -model TranscriptionSegment { - /** Unique identifier of the segment. */ - id: int32; - - /** Seek offset of the segment. */ - seek: int32; - - // Tool customization: numeric timespans are encoded durations - /** Start time of the segment in seconds. */ - @encode("seconds", float32) - start: duration; - - // Tool customization: numeric timespans are encoded durations - /** End time of the segment in seconds. */ - @encode("seconds", float32) - end: duration; - - /** Text content of the segment. */ - text: string; - - /** Array of token IDs for the text content. */ - tokens: int32[]; - - /** Temperature parameter used for generating the segment. */ - temperature: float32; - - /** Average logprob of the segment. If the value is lower than -1, consider the logprobs failed. */ - avg_logprob: float32; - - /** Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed. */ - compression_ratio: float32; - - @doc(""" - Probability of no speech in the segment. If the value is higher than 1.0 and the `avg_logprob` is below -1, consider this segment silent. - """) - no_speech_prob: float32; -} - -model TranscriptionWord { - /** The text content of the word. */ - word: string; - - // Tool customization: numeric timespans are encoded durations - /** Start time of the word in seconds. */ - @encode("seconds", float32) - start: duration; - - // Tool customization: numeric timespans are encoded durations - /** End time of the word in seconds. */ - @encode("seconds", float32) - end: duration; -} - -// Tool customization (apply_discriminator): Apply discriminated type base for transcription stream events -@doc(""" - Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`. - """) -model TranscriptTextDeltaEvent extends CreateTranscriptionResponseStreamEvent { - @doc(""" - The type of the event. Always `transcript.text.delta`. - """) - type: CreateTranscriptionResponseStreamEventType.transcript_text_delta; - - /** The text delta that was additionally transcribed. 
*/ - delta: string; - +model VadConfig { @doc(""" - The log probabilities of the delta. Only included if you [create a transcription](/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. - """) - logprobs?: TranscriptTextDeltaEventLogprob[]; -} - -// Tool customization (apply_discriminator): Apply discriminated type base for transcription stream events -@doc(""" - Emitted when the transcription is complete. Contains the complete transcription text. Only emitted when you [create a transcription](/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`. + Must be set to `server_vad` to enable manual chunking using server side VAD. """) -model TranscriptTextDoneEvent extends CreateTranscriptionResponseStreamEvent { - @doc(""" - The type of the event. Always `transcript.text.done`. - """) - type: CreateTranscriptionResponseStreamEventType.transcript_text_done; - - /** The text that was transcribed. */ - text: string; - - @doc(""" - The log probabilities of the individual tokens in the transcription. Only included if you [create a transcription](/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. - """) - logprobs?: TranscriptTextDoneEventLogprob[]; - - // Tool customization: Substitute common discriminated type base (underspecification of non-verbose/non-streaming union parity assumed) - usage?: TranscriptTextUsage; -} + type: "server_vad"; -union TranscriptionInclude { - "logprobs", + prefix_padding_ms?: int32 = 300; + silence_duration_ms?: int32 = 200; + threshold?: float32 = 0.5; } @doc(""" @@ -455,35 +35,15 @@ union TranscriptionInclude { server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. - """) +""") union TranscriptionChunkingStrategy { "auto", VadConfig, } -model VadConfig { - @doc(""" - Must be set to `server_vad` to enable manual chunking using server side VAD. - """) - type: "server_vad"; - - /** - * Amount of audio to include before the VAD detected speech (in - * milliseconds). - */ - prefix_padding_ms?: int32 = 300; - - /** - * Duration of silence to detect speech stop (in milliseconds). - * With shorter values the model will respond more quickly, - * but may jump in on short pauses from the user. - */ - silence_duration_ms?: int32 = 200; - - /** - * Sensitivity threshold (0.0 to 1.0) for voice activity detection. A - * higher threshold will require louder audio to activate the model, and - * thus might perform better in noisy environments. - */ - threshold?: float32 = 0.5; +union TranscriptionInclude { + "logprobs", } + +// Other models removed because they do not correspond to Python models or are redundant +// Please re-add as needed with proper alignment to the source Python definitions \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 60b18b5a2ef4..c905f45ef099 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -1,7 +1,5 @@ -/* - * This file was automatically generated from an OpenAPI .yaml file. - * Edits made directly to this file will be lost. 
- */ +// Cleaned TypeSpec file aligned with Python model definitions +// Removed models not defined or needed based on your Python code baseline import "./custom.tsp"; @@ -16,432 +14,21 @@ model Error { type: string; } -// Tool customization: apply error decorator @error model ErrorResponse { error: Error; } -// Tool customization: Wrap for reuse -@doc(""" - The parameters the functions accepts, described as a JSON Schema object. See the [guide](/docs/guides/function-calling) for examples, and the [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for documentation about the format. - - Omitting `parameters` defines a function with an empty parameter list. - """) -model FunctionParametersCommon { - /** - * The parameters the functions accepts, described as a JSON Schema object. See the [guide](/docs/guides/function-calling) for examples, and the [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for documentation about the format. - * - * Omitting `parameters` defines a function with an empty parameter list. - */ - parameters?: unknown; -} - -model FunctionObject { - /** A description of what the function does, used by the model to choose when and how to call the function. */ - description?: string; - - /** The name of the function to be called. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64. */ - name: string; - - ...FunctionParametersCommon; - - @doc(""" - Whether to enable strict schema adherence when generating the function call. If set to true, the model will follow the exact schema defined in the `parameters` field. Only a subset of JSON Schema is supported when `strict` is `true`. Learn more about Structured Outputs in the [function calling guide](docs/guides/function-calling). - """) - strict?: boolean | null = false; -} - -// Tool customization: Wrap for reuse -/** - * Set of 16 key-value pairs that can be attached to an object. This can be - * useful for storing additional information about the object in a structured - * format, and querying for objects via API or the dashboard. - * - * Keys are strings with a maximum length of 64 characters. Values are strings - * with a maximum length of 512 characters. - */ model MetadataPropertyForRequest { - /** - * Set of 16 key-value pairs that can be attached to an object. This can be - * useful for storing additional information about the object in a structured - * format, and querying for objects via API or the dashboard. - * - * Keys are strings with a maximum length of 64 characters. Values are strings - * with a maximum length of 512 characters. - */ @extension("x-oaiTypeLabel", "map") metadata?: Record; } + model MetadataPropertyForResponse { - /** - * Set of 16 key-value pairs that can be attached to an object. This can be - * useful for storing additional information about the object in a structured - * format, and querying for objects via API or the dashboard. - * - * Keys are strings with a maximum length of 64 characters. Values are strings - * with a maximum length of 512 characters. - */ @extension("x-oaiTypeLabel", "map") metadata: Record | null; } -// Tool customization (apply_discriminator): establish a common, discriminated union -/** Default response format. Used to generate text responses. */ -model ResponseFormatText extends ResponseFormat { - @doc(""" - The type of response format being defined. Always `text`. 
- """) - type: "text"; -} - -// Tool customization (apply_discriminator): establish a common, discriminated union -@doc(""" - JSON object response format. An older method of generating JSON responses. - Using `json_schema` is recommended for models that support it. Note that the - model will not generate JSON without a system or user message instructing it - to do so. - """) -model ResponseFormatJsonObject extends ResponseFormat { - @doc(""" - The type of response format being defined. Always `json_object`. - """) - type: "json_object"; -} - -/** - * The schema for the response format, described as a JSON Schema object. - * Learn how to build JSON schemas [here](https://json-schema.org/). - */ -model ResponseFormatJsonSchemaSchema is Record; - -// Tool customization (apply_discriminator): establish a common, discriminated union -/** - * JSON Schema response format. Used to generate structured JSON responses. - * Learn more about [Structured Outputs](/docs/guides/structured-outputs). - */ -model ResponseFormatJsonSchema extends ResponseFormat { - @doc(""" - The type of response format being defined. Always `json_schema`. - """) - type: "json_schema"; - - /** Structured Outputs configuration options, including a JSON Schema. */ - json_schema: { - /** - * A description of what the response format is for, used by the model to - * determine how to respond in the format. - */ - description?: string; - - /** - * The name of the response format. Must be a-z, A-Z, 0-9, or contain - * underscores and dashes, with a maximum length of 64. - */ - name: string; - - schema?: ResponseFormatJsonSchemaSchema; - - @doc(""" - Whether to enable strict schema adherence when generating the output. - If set to true, the model will always follow the exact schema defined - in the `schema` field. Only a subset of JSON Schema is supported when - `strict` is `true`. To learn more, read the [Structured Outputs - guide](/docs/guides/structured-outputs). - """) - strict?: boolean | null = false; - }; -} - -/** Whether to enable [parallel function calling](/docs/guides/function-calling#configuring-parallel-function-calling) during tool use. */ -scalar ParallelToolCalls extends boolean; - -/** Usage statistics for the completion request. */ -model CompletionUsage { - /** Number of tokens in the generated completion. */ - completion_tokens: int32 = 0; - - /** Number of tokens in the prompt. */ - prompt_tokens: int32 = 0; - - /** Total number of tokens used in the request (prompt + completion). */ - total_tokens: int32 = 0; - - /** Breakdown of tokens used in a completion. */ - completion_tokens_details?: { - /** - * When using Predicted Outputs, the number of tokens in the - * prediction that appeared in the completion. - */ - accepted_prediction_tokens?: int32 = 0; - - /** Audio input tokens generated by the model. */ - audio_tokens?: int32 = 0; - - /** Tokens generated by the model for reasoning. */ - reasoning_tokens?: int32 = 0; - - /** - * When using Predicted Outputs, the number of tokens in the - * prediction that did not appear in the completion. However, like - * reasoning tokens, these tokens are still counted in the total - * completion tokens for purposes of billing, output, and context window - * limits. - */ - rejected_prediction_tokens?: int32 = 0; - }; - - /** Breakdown of tokens used in the prompt. */ - prompt_tokens_details?: { - /** Audio input tokens present in the prompt. */ - audio_tokens?: int32 = 0; - - /** Cached tokens present in the prompt. 
*/ - cached_tokens?: int32 = 0; - }; -} - -@doc(""" - Options for streaming response. Only set this when you set `stream: true`. - """) -model ChatCompletionStreamOptions { - @doc(""" - If set, an additional chunk will be streamed before the `data: [DONE]` - message. The `usage` field on this chunk shows the token usage statistics - for the entire request, and the `choices` field will always be an empty - array. - - All other chunks will also include a `usage` field, but with a null - value. **NOTE:** If the stream is interrupted, you may not receive the - final usage chunk which contains the total token usage for the request. - """) - include_usage?: boolean; -} - -@doc(""" - **o-series models only** - - Constrains effort on reasoning for - [reasoning models](https://platform.openai.com/docs/guides/reasoning). - Currently supported values are `low`, `medium`, and `high`. Reducing - reasoning effort can result in faster responses and fewer tokens used - on reasoning in a response. - """) -union ReasoningEffort { - "low", - "medium", - "high", -} - -// Tool customization: Stub for separated type -alias CreateModelResponseProperties = ModelResponsePropertiesForRequest; - -// Tool customization: Replace imprecise common spread source with split request/response models honoring optionality and nullability -model ModelResponsePropertiesForRequest { - ...MetadataPropertyForRequest; - - @doc(""" - What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. - We generally recommend altering this or `top_p` but not both. - """) - @minValue(0) - @maxValue(2) - temperature?: float32 | null = 1; - - @doc(""" - An alternative to sampling with temperature, called nucleus sampling, - where the model considers the results of the tokens with top_p probability - mass. So 0.1 means only the tokens comprising the top 10% probability mass - are considered. - - We generally recommend altering this or `temperature` but not both. - """) - @minValue(0) - @maxValue(1) - top_p?: float32 | null = 1; - - /** A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices#end-user-ids). */ - user?: string; - - service_tier?: ServiceTier; - - /** An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. */ - @minValue(0) - @maxValue(20) - top_logprobs?: int32; -} -model ModelResponsePropertiesForResponse { - ...MetadataPropertyForResponse; - - @doc(""" - What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. - We generally recommend altering this or `top_p` but not both. - """) - @minValue(0) - @maxValue(2) - temperature: float32 | null; - - @doc(""" - An alternative to sampling with temperature, called nucleus sampling, - where the model considers the results of the tokens with top_p probability - mass. So 0.1 means only the tokens comprising the top 10% probability mass - are considered. - - We generally recommend altering this or `temperature` but not both. - """) - @minValue(0) - @maxValue(1) - top_p: float32 | null; - - /** A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse. [Learn more](/docs/guides/safety-best-practices#end-user-ids). 
*/ - user: string | null; - - service_tier?: ServiceTier; - - /** An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. */ - top_logprobs?: int32 | null; -} - -@doc(""" - Output types that you would like the model to generate. - Most models are capable of generating text, which is the default: - - `["text"]` - - The `gpt-4o-audio-preview` model can also be used to - [generate audio](/docs/guides/audio). To request that this model generate - both text and audio responses, you can use: - - `["text", "audio"]` - """) -model ResponseModalities is ("text" | "audio")[]; - -@doc(""" - Not supported with latest reasoning models `o3` and `o4-mini`. - - Up to 4 sequences where the API will stop generating further tokens. The - returned text will not contain the stop sequence. - """) -union StopConfiguration { - string, - string[], -} - -@doc(""" - High level guidance for the amount of context window space to use for the - search. One of `low`, `medium`, or `high`. `medium` is the default. - """) -union WebSearchContextSize { - "low", - "medium", - "high", -} - -/** Approximate location parameters for the search. */ -model WebSearchLocation { - @doc(""" - The two-letter - [ISO country code](https://en.wikipedia.org/wiki/ISO_3166-1) of the user, - e.g. `US`. - """) - country?: string; - - @doc(""" - Free text input for the region of the user, e.g. `California`. - """) - region?: string; - - @doc(""" - Free text input for the city of the user, e.g. `San Francisco`. - """) - city?: string; - - @doc(""" - The [IANA timezone](https://timeapi.io/documentation/iana-timezones) - of the user, e.g. `America/Los_Angeles`. - """) - timezone?: string; -} - -@doc(""" - The ranker to use for the file search. If not specified will use the `auto` ranker. - """) -union FileSearchRanker { - "auto", - "default_2024_08_21", -} - -union ModelIdsShared { - string, - "gpt-4.1", - "gpt-4.1-mini", - "gpt-4.1-nano", - "gpt-4.1-2025-04-14", - "gpt-4.1-mini-2025-04-14", - "gpt-4.1-nano-2025-04-14", - "o4-mini", - "o4-mini-2025-04-16", - "o3", - "o3-2025-04-16", - "o3-mini", - "o3-mini-2025-01-31", - "o1", - "o1-2024-12-17", - "o1-preview", - "o1-preview-2024-09-12", - "o1-mini", - "o1-mini-2024-09-12", - "gpt-4o", - "gpt-4o-2024-11-20", - "gpt-4o-2024-08-06", - "gpt-4o-2024-05-13", - "gpt-4o-audio-preview", - "gpt-4o-audio-preview-2024-10-01", - "gpt-4o-audio-preview-2024-12-17", - "gpt-4o-audio-preview-2025-06-03", - "gpt-4o-mini-audio-preview", - "gpt-4o-mini-audio-preview-2024-12-17", - "gpt-4o-search-preview", - "gpt-4o-mini-search-preview", - "gpt-4o-search-preview-2025-03-11", - "gpt-4o-mini-search-preview-2025-03-11", - "chatgpt-4o-latest", - "codex-mini-latest", - "gpt-4o-mini", - "gpt-4o-mini-2024-07-18", - "gpt-4-turbo", - "gpt-4-turbo-2024-04-09", - "gpt-4-0125-preview", - "gpt-4-turbo-preview", - "gpt-4-1106-preview", - "gpt-4-vision-preview", - "gpt-4", - "gpt-4-0314", - "gpt-4-0613", - "gpt-4-32k", - "gpt-4-32k-0314", - "gpt-4-32k-0613", - "gpt-3.5-turbo", - "gpt-3.5-turbo-16k", - "gpt-3.5-turbo-0301", - "gpt-3.5-turbo-0613", - "gpt-3.5-turbo-1106", - "gpt-3.5-turbo-0125", - "gpt-3.5-turbo-16k-0613", -} - -/** A log probability object. */ -model LogProbProperties { - /** The token that was used to generate the log probability. */ - token: string; - - /** The log probability of the token. */ - logprob: float32; - - /** The bytes that were used to generate the log probability. 
*/ - bytes: int32[]; -} - union VoiceIdsShared { string, "alloy", @@ -449,23 +36,11 @@ union VoiceIdsShared { "ballad", "coral", "echo", - "fable", - "onyx", - "nova", "sage", "shimmer", "verse", } -@doc(""" - Specifies the processing type used for serving the request. - - If set to 'auto', then the request will be processed with the service tier configured in the Project settings. Unless otherwise configured, the Project will use 'default'. - - If set to 'default', then the requset will be processed with the standard pricing and performance for the selected model. - - If set to '[flex](/docs/guides/flex-processing)' or 'priority', then the request will be processed with the corresponding service tier. [Contact sales](https://openai.com/contact-sales) to learn more about Priority processing. - - When not set, the default behavior is 'auto'. - - When the `service_tier` parameter is set, the response body will include the `service_tier` value based on the processing mode actually used to serve the request. This response value may be different from the value set in the parameter. - """) union ServiceTier { "auto", "default", From deab4edd91e55ab7438a663411f8862cec03dcca Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 16 Jul 2025 13:26:38 -0700 Subject: [PATCH 05/48] Change to voice live --- specification/ai/data-plane/VoiceLive/audio/client.tsp | 2 +- specification/ai/data-plane/VoiceLive/audio/custom.tsp | 2 +- specification/ai/data-plane/VoiceLive/audio/models.tsp | 2 +- .../ai/data-plane/VoiceLive/audio/operations.tsp | 2 +- specification/ai/data-plane/VoiceLive/common/custom.tsp | 4 ++-- specification/ai/data-plane/VoiceLive/common/models.tsp | 2 +- specification/ai/data-plane/VoiceLive/custom.tsp | 2 +- .../ai/data-plane/VoiceLive/custom/content_parts.tsp | 2 +- specification/ai/data-plane/VoiceLive/custom/events.tsp | 2 +- specification/ai/data-plane/VoiceLive/custom/items.tsp | 2 +- specification/ai/data-plane/VoiceLive/custom/tools.tsp | 2 +- specification/ai/data-plane/VoiceLive/models.tsp | 2 +- specification/ai/data-plane/VoiceLive/operations.tsp | 4 ++-- .../ai/data-plane/VoiceLive/servers/websocket.tsp | 8 ++++---- 14 files changed, 19 insertions(+), 19 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/audio/client.tsp b/specification/ai/data-plane/VoiceLive/audio/client.tsp index a8a7a0e40e69..2c877169e4c4 100644 --- a/specification/ai/data-plane/VoiceLive/audio/client.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/client.tsp @@ -2,7 +2,7 @@ import "@azure-tools/typespec-client-generator-core"; import "./models.tsp"; using Azure.ClientGenerator.Core; -using OpenAI; +using VoiceLive; // @@visibility(CreateTranscriptionResponseVerboseJson.words, "read"); // @@visibility(CreateTranscriptionResponseVerboseJson.segments, "read"); diff --git a/specification/ai/data-plane/VoiceLive/audio/custom.tsp b/specification/ai/data-plane/VoiceLive/audio/custom.tsp index 3dc9b2d92731..70edd231bee2 100644 --- a/specification/ai/data-plane/VoiceLive/audio/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/custom.tsp @@ -2,7 +2,7 @@ import "./models.tsp"; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; union TranscriptionAudioResponseFormat { AudioResponseFormat, diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp index b6a3c9a4db02..83358fbf4813 100644 --- a/specification/ai/data-plane/VoiceLive/audio/models.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/models.tsp @@ 
-8,7 +8,7 @@ import "./custom.tsp"; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; // Tool generated type. Extracts from CreateTranscriptionResponseJson.logprobs alias CreateTranscriptionResponseJsonLogprob = { diff --git a/specification/ai/data-plane/VoiceLive/audio/operations.tsp b/specification/ai/data-plane/VoiceLive/audio/operations.tsp index 12d9e1058015..c63820c360cd 100644 --- a/specification/ai/data-plane/VoiceLive/audio/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/operations.tsp @@ -7,7 +7,7 @@ import "./models.tsp"; using TypeSpec.Http; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; @route("/audio") interface Audio { diff --git a/specification/ai/data-plane/VoiceLive/common/custom.tsp b/specification/ai/data-plane/VoiceLive/common/custom.tsp index 9ee4b1fdadb9..b1e3503e5d43 100644 --- a/specification/ai/data-plane/VoiceLive/common/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/common/custom.tsp @@ -4,7 +4,7 @@ import "@typespec/openapi"; using TypeSpec.Http; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; @discriminator("type") model ResponseFormat { @@ -20,7 +20,7 @@ alias AcceptJsonOrEventStreamHeader = { }; alias AssistantsBetaHeader = { - @header("OpenAI-Beta") openAIBeta: "assistants=v2"; + @header("VoiceLive-Beta") voiceLiveBeta: "assistants=v2"; }; alias PageLimitQueryParameter = { diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 60b18b5a2ef4..421613912de4 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -7,7 +7,7 @@ import "./custom.tsp"; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; model Error { code: string | null; diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index faa8785ccd5b..985f2035aabf 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -4,7 +4,7 @@ import "./custom/tools.tsp"; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; model VoiceLiveRequestSession { ...VoiceLiveSessionBase; diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp index 258f083003b3..4aa2e14c6724 100644 --- a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -1,6 +1,6 @@ using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; union VoiceLiveContentPartType { string, diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp index 0f7d5d5221f0..4fccd5668c4e 100644 --- a/specification/ai/data-plane/VoiceLive/custom/events.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -1,6 +1,6 @@ using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; union VoiceLiveClientEventType { string, diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index 32f6bb432b7d..9cdc0d9cbdb6 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -2,7 +2,7 @@ import "./content_parts.tsp"; using TypeSpec.OpenAPI; -namespace OpenAI; +namespace VoiceLive; @discriminator("type") model 
VoiceLiveConversationRequestItem {
diff --git a/specification/ai/data-plane/VoiceLive/custom/tools.tsp b/specification/ai/data-plane/VoiceLive/custom/tools.tsp
index aa592aaf0c06..d7e24cd8f1ee 100644
--- a/specification/ai/data-plane/VoiceLive/custom/tools.tsp
+++ b/specification/ai/data-plane/VoiceLive/custom/tools.tsp
@@ -1,6 +1,6 @@
 using TypeSpec.OpenAPI;
 
-namespace OpenAI;
+namespace VoiceLive;
 
 /**
  * The supported tool type discriminators for voicelive tools.
diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp
index ad7dbf830433..7774fe5d69ca 100644
--- a/specification/ai/data-plane/VoiceLive/models.tsp
+++ b/specification/ai/data-plane/VoiceLive/models.tsp
@@ -10,7 +10,7 @@ import "./custom.tsp";
 
 using TypeSpec.OpenAPI;
 
-namespace OpenAI;
+namespace VoiceLive;
 
 // Tool generated type. Extracts from VoiceLiveConversationItemWithReference.content
 alias VoiceLiveConversationItemWithReferenceContent = {
diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp
index bb9d0edc3bcf..366fb68cf779 100644
--- a/specification/ai/data-plane/VoiceLive/operations.tsp
+++ b/specification/ai/data-plane/VoiceLive/operations.tsp
@@ -4,7 +4,7 @@ import "./models.tsp";
 using TypeSpec.Http;
 using TypeSpec.OpenAPI;
 
-namespace OpenAI;
+namespace VoiceLive;
 
 @route("voicelive")
 @tag("VoiceLive")
@@ -29,5 +29,5 @@ interface VoiceLive {
 
 alias VoiceLiveBetaHeader = {
-  @header("OpenAI-Beta") openAIBeta: "voicelive=v1";
+  @header("VoiceLive-Beta") voiceLiveBeta: "voicelive=v1";
 };
 
diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
index c73cfeafbeff..5cf1e52027bd 100644
--- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
+++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
@@ -3,12 +3,12 @@ import "../models.tsp";
 
 using TypeSpec.Http;
 
 @service(#{
-  title: "OpenAI API"
-
+  title: "VoiceLive API"
+
 })
-@server("wss://api.openai.com/v1", "OpenAI Endpoint")
+@server("wss://api.voicelive.com/v1", "VoiceLive Endpoint")
 @useAuth(BearerAuth)
-namespace OpenAI;
+namespace VoiceLive;
 
 op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventSessionUpdated ;
From cc92957f3f3b37edf8859ccc37a15c9e024cdd24 Mon Sep 17 00:00:00 2001
From: Xiting Zhang
Date: Wed, 16 Jul 2025 14:52:50 -0700
Subject: [PATCH 06/48] update session

---
 .../ai/data-plane/VoiceLive/custom.tsp        | 202 ++++---
 .../ai/data-plane/VoiceLive/custom/events.tsp |  27 +-
 .../ai/data-plane/VoiceLive/models.tsp        | 554 ++++++------
 3 files changed, 309 insertions(+), 474 deletions(-)

diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp
index faa8785ccd5b..7ac7067638d6 100644
--- a/specification/ai/data-plane/VoiceLive/custom.tsp
+++ b/specification/ai/data-plane/VoiceLive/custom.tsp
@@ -8,20 +8,21 @@ namespace OpenAI;
 
 model VoiceLiveRequestSession {
   ...VoiceLiveSessionBase;
+  model?: string;
   modalities?: VoiceLiveModality[];
-  instructions?: string;
-  `model`?:
-    | "gpt-4o-realtime-preview"
-    | "gpt-4o-realtime-preview-2024-10-01"
-    | "gpt-4o-realtime-preview-2024-12-17"
-    | "gpt-4o-mini-realtime-preview"
-    | "gpt-4o-mini-realtime-preview-2024-12-17";
+  animation?: VoiceLiveAnimation;
   voice?: VoiceIdsShared;
+  instructions?: string;
+  input_audio?: VoiceLiveInputAudio;
+  input_audio_sampling_rate?: int32 = 24000;
   input_audio_format?:
VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; - input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings | null; turn_detection?: VoiceLiveTurnDetection | null; input_audio_noise_reduction?: VoiceLiveAudioNoiseReduction; + input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; + avatar?: VoiceLiveAvatarConfig; + input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings; + output_audio_timestamp_types?: VoiceLiveAudioTimestampType[]; tools?: VoiceLiveTool[]; tool_choice?: VoiceLiveToolChoice; temperature?: float32; @@ -30,21 +31,26 @@ model VoiceLiveRequestSession { model VoiceLiveResponseSession { ...VoiceLiveSessionBase; - object: "voicelive.session"; id: string; - `model`: string; + model: string; modalities: VoiceLiveModality[]; instructions: string; + animation?: VoiceLiveAnimation; voice: VoiceIdsShared; + input_audio?: VoiceLiveInputAudio; input_audio_format: VoiceLiveAudioFormat; output_audio_format: VoiceLiveAudioFormat; - input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; + input_audio_sampling_rate?: int32; turn_detection: VoiceLiveTurnDetection; input_audio_noise_reduction: VoiceLiveAudioNoiseReduction; - tools: VoiceLiveTool[]; + input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; + avatar?: VoiceLiveAvatarConfig; + input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; + tools: list; tool_choice: VoiceLiveToolChoice; temperature: float32; max_response_output_tokens: int32 | "inf" | null; + agent?: AgentConfig; } union VoiceLiveAudioFormat { @@ -59,106 +65,140 @@ union VoiceLiveAudioInputTranscriptionModel { whisper_1: "whisper-1", } +@doc("Configuration for input audio transcription.") model VoiceLiveAudioInputTranscriptionSettings { - `model`?: VoiceLiveAudioInputTranscriptionModel = VoiceLiveAudioInputTranscriptionModel.whisper_1; + @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") + model: "whisper-1" | "azure-fast-transcription" | "s2s-ingraph"; + + @doc("The language code to use for transcription, if specified.") language?: string; - prompt?: string; + + @doc("Whether transcription is enabled.") + enabled: boolean; + + @doc("Whether a custom model is being used.") + custom_model: boolean; } union VoiceLiveModality { string, text: "text", audio: "audio", -} - -union VoiceLiveTurnDetectionType { - string, - - /** - * Indicates that server-side voice activity detection (VAD) should be enabled, allowing the server to determine when - * add_user_audio commands present ends of speech and should be automatically committed. - * - * The API will also detect when the user begins talking, sending a generation_canceled command. - */ - server_vad: "server_vad", - - semantic_vad: "semantic_vad", + animation: "animation", + avatar: "avatar", } @discriminator("type") +@doc("Top-level union for turn detection configuration.") model VoiceLiveTurnDetection { - type: VoiceLiveTurnDetectionType; + type: "none" | "server_vad" | "azure_semantic_vad"; +} - /** - * Whether or not to automatically generate a response when VAD is enabled. true by default. - */ - create_response?: boolean = true; +@doc("Disables turn detection.") +model VoiceLiveNoTurnDetection extends VoiceLiveTurnDetection { + type: "none"; +} - /** - * Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. 
`conversation` of `auto`) when a VAD start event occurs. - */ - interrupt_response?: boolean = true; +@doc("Base model for VAD-based turn detection.") +model VoiceLiveServerVad extends VoiceLiveTurnDetection { + type: "server_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: unknown; } -model VoiceLiveServerVadTurnDetection extends VoiceLiveTurnDetection { - type: VoiceLiveTurnDetectionType.server_vad; +@doc("Semantic VAD settings based on Azure SDK features.") +model VoiceLiveAzureSemanticVad extends VoiceLiveServerVad { + type: "azure_semantic_vad"; + neg_threshold?: float32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean; +} - /** - * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. - */ - threshold?: float32 = 0.5; +@doc("Configuration for input audio noise reduction.") +model VoiceLiveAudioNoiseReduction { + @doc("The type of noise reduction model.") + type: "azure_deep_noise_suppression"; +} - // @encode("milliseconds", int32) - /** - * Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: duration; // = 300ms +@doc("Configuration for client audio input. Used to specify the audio model and optional phrase list.") +model VoiceLiveInputAudio { + @doc("The name of the model to use for input audio (currently only 'azure-standard' is supported).") + model: "azure-standard"; - // @encode("milliseconds", int32) - /** - * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. With shorter values the model will respond more quickly, but may jump in on short pauses from the user. - */ - silence_duration_ms?: duration; // = 500ms + @doc("Optional list of phrases to bias the speech recognition engine.") + phrase_list?: string[]; } -model VoiceLiveSemanticVadTurnDetection extends VoiceLiveTurnDetection { - type: VoiceLiveTurnDetectionType.semantic_vad; +@doc("Echo cancellation configuration for server-side audio processing.") +model VoiceLiveAudioEchoCancellation { + @doc("The type of echo cancellation model to use.") + type: "server_echo_cancellation"; +} - /** - * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. - */ - eagerness?: "low" | "medium" | "high" | "auto" = "auto"; +@doc("Output timestamp types supported in audio response content.") +union VoiceLiveAudioTimestampType { + string, + @doc("Timestamps per word in the output audio.") + word: "word", } -model VoiceLiveServerEventRateLimitsUpdatedRateLimitsItem { - /** The rate limit property name that this item includes information about. */ - name: string; +@doc("Specifies the types of animation data to output.") +union VoiceLiveAnimationOutputType { + blendshapes: "blendshapes", + viseme_id: "viseme_id", + emotion: "emotion", +} - /** The maximum configured limit for this rate limit property. */ - limit: int32; +@doc("Configuration for animation outputs including blendshapes, visemes, and emotion metadata.") +model VoiceLiveAnimation { + @doc("The name of the animation model to use.") + model_name?: string = "default"; - /** The remaining quota available against the configured limit for this rate limit property. 
*/ - remaining: int32; + @doc("Set of output data types requested from the animation system.") + outputs?: VoiceLiveAnimationOutputType[] = [VoiceLiveAnimationOutputType.blendshapes]; - /** The remaining time, in seconds, until this rate limit property is reset. */ - @encode("seconds", float32) - reset_seconds: duration; + @doc("Interval for emotion detection in milliseconds. If not set, emotion detection is disabled.") + emotion_detection_interval_ms?: int32; } -union VoiceLiveAudioNoiseReductionType { - near_field: "near_field", - far_field: "far_field", -} +@doc("Configuration for avatar streaming and behavior during the session.") +model VoiceLiveAvatarConfig { + @doc("Optional list of ICE servers to use for WebRTC connection establishment.") + ice_servers?: list; -@discriminator("type") -model VoiceLiveAudioNoiseReduction { - type: VoiceLiveAudioNoiseReductionType; -} + @doc("The character name or ID used for the avatar.") + character: string; + + @doc("Optional avatar style, such as emotional tone or speaking style.") + style?: string; -model VoiceLiveAudioNearFieldNoiseReduction extends VoiceLiveAudioNoiseReduction { - type: VoiceLiveAudioNoiseReductionType.near_field; + @doc("Indicates whether the avatar is customized or not.") + customized: boolean; + + @doc("Optional video configuration including resolution, bitrate, and codec.") + video?: VideoParams; } -model VoiceLiveAudioFarFieldNoiseReduction extends VoiceLiveAudioNoiseReduction { - type: VoiceLiveAudioNoiseReductionType.far_field; +@doc("ICE server configuration for WebRTC connection negotiation.") +model IceServer { + @doc("List of ICE server URLs (e.g., TURN or STUN endpoints).") + urls: list; + + @doc("Optional username used for authentication with the ICE server.") + username?: string; + + @doc("Optional credential (e.g., password or token) used for authentication.") + credential?: string; } + +model AgentConfig { + type: "agent"; + name: string; + description?: string; + agent_id: string; + thread_id: string; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp index 0f7d5d5221f0..bf2e9e145158 100644 --- a/specification/ai/data-plane/VoiceLive/custom/events.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -2,25 +2,32 @@ using TypeSpec.OpenAPI; namespace OpenAI; +@doc("Client event types used in VoiceLive protocol.") union VoiceLiveClientEventType { string, session_update: "session.update", input_audio_buffer_append: "input_audio_buffer.append", input_audio_buffer_commit: "input_audio_buffer.commit", input_audio_buffer_clear: "input_audio_buffer.clear", - output_audio_buffer_clear: "output_audio_buffer.clear", + input_audio_turn_start: "input_audio.turn.start", + input_audio_turn_append: "input_audio.turn.append", + input_audio_turn_end: "input_audio.turn.end", + input_audio_turn_cancel: "input_audio.turn.cancel", + input_audio_clear: "input_audio.clear", conversation_item_create: "conversation.item.create", conversation_item_retrieve: "conversation.item.retrieve", conversation_item_truncate: "conversation.item.truncate", conversation_item_delete: "conversation.item.delete", response_create: "response.create", response_cancel: "response.cancel", - transcription_session_update: "transcription_session.update", + session_avatar_connect: "session.avatar.connect", } +@doc("Server event types used in VoiceLive protocol.") union VoiceLiveServerEventType { string, error: "error", + 
session_avatar_connecting: "session.avatar.connecting", session_created: "session.created", session_updated: "session.updated", conversation_created: "conversation.created", @@ -35,9 +42,6 @@ union VoiceLiveServerEventType { input_audio_buffer_cleared: "input_audio_buffer.cleared", input_audio_buffer_speech_started: "input_audio_buffer.speech_started", input_audio_buffer_speech_stopped: "input_audio_buffer.speech_stopped", - output_audio_buffer_cleared: "output_audio_buffer.cleared", - output_audio_buffer_started: "output_audio_buffer.started", - output_audio_buffer_stopped: "output_audio_buffer.stopped", response_created: "response.created", response_done: "response.done", response_output_item_added: "response.output_item.added", @@ -50,8 +54,11 @@ union VoiceLiveServerEventType { response_audio_transcript_done: "response.audio_transcript.done", response_audio_delta: "response.audio.delta", response_audio_done: "response.audio.done", - response_function_call_arguments_delta: "response.function_call_arguments.delta", - response_function_call_arguments_done: "response.function_call_arguments.done", - transcription_session_updated: "transcription_session.updated", - rate_limits_updated: "rate_limits.updated", -} + response_animation_blendshapes_delta: "response.animation_blendshapes.delta", + response_animation_blendshapes_done: "response.animation_blendshapes.done", + response_emotion_hypothesis: "response.emotion_hypothesis", + response_audio_timestamp_delta: "response.audio_timestamp.delta", + response_audio_timestamp_done: "response.audio_timestamp.done", + response_animation_viseme_delta: "response.animation_viseme.delta", + response_animation_viseme_done: "response.animation_viseme.done", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index ad7dbf830433..973d83db8659 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -75,6 +75,73 @@ model VoiceLiveClientEventSessionUpdate extends VoiceLiveClientEvent { session: VoiceLiveRequestSession; } +@doc(""" + Sent when the client connects and provides its SDP (Session Description Protocol) + for avatar-related media negotiation. +""") +model VoiceLiveClientEventSessionAvatarConnect extends ClientEventBase { + @doc("The event type, must be 'session.avatar.connect'.") + type: VoiceLiveClientEventType.session_avatar_connect; + + @doc("The client's SDP offer.") + client_sdp: string; +} + +@doc(""" + Indicates the start of a new audio input turn. +""") +model VoiceLiveClientEventInputAudioTurnStart extends ClientEventBase { + @doc("The event type, must be 'input_audio.turn.start'.") + type: VoiceLiveClientEventType.input_audio_turn_start; + + @doc("Unique identifier for the input audio turn.") + turn_id: string; +} + +@doc(""" + Appends audio data to an ongoing input turn. +""") +model VoiceLiveClientEventInputAudioTurnAppend extends ClientEventBase { + @doc("The event type, must be 'input_audio.turn.append'.") + type: VoiceLiveClientEventType.input_audio_turn_append; + + @doc("The ID of the turn this audio is part of.") + turn_id: string; + + @doc("Base64-encoded audio chunk.") + audio: string; +} + +@doc(""" + Marks the end of an audio input turn. 
+""") +model VoiceLiveClientEventInputAudioTurnEnd extends ClientEventBase { + @doc("The event type, must be 'input_audio.turn.end'.") + type: VoiceLiveClientEventType.input_audio_turn_end; + + @doc("The ID of the audio turn being ended.") + turn_id: string; +} + +@doc(""" + Cancels an in-progress input audio turn. +""") +model VoiceLiveClientEventInputAudioTurnCancel extends ClientEventBase { + @doc("The event type, must be 'input_audio.turn.cancel'.") + type: VoiceLiveClientEventType.input_audio_turn_cancel; + + @doc("The ID of the turn to cancel.") + turn_id: string; +} + +@doc(""" + Clears all input audio currently being streamed. +""") +model VoiceLiveClientEventInputAudioClear extends ClientEventBase { + @doc("The event type, must be 'input_audio.clear'.") + type: VoiceLiveClientEventType.input_audio_clear; +} + // Tool customization: establish custom, enriched discriminated type hierarchy /** The item to add to the conversation. */ model VoiceLiveConversationItemBase { @@ -246,11 +313,10 @@ model VoiceLiveClientEventInputAudioBufferAppend extends VoiceLiveClientEvent { // Tool customization: use encoded type for audio data @doc(""" - Base64-encoded audio bytes. This must be in the format specified by the + Base64-encoded audio. This must be in the format specified by the `input_audio_format` field in the session configuration. """) - @encode("base64") - audio: bytes; + audio: string; } // Tool customization (apply_discriminator): apply discriminated type base @@ -285,21 +351,6 @@ model VoiceLiveClientEventInputAudioBufferClear extends VoiceLiveClientEvent { type: VoiceLiveClientEventType.input_audio_buffer_clear; } -// Tool customization (apply_discriminator): apply discriminated type base -@doc(""" - **WebRTC Only:** Emit to cut off the current audio response. This will trigger the server to - stop generating audio and emit a `output_audio_buffer.cleared` event. This - event should be preceded by a `response.cancel` client event to stop the - generation of the current response. - [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). - """) -model VoiceLiveClientEventOutputAudioBufferClear extends VoiceLiveClientEvent { - @doc(""" - The event type, must be `output_audio_buffer.clear`. - """) - type: VoiceLiveClientEventType.output_audio_buffer_clear; -} - // Tool customization (apply_discriminator): apply discriminated type base @doc(""" Add a new Item to the Conversation's context, including messages, function @@ -491,6 +542,15 @@ model VoiceLiveServerEventSessionUpdated extends VoiceLiveServerEvent { session: VoiceLiveResponseSession; } +@doc("Sent when the server is in the process of establishing an avatar media connection and provides its SDP answer.") +model VoiceLiveServerEventSessionAvatarConnecting extends VoiceLiveServerEvent { + @doc("The event type, must be 'session.avatar.connecting'.") + type: VoiceLiveServerEventType.session_avatar_connecting; + + @doc("The server's SDP answer for the avatar connection.") + server_sdp: string; +} + // Tool customization: establish base for enriched request/response split models /** VoiceLive session object configuration. */ model VoiceLiveSessionBase {} @@ -613,58 +673,6 @@ model VoiceLiveServerEventInputAudioBufferSpeechStopped item_id: string; } -// Tool customization (apply_discriminator): apply discriminated type -@doc(""" - **WebRTC Only:** Emitted when the output audio buffer is cleared. 
This happens either in VAD - mode when the user has interrupted (`input_audio_buffer.speech_started`), - or when the client has emitted the `output_audio_buffer.clear` event to manually - cut off the current audio response. - [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). - """) -model VoiceLiveServerEventOutputAudioBufferCleared extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `output_audio_buffer.cleared`. - """) - type: VoiceLiveServerEventType.output_audio_buffer_cleared; - - /** The unique ID of the response that produced the audio. */ - response_id: string; -} - -// Tool customization (apply_discriminator): apply discriminated type -@doc(""" - **WebRTC Only:** Emitted when the server begins streaming audio to the client. This event is - emitted after an audio content part has been added (`response.content_part.added`) - to the response. - [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). - """) -model VoiceLiveServerEventOutputAudioBufferStarted extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `output_audio_buffer.started`. - """) - type: VoiceLiveServerEventType.output_audio_buffer_started; - - /** The unique ID of the response that produced the audio. */ - response_id: string; -} - -// Tool customization (apply_discriminator): apply discriminated type -@doc(""" - **WebRTC Only:** Emitted when the output audio buffer has been completely drained on the server, - and no more audio is forthcoming. This event is emitted after the full response - data has been sent to the client (`response.done`). - [Learn more](/docs/guides/voicelive-conversations#client-and-server-events-for-audio-in-webrtc). - """) -model VoiceLiveServerEventOutputAudioBufferStopped extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `output_audio_buffer.stopped`. - """) - type: VoiceLiveServerEventType.output_audio_buffer_stopped; - - /** The unique ID of the response that produced the audio. */ - response_id: string; -} - // Tool customization (apply_discriminator): apply discriminated type @doc(""" Returned when a conversation item is created. There are several scenarios that produce this event: @@ -1087,79 +1095,118 @@ model VoiceLiveServerEventResponseAudioDone extends VoiceLiveServerEvent { content_index: int32; } -// Tool customization (apply_discriminator): apply discriminated type -/** Returned when the model-generated function call arguments are updated. */ -model VoiceLiveServerEventResponseFunctionCallArgumentsDelta - extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `response.function_call_arguments.delta`. - """) - type: VoiceLiveServerEventType.response_function_call_arguments_delta; - - /** The ID of the response. */ +@doc(""" +Represents a delta update of blendshape animation frames for a specific output of a response. +""") +model ResponseAnimationBlendshapeDeltaEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_animation_blendshapes_delta; response_id: string; - - /** The ID of the function call item. */ item_id: string; - - /** The index of the output item in the response. */ output_index: int32; + content_index: int32; + frames: list> | string; + frame_index: int32; +} - /** The ID of the function call. */ - call_id: string; +@doc(""" +Indicates the completion of blendshape animation processing for a specific output of a response. 
+""") +model ResponseAnimationBlendshapeDoneEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_animation_blendshapes_done; + response_id: string; + item_id: string; + output_index: int32; +} - /** The arguments delta as a JSON string. */ - delta: string; +@doc(""" +Represents an emotion hypothesis detected from response audio with multiple candidates. +""") +model ResponseEmotionHypothesis extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_emotion_hypothesis; + emotion: string; + candidates: list<{ + emotion: string; + confidence: float32; + }>; + audio_offset_ms: int32; + audio_duration_ms: int32; + response_id: string; + item_id: string; } -// Tool customization (apply_discriminator): apply discriminated type -/** - * Returned when the model-generated function call arguments are done streaming. - * Also emitted when a Response is interrupted, incomplete, or cancelled. - */ -model VoiceLiveServerEventResponseFunctionCallArgumentsDone - extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `response.function_call_arguments.done`. - """) - type: VoiceLiveServerEventType.response_function_call_arguments_done; +@doc(""" +Represents a word-level audio timestamp delta for a response. +""") +model ResponseAudioTimestampDeltaEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_audio_timestamp_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + audio_duration_ms: int32; + text: string; + timestamp_type: "word"; +} - /** The ID of the response. */ +@doc(""" +Indicates completion of audio timestamp delivery for a response. +""") +model ResponseAudioTimestampDoneEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_audio_timestamp_done; response_id: string; + item_id: string; + output_index: int32; + content_index: int32; +} - /** The ID of the function call item. */ +@doc(""" +Represents a viseme ID delta update for animation based on audio. +""") +model ResponseAnimationVisemeDeltaEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_animation_viseme_delta; + response_id: string; item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + viseme_id: int32; +} - /** The index of the output item in the response. */ +@doc(""" +Indicates completion of viseme animation delivery for a response. +""") +model ResponseAnimationVisemeDoneEvent extends VoiceLiveServerEvent { + type: VoiceLiveServerEventType.response_animation_viseme_done; + response_id: string; + item_id: string; output_index: int32; + content_index: int32; +} - /** The ID of the function call. */ - call_id: string; +/** Create a new VoiceLive response with these parameters */ +model VoiceLiveResponseCreateParams { + @doc(""" + Whether to commit the response to the conversation. Defaults to true. + """) + commit?: boolean = true; - /** The final arguments as a JSON string. */ - arguments: string; -} + @doc(""" + Whether to cancel any ongoing generation before starting this one. Defaults to true. + """) + cancel_previous?: boolean = true; -// Tool customization (apply_discriminator): apply discriminated type -/** - * Emitted at the beginning of a Response to indicate the updated rate limits. - * When a Response is created some tokens will be "reserved" for the output - * tokens, the rate limits shown here reflect that reservation, which is then - * adjusted accordingly once the Response is completed. 
- */ -model VoiceLiveServerEventRateLimitsUpdated extends VoiceLiveServerEvent { @doc(""" - The event type, must be `rate_limits.updated`. - """) - type: VoiceLiveServerEventType.rate_limits_updated; + Input items to append to the conversation context before generating a response. + """) + append_input_items?: list; - // Tool customization: use custom type for rate limit items (applying encoded duration) - /** List of rate limit information. */ - rate_limits: VoiceLiveServerEventRateLimitsUpdatedRateLimitsItem[]; -} + @doc(""" + Input items to be used as the context for this response. + An empty array clears previous context. + """) + input_items?: list; -/** Create a new VoiceLive response with these parameters */ -model VoiceLiveResponseCreateParams { // Tool customization: Apply reusable modality representation /** * The set of modalities the model can respond with. To disable audio, @@ -1218,26 +1265,6 @@ model VoiceLiveResponseCreateParams { given model. Defaults to `inf`. """) max_output_tokens?: int32 | "inf"; - - @doc(""" - Controls which conversation the response is added to. Currently supports - `auto` and `none`, with `auto` as the default value. The `auto` value - means that the contents of the response will be added to the default - conversation. Set this to `none` to create an out-of-band response which - will not add items to default conversation. - """) - conversation?: string | "auto" | "none"; - - ...MetadataPropertyForRequest; - - // Tool customization: apply a customized, more specific discriminated type hierarchy - @doc(""" - Input items to include in the prompt for the model. Using this field - creates a new context for this Response instead of using the default - conversation. An empty array `[]` will clear the context for this Response. - Note that this can include references to items from the default conversation. - """) - input?: VoiceLiveConversationRequestItem[]; } /** VoiceLive session object configuration. */ @@ -1696,18 +1723,6 @@ model VoiceLiveClientEventConversationItemRetrieve extends VoiceLiveClientEvent item_id: string; } -// Tool customization (apply_discriminator): apply discriminated type base -/** Send this event to update a transcription session. */ -model VoiceLiveClientEventTranscriptionSessionUpdate - extends VoiceLiveClientEvent { - @doc(""" - The event type, must be `transcription_session.update`. - """) - type: VoiceLiveClientEventType.transcription_session_update; - - session: VoiceLiveTranscriptionSessionCreateRequest; -} - // Tool customization (apply_discriminator): apply discriminated type /** Returned when the text value of an input audio transcription content part is updated. */ model VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta @@ -1743,230 +1758,3 @@ model VoiceLiveServerEventConversationItemRetrieved extends VoiceLiveServerEvent // Tool customization: apply enriched item definition hierarchy item: VoiceLiveConversationResponseItem; } - -// Tool customization (apply_discriminator): apply discriminated type -@doc(""" - Returned when a transcription session is updated with a `transcription_session.update` event, unless - there is an error. - """) -model VoiceLiveServerEventTranscriptionSessionUpdated - extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `transcription_session.updated`. - """) - type: VoiceLiveServerEventType.transcription_session_updated; - - session: VoiceLiveTranscriptionSessionCreateResponse; -} - -/** VoiceLive transcription session object configuration. 
*/ -model VoiceLiveTranscriptionSessionCreateRequest { - /** - * The set of modalities the model can respond with. To disable audio, - * set this to ["text"]. - */ - modalities?: ("text" | "audio")[]; - - @doc(""" - The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - For `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, - single channel (mono), and little-endian byte order. - """) - input_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw" = "pcm16"; - - /** Configuration for input audio transcription. The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service. */ - input_audio_transcription?: { - @doc(""" - The model to use for transcription, current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`. - """) - `model`?: "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" | "whisper-1"; - - @doc(""" - The language of the input audio. Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format - will improve accuracy and latency. - """) - language?: string; - - @doc(""" - An optional text to guide the model's style or continue a previous audio - segment. - For `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology". - """) - prompt?: string; - }; - - @doc(""" - Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response. - Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech. - Semantic VAD is more advanced and uses a turn detection model (in conjuction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency. - """) - turn_detection?: { - /** Type of turn detection. */ - type?: "server_vad" | "semantic_vad" = "server_vad"; - - @doc(""" - Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. - """) - eagerness?: "low" | "medium" | "high" | "auto" = "auto"; - - @doc(""" - Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A - higher threshold will require louder audio to activate the model, and - thus might perform better in noisy environments. - """) - threshold?: float32; - - @doc(""" - Used only for `server_vad` mode. Amount of audio to include before the VAD detected speech (in - milliseconds). Defaults to 300ms. - """) - prefix_padding_ms?: int32; - - @doc(""" - Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). Defaults - to 500ms. With shorter values the model will respond more quickly, - but may jump in on short pauses from the user. - """) - silence_duration_ms?: int32; - - /** Whether or not to automatically generate a response when a VAD stop event occurs. Not available for transcription sessions. 
*/ - create_response?: boolean = true; - - @doc(""" - Whether or not to automatically interrupt any ongoing response with output to the default - conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. Not available for transcription sessions. - """) - interrupt_response?: boolean = true; - }; - - @doc(""" - Configuration for input audio noise reduction. This can be set to `null` to turn off. - Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. - Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. - """) - input_audio_noise_reduction?: { - @doc(""" - Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones. - """) - type?: "near_field" | "far_field"; - } | null = null; - - @doc(""" - The set of items to include in the transcription. Current available items are: - - `item.input_audio_transcription.logprobs` - """) - include?: string[]; - - /** Configuration options for the generated client secret. */ - client_secret?: { - /** Configuration for the ephemeral token expiration. */ - expires_at?: { - @doc(""" - The anchor point for the ephemeral token expiration. Only `created_at` is currently supported. - """) - anchor?: "created_at" = "created_at"; - - @doc(""" - The number of seconds from the anchor point to the expiration. Select a value between `10` and `7200`. - """) - seconds?: int32 = 600; - }; - }; -} - -/** - * A new VoiceLive transcription session configuration. - * - * When a session is created on the server via REST API, the session object - * also contains an ephemeral key. Default TTL for keys is 10 minutes. This - * property is not present when a session is updated via the WebSocket API. - */ -model VoiceLiveTranscriptionSessionCreateResponse { - /** - * Ephemeral key returned by the API. Only present when the session is - * created on the server via REST API. - */ - client_secret: { - /** - * Ephemeral key usable in client environments to authenticate connections - * to the VoiceLive API. Use this in client-side environments rather than - * a standard API token, which should only be used server-side. - */ - value: string; - - // Tool customization: 'created' and fields ending in '_at' are Unix encoded utcDateTime - /** - * Timestamp for when the token expires. Currently, all tokens expire - * after one minute. - */ - @encode("unixTimestamp", int32) - expires_at: utcDateTime; - }; - - /** - * The set of modalities the model can respond with. To disable audio, - * set this to ["text"]. - */ - modalities?: ("text" | "audio")[]; - - @doc(""" - The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - """) - input_audio_format?: string; - - /** Configuration of the transcription model. */ - input_audio_transcription?: { - @doc(""" - The model to use for transcription. Can be `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, or `whisper-1`. - """) - `model`?: "gpt-4o-transcribe" | "gpt-4o-mini-transcribe" | "whisper-1"; - - @doc(""" - The language of the input audio. Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format - will improve accuracy and latency. - """) - language?: string; - - /** - * An optional text to guide the model's style or continue a previous audio - * segment. 
The [prompt](/docs/guides/speech-to-text#prompting) should match - * the audio language. - */ - prompt?: string; - }; - - @doc(""" - Configuration for turn detection. Can be set to `null` to turn off. Server - VAD means that the model will detect the start and end of speech based on - audio volume and respond at the end of user speech. - """) - turn_detection?: { - @doc(""" - Type of turn detection, only `server_vad` is currently supported. - """) - type?: string; - - /** - * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A - * higher threshold will require louder audio to activate the model, and - * thus might perform better in noisy environments. - */ - threshold?: float32; - - /** - * Amount of audio to include before the VAD detected speech (in - * milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: int32; - - /** - * Duration of silence to detect speech stop (in milliseconds). Defaults - * to 500ms. With shorter values the model will respond more quickly, - * but may jump in on short pauses from the user. - */ - silence_duration_ms?: int32; - }; -} From ebaaa659da47d626c8102113d3e8f94ecc60894e Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 16 Jul 2025 15:05:09 -0700 Subject: [PATCH 07/48] More models --- .../ai/data-plane/VoiceLive/audio/custom.tsp | 10 ++++ .../ai/data-plane/VoiceLive/audio/models.tsp | 53 ++++++++++++++++++- .../ai/data-plane/VoiceLive/client.tsp | 5 ++ .../ai/data-plane/VoiceLive/common/models.tsp | 12 +++++ .../VoiceLive/servers/websocket.tsp | 36 ++++++++++++- 5 files changed, 114 insertions(+), 2 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/audio/custom.tsp b/specification/ai/data-plane/VoiceLive/audio/custom.tsp index 5ad1d3a2bec6..ca2c23c7f496 100644 --- a/specification/ai/data-plane/VoiceLive/audio/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/custom.tsp @@ -1 +1,11 @@ import "./models.tsp"; + +union TranscriptTextUsageType { + tokens: "tokens", + duration: "duration", +} + +@discriminator("type") +model TranscriptTextUsage { + type: TranscriptTextUsageType; +} diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp index 4b16fe1c619a..906f064f362c 100644 --- a/specification/ai/data-plane/VoiceLive/audio/models.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/models.tsp @@ -46,4 +46,55 @@ union TranscriptionInclude { } // Other models removed because they do not correspond to Python models or are redundant -// Please re-add as needed with proper alignment to the source Python definitions \ No newline at end of file +// Please re-add as needed with proper alignment to the source Python definitions + +model CreateSpeechRequest { + @doc(""" + One of the available [TTS models](/docs/models#tts): `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. + """) + @extension("x-oaiTypeLabel", "string") + `model`: string | "tts-1" | "tts-1-hd" | "gpt-4o-mini-tts"; + + /** The text to generate audio for. The maximum length is 4096 characters. */ + @maxLength(4096) + input: string; + + @doc(""" + Control the voice of your generated audio with additional instructions. Does not work with `tts-1` or `tts-1-hd`. + """) + @maxLength(4096) + instructions?: string; + + @doc(""" + The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`. Previews of the voices are available in the [Text to speech guide](/docs/guides/text-to-speech#voice-options). 
+ """) + voice: VoiceIdsShared; + + @doc(""" + The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. + """) + response_format?: "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm" = "mp3"; + + @doc(""" + The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. + """) + @minValue(0.25) + @maxValue(4) + speed?: float32 = 1; + + @doc(""" + The format to stream the audio in. Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or `tts-1-hd`. + """) + stream_format?: "sse" | "audio" = "audio"; +} + +// Tool customization: Convert to discriminated type base +union CreateSpeechResponseStreamEventType { + speech_audio_delta: "speech.audio.delta", + speech_audio_done: "speech.audio.done", +} + +@discriminator("type") +model CreateSpeechResponseStreamEvent { + type: CreateSpeechResponseStreamEventType; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 16f81c764b21..a47c9f377938 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -2,3 +2,8 @@ import "@azure-tools/typespec-client-generator-core"; import "./servers/websocket.tsp"; using Azure.ClientGenerator.Core; + +@@access(VoiceLive.VoiceLiveServerEventSessionCreated, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseAudioDelta, Access.public, "python"); + + diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 23ee3a807907..d08a8810d454 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -29,6 +29,18 @@ model MetadataPropertyForResponse { metadata: Record | null; } +/** A log probability object. */ +model LogProbProperties { + /** The token that was used to generate the log probability. */ + token: string; + + /** The log probability of the token. */ + logprob: float32; + + /** The bytes that were used to generate the log probability. 
*/
+  bytes: int32[];
+}
+
 union VoiceIdsShared {
   string,
   "alloy",
diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
index 5cf1e52027bd..d4901df41c8e 100644
--- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
+++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp
@@ -11,4 +11,38 @@ using TypeSpec.Http;
 @useAuth(BearerAuth)
 namespace VoiceLive;
 
-op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventSessionUpdated ;
+op force_models(session: VoiceLiveClientEventSessionUpdate ):
+  VoiceLiveServerEventSessionUpdated |
+  VoiceLiveServerEventSessionCreated |
+  VoiceLiveServerEventError |
+  VoiceLiveServerEventResponseTextDelta |
+  VoiceLiveServerEventResponseAudioDelta |
+  VoiceLiveServerEventConversationCreated|
+  VoiceLiveServerEventConversationItemCreated|
+  VoiceLiveServerEventConversationItemDeleted|
+  VoiceLiveServerEventConversationItemRetrieved|
+  VoiceLiveServerEventConversationItemTruncated|
+  VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted|
+  VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta|
+  VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed|
+  VoiceLiveServerEventInputAudioBufferCommitted|
+  VoiceLiveServerEventInputAudioBufferCleared|
+  VoiceLiveServerEventInputAudioBufferSpeechStarted|
+  VoiceLiveServerEventInputAudioBufferSpeechStopped|
+  VoiceLiveServerEventOutputAudioBufferCleared|
+  VoiceLiveServerEventOutputAudioBufferStarted|
+  VoiceLiveServerEventOutputAudioBufferStopped|
+  VoiceLiveServerEventResponseCreated|
+  VoiceLiveServerEventResponseDone|
+  VoiceLiveServerEventResponseOutputItemAdded|
+  VoiceLiveServerEventResponseOutputItemDone|
+  VoiceLiveServerEventResponseContentPartAdded|
+  VoiceLiveServerEventResponseContentPartDone|
+  VoiceLiveServerEventResponseTextDone|
+  VoiceLiveServerEventResponseAudioTranscriptDelta|
+  VoiceLiveServerEventResponseAudioTranscriptDone|
+  VoiceLiveServerEventResponseAudioDone|
+  VoiceLiveServerEventResponseFunctionCallArgumentsDelta|
+  VoiceLiveServerEventResponseFunctionCallArgumentsDone|
+  VoiceLiveServerEventTranscriptionSessionUpdated|
+  VoiceLiveServerEventRateLimitsUpdated;
From 1e6ccebfc6a7f1ad4f6e6382107997b2ac31e396 Mon Sep 17 00:00:00 2001
From: Xiting Zhang
Date: Wed, 16 Jul 2025 16:18:28 -0700
Subject: [PATCH 08/48] fix session issues

---
 .../ai/data-plane/VoiceLive/common/models.tsp | 30 ++++++++---
 .../ai/data-plane/VoiceLive/custom.tsp        | 51 +++++++++++++++----
 .../ai/data-plane/VoiceLive/models.tsp        | 28 +++++----
 .../VoiceLive/servers/websocket.tsp           |  9 +---
 4 files changed, 80 insertions(+), 38 deletions(-)

diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp
index d08a8810d454..1fb0d3f6f2ce 100644
--- a/specification/ai/data-plane/VoiceLive/common/models.tsp
+++ b/specification/ai/data-plane/VoiceLive/common/models.tsp
@@ -7,40 +7,53 @@ using TypeSpec.OpenAPI;
 
 namespace VoiceLive;
 
+@doc("Error object returned in case of API failure.")
 model Error {
-  code: string | null;
+  @doc("Error code, or null if unspecified.")
+  code?: string;
+
+  @doc("Human-readable error message.")
   message: string;
-  param: string | null;
+
+  @doc("Parameter name related to the error, if applicable.")
+  param?: string;
+
+  @doc("Type or category of the error.")
   type: string;
 }
 
 @error
+@doc("Standard error response envelope.")
 model ErrorResponse {
+  @doc("Error object returned in case of
API failure.") error: Error; } +@doc("Optional metadata property used in request models.") model MetadataPropertyForRequest { @extension("x-oaiTypeLabel", "map") metadata?: Record; } +@doc("Metadata property included in response models.") model MetadataPropertyForResponse { @extension("x-oaiTypeLabel", "map") - metadata: Record | null; + metadata?: Record; } -/** A log probability object. */ +@doc("A single log probability entry for a token.") model LogProbProperties { - /** The token that was used to generate the log probability. */ + @doc("The token that was used to generate the log probability.") token: string; - /** The log probability of the token. */ + @doc("The log probability of the token.") logprob: float32; - /** The bytes that were used to generate the log probability. */ + @doc("The bytes that were used to generate the log probability.") bytes: int32[]; } +@doc("Identifier for selecting a built-in voice.") union VoiceIdsShared { string, "alloy", @@ -53,10 +66,11 @@ union VoiceIdsShared { "verse", } +@doc("Enumerates available service tier configurations.") union ServiceTier { "auto", "default", "flex", "scale", "priority", -} +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index c1a984b9697a..70e81cd59aa5 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -8,7 +8,7 @@ namespace VoiceLive; model VoiceLiveRequestSession { ...VoiceLiveSessionBase; - model?: string; + `model`?: string; modalities?: VoiceLiveModality[]; animation?: VoiceLiveAnimation; voice?: VoiceIdsShared; @@ -32,7 +32,7 @@ model VoiceLiveRequestSession { model VoiceLiveResponseSession { ...VoiceLiveSessionBase; id: string; - model: string; + `model`: string; modalities: VoiceLiveModality[]; instructions: string; animation?: VoiceLiveAnimation; @@ -46,7 +46,7 @@ model VoiceLiveResponseSession { input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; avatar?: VoiceLiveAvatarConfig; input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; - tools: list; + tools: VoiceLiveTool[]; tool_choice: VoiceLiveToolChoice; temperature: float32; max_response_output_tokens: int32 | "inf" | null; @@ -68,7 +68,7 @@ union VoiceLiveAudioInputTranscriptionModel { @doc("Configuration for input audio transcription.") model VoiceLiveAudioInputTranscriptionSettings { @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") - model: "whisper-1" | "azure-fast-transcription" | "s2s-ingraph"; + `model`: "whisper-1" | "azure-fast-transcription" | "s2s-ingraph"; @doc("The language code to use for transcription, if specified.") language?: string; @@ -109,7 +109,7 @@ model VoiceLiveServerVad extends VoiceLiveTurnDetection { } @doc("Semantic VAD settings based on Azure SDK features.") -model VoiceLiveAzureSemanticVad extends VoiceLiveServerVad { +model VoiceLiveAzureSemanticVad extends VoiceLiveTurnDetection { type: "azure_semantic_vad"; neg_threshold?: float32; window_size?: int32; @@ -127,7 +127,7 @@ model VoiceLiveAudioNoiseReduction { @doc("Configuration for client audio input. 
Used to specify the audio model and optional phrase list.") model VoiceLiveInputAudio { @doc("The name of the model to use for input audio (currently only 'azure-standard' is supported).") - model: "azure-standard"; + `model`: "azure-standard"; @doc("Optional list of phrases to bias the speech recognition engine.") phrase_list?: string[]; @@ -159,7 +159,7 @@ model VoiceLiveAnimation { model_name?: string = "default"; @doc("Set of output data types requested from the animation system.") - outputs?: VoiceLiveAnimationOutputType[] = [VoiceLiveAnimationOutputType.blendshapes]; + outputs?: VoiceLiveAnimationOutputType[] = #[VoiceLiveAnimationOutputType.blendshapes]; @doc("Interval for emotion detection in milliseconds. If not set, emotion detection is disabled.") emotion_detection_interval_ms?: int32; @@ -168,7 +168,7 @@ model VoiceLiveAnimation { @doc("Configuration for avatar streaming and behavior during the session.") model VoiceLiveAvatarConfig { @doc("Optional list of ICE servers to use for WebRTC connection establishment.") - ice_servers?: list; + ice_servers?: IceServer[]; @doc("The character name or ID used for the avatar.") character: string; @@ -186,7 +186,7 @@ model VoiceLiveAvatarConfig { @doc("ICE server configuration for WebRTC connection negotiation.") model IceServer { @doc("List of ICE server URLs (e.g., TURN or STUN endpoints).") - urls: list; + urls: string[]; @doc("Optional username used for authentication with the ICE server.") username?: string; @@ -201,4 +201,37 @@ model AgentConfig { description?: string; agent_id: string; thread_id: string; +} + +@doc("Video streaming parameters for avatar.") +model VideoParams { + @doc("Bitrate in bits per second (e.g., 2000000 for 2 Mbps).") + bitrate?: int32 = 2000000; + + @doc("Codec to use for encoding. Currently only 'h264' is supported.") + codec?: "h264" = "h264"; + + @doc("Optional cropping settings for the video stream.") + crop?: VideoCrop; + + @doc("Optional resolution settings for the video stream.") + resolution?: VideoResolution; +} + +@doc("Bounding box crop for avatar video. Coordinates must be positive.") +model VideoCrop { + @doc("Top-left coordinate of the crop rectangle as [x, y]. Both values must be positive.") + top_left: [int32, int32]; + + @doc("Bottom-right coordinate of the crop rectangle as [x, y]. Both values must be positive.") + bottom_right: [int32, int32]; +} + +@doc("Resolution of the video feed in pixels.") +model VideoResolution { + @doc("Width of the video in pixels. Must be greater than 0.") + width: int32; + + @doc("Height of the video in pixels. Must be greater than 0.") + height: int32; } \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index fa4e84329cda..2e76cc450041 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -79,7 +79,7 @@ model VoiceLiveClientEventSessionUpdate extends VoiceLiveClientEvent { Sent when the client connects and provides its SDP (Session Description Protocol) for avatar-related media negotiation. """) -model VoiceLiveClientEventSessionAvatarConnect extends ClientEventBase { +model VoiceLiveClientEventSessionAvatarConnect extends VoiceLiveClientEvent { @doc("The event type, must be 'session.avatar.connect'.") type: VoiceLiveClientEventType.session_avatar_connect; @@ -90,7 +90,7 @@ model VoiceLiveClientEventSessionAvatarConnect extends ClientEventBase { @doc(""" Indicates the start of a new audio input turn. 
""") -model VoiceLiveClientEventInputAudioTurnStart extends ClientEventBase { +model VoiceLiveClientEventInputAudioTurnStart extends VoiceLiveClientEvent { @doc("The event type, must be 'input_audio.turn.start'.") type: VoiceLiveClientEventType.input_audio_turn_start; @@ -101,7 +101,7 @@ model VoiceLiveClientEventInputAudioTurnStart extends ClientEventBase { @doc(""" Appends audio data to an ongoing input turn. """) -model VoiceLiveClientEventInputAudioTurnAppend extends ClientEventBase { +model VoiceLiveClientEventInputAudioTurnAppend extends VoiceLiveClientEvent { @doc("The event type, must be 'input_audio.turn.append'.") type: VoiceLiveClientEventType.input_audio_turn_append; @@ -115,7 +115,7 @@ model VoiceLiveClientEventInputAudioTurnAppend extends ClientEventBase { @doc(""" Marks the end of an audio input turn. """) -model VoiceLiveClientEventInputAudioTurnEnd extends ClientEventBase { +model VoiceLiveClientEventInputAudioTurnEnd extends VoiceLiveClientEvent { @doc("The event type, must be 'input_audio.turn.end'.") type: VoiceLiveClientEventType.input_audio_turn_end; @@ -126,7 +126,7 @@ model VoiceLiveClientEventInputAudioTurnEnd extends ClientEventBase { @doc(""" Cancels an in-progress input audio turn. """) -model VoiceLiveClientEventInputAudioTurnCancel extends ClientEventBase { +model VoiceLiveClientEventInputAudioTurnCancel extends VoiceLiveClientEvent { @doc("The event type, must be 'input_audio.turn.cancel'.") type: VoiceLiveClientEventType.input_audio_turn_cancel; @@ -137,7 +137,7 @@ model VoiceLiveClientEventInputAudioTurnCancel extends ClientEventBase { @doc(""" Clears all input audio currently being streamed. """) -model VoiceLiveClientEventInputAudioClear extends ClientEventBase { +model VoiceLiveClientEventInputAudioClear extends VoiceLiveClientEvent { @doc("The event type, must be 'input_audio.clear'.") type: VoiceLiveClientEventType.input_audio_clear; } @@ -1104,7 +1104,7 @@ model ResponseAnimationBlendshapeDeltaEvent extends VoiceLiveServerEvent { item_id: string; output_index: int32; content_index: int32; - frames: list> | string; + frames: float32[][] | string; frame_index: int32; } @@ -1124,10 +1124,7 @@ Represents an emotion hypothesis detected from response audio with multiple cand model ResponseEmotionHypothesis extends VoiceLiveServerEvent { type: VoiceLiveServerEventType.response_emotion_hypothesis; emotion: string; - candidates: list<{ - emotion: string; - confidence: float32; - }>; + candidates: EmotionCandidate[], audio_offset_ms: int32; audio_duration_ms: int32; response_id: string; @@ -1199,13 +1196,13 @@ model VoiceLiveResponseCreateParams { @doc(""" Input items to append to the conversation context before generating a response. """) - append_input_items?: list; + append_input_items?: VoiceLiveConversationRequestItem[]; @doc(""" Input items to be used as the context for this response. An empty array clears previous context. 
""") - input_items?: list; + input_items?: VoiceLiveConversationRequestItem[]; // Tool customization: Apply reusable modality representation /** @@ -1758,3 +1755,8 @@ model VoiceLiveServerEventConversationItemRetrieved extends VoiceLiveServerEvent // Tool customization: apply enriched item definition hierarchy item: VoiceLiveConversationResponseItem; } + +model EmotionCandidate { + emotion: string; + confidence: float32; +} diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index d4901df41c8e..e34bef9deb5d 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -29,9 +29,6 @@ op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventInputAudioBufferCleared| VoiceLiveServerEventInputAudioBufferSpeechStarted| VoiceLiveServerEventInputAudioBufferSpeechStopped| - VoiceLiveServerEventOutputAudioBufferCleared| - VoiceLiveServerEventOutputAudioBufferStarted| - VoiceLiveServerEventOutputAudioBufferStopped| VoiceLiveServerEventResponseCreated| VoiceLiveServerEventResponseDone| VoiceLiveServerEventResponseOutputItemAdded| @@ -41,8 +38,4 @@ op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventResponseTextDone| VoiceLiveServerEventResponseAudioTranscriptDelta| VoiceLiveServerEventResponseAudioTranscriptDone| - VoiceLiveServerEventResponseAudioDone| - VoiceLiveServerEventResponseFunctionCallArgumentsDelta| - VoiceLiveServerEventResponseFunctionCallArgumentsDone| - VoiceLiveServerEventTranscriptionSessionUpdated| - VoiceLiveServerEventRateLimitsUpdated; + VoiceLiveServerEventResponseAudioDone; From b5d6bcee9fc7778ff9d3b1416deffb0db6743089 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 16 Jul 2025 16:32:03 -0700 Subject: [PATCH 09/48] fix tule error --- .../ai/data-plane/VoiceLive/common/models.tsp | 12 ------------ specification/ai/data-plane/VoiceLive/custom.tsp | 16 +++++++++++----- specification/ai/data-plane/VoiceLive/models.tsp | 2 -- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 1fb0d3f6f2ce..c3fc257d391d 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -29,18 +29,6 @@ model ErrorResponse { error: Error; } -@doc("Optional metadata property used in request models.") -model MetadataPropertyForRequest { - @extension("x-oaiTypeLabel", "map") - metadata?: Record; -} - -@doc("Metadata property included in response models.") -model MetadataPropertyForResponse { - @extension("x-oaiTypeLabel", "map") - metadata?: Record; -} - @doc("A single log probability entry for a token.") model LogProbProperties { @doc("The token that was used to generate the log probability.") diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 70e81cd59aa5..4ae45a60fc7c 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -218,13 +218,19 @@ model VideoParams { resolution?: VideoResolution; } -@doc("Bounding box crop for avatar video. 
Coordinates must be positive.") +@doc("A 2D point with x and y coordinates.") +model Point2D { + x: int32; + y: int32; +} + +@doc("Defines a video crop rectangle.") model VideoCrop { - @doc("Top-left coordinate of the crop rectangle as [x, y]. Both values must be positive.") - top_left: [int32, int32]; + @doc("Top-left corner of the crop region.") + top_left: Point2D; - @doc("Bottom-right coordinate of the crop rectangle as [x, y]. Both values must be positive.") - bottom_right: [int32, int32]; + @doc("Bottom-right corner of the crop region.") + bottom_right: Point2D; } @doc("Resolution of the video feed in pixels.") diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 2e76cc450041..1ce8b1c9b6ac 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -203,8 +203,6 @@ model VoiceLiveResponse { /** The list of output items generated by the response. */ output?: VoiceLiveConversationResponseItem[]; - ...MetadataPropertyForResponse; - /** * Usage statistics for the Response, this will correspond to billing. A * VoiceLive API session will maintain a conversation context and append new From 236b333d028105b998df22c1897ca1b2df3ba20a Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 16 Jul 2025 17:38:09 -0700 Subject: [PATCH 10/48] update voice --- .../ai/data-plane/VoiceLive/audio/custom.tsp | 11 - .../ai/data-plane/VoiceLive/audio/main.tsp | 3 +- .../ai/data-plane/VoiceLive/audio/models.tsp | 54 +-- .../data-plane/VoiceLive/audio/operations.tsp | 29 -- .../ai/data-plane/VoiceLive/common/models.tsp | 13 - .../ai/data-plane/VoiceLive/custom.tsp | 75 +++- .../ai/data-plane/VoiceLive/models.tsp | 391 +----------------- .../ai/data-plane/VoiceLive/operations.tsp | 22 - 8 files changed, 79 insertions(+), 519 deletions(-) delete mode 100644 specification/ai/data-plane/VoiceLive/audio/custom.tsp delete mode 100644 specification/ai/data-plane/VoiceLive/audio/operations.tsp diff --git a/specification/ai/data-plane/VoiceLive/audio/custom.tsp b/specification/ai/data-plane/VoiceLive/audio/custom.tsp deleted file mode 100644 index ca2c23c7f496..000000000000 --- a/specification/ai/data-plane/VoiceLive/audio/custom.tsp +++ /dev/null @@ -1,11 +0,0 @@ -import "./models.tsp"; - -union TranscriptTextUsageType { - tokens: "tokens", - duration: "duration", -} - -@discriminator("type") -model TranscriptTextUsage { - type: TranscriptTextUsageType; -} diff --git a/specification/ai/data-plane/VoiceLive/audio/main.tsp b/specification/ai/data-plane/VoiceLive/audio/main.tsp index e7af5325f311..08c98a827656 100644 --- a/specification/ai/data-plane/VoiceLive/audio/main.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/main.tsp @@ -1,2 +1 @@ -import "./client.tsp"; -import "./operations.tsp"; +import "./client.tsp"; \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp index 906f064f362c..89ab83ad7ae9 100644 --- a/specification/ai/data-plane/VoiceLive/audio/models.tsp +++ b/specification/ai/data-plane/VoiceLive/audio/models.tsp @@ -2,7 +2,6 @@ // Removed models not found in Python code and adjusted field shapes to match Python baseline import "../common"; -import "./custom.tsp"; using TypeSpec.OpenAPI; @@ -46,55 +45,4 @@ union TranscriptionInclude { } // Other models removed because they do not correspond to Python models or are redundant -// Please re-add as needed with proper alignment to the 
source Python definitions - -model CreateSpeechRequest { - @doc(""" - One of the available [TTS models](/docs/models#tts): `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`. - """) - @extension("x-oaiTypeLabel", "string") - `model`: string | "tts-1" | "tts-1-hd" | "gpt-4o-mini-tts"; - - /** The text to generate audio for. The maximum length is 4096 characters. */ - @maxLength(4096) - input: string; - - @doc(""" - Control the voice of your generated audio with additional instructions. Does not work with `tts-1` or `tts-1-hd`. - """) - @maxLength(4096) - instructions?: string; - - @doc(""" - The voice to use when generating the audio. Supported voices are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`, and `verse`. Previews of the voices are available in the [Text to speech guide](/docs/guides/text-to-speech#voice-options). - """) - voice: VoiceIdsShared; - - @doc(""" - The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`, `wav`, and `pcm`. - """) - response_format?: "mp3" | "opus" | "aac" | "flac" | "wav" | "pcm" = "mp3"; - - @doc(""" - The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default. - """) - @minValue(0.25) - @maxValue(4) - speed?: float32 = 1; - - @doc(""" - The format to stream the audio in. Supported formats are `sse` and `audio`. `sse` is not supported for `tts-1` or `tts-1-hd`. - """) - stream_format?: "sse" | "audio" = "audio"; -} - -// Tool customization: Convert to discriminated type base -union CreateSpeechResponseStreamEventType { - speech_audio_delta: "speech.audio.delta", - speech_audio_done: "speech.audio.done", -} - -@discriminator("type") -model CreateSpeechResponseStreamEvent { - type: CreateSpeechResponseStreamEventType; -} \ No newline at end of file +// Please re-add as needed with proper alignment to the source Python definitions \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/audio/operations.tsp b/specification/ai/data-plane/VoiceLive/audio/operations.tsp deleted file mode 100644 index c63820c360cd..000000000000 --- a/specification/ai/data-plane/VoiceLive/audio/operations.tsp +++ /dev/null @@ -1,29 +0,0 @@ -import "@typespec/http"; -import "@typespec/openapi"; - -import "../common"; -import "./models.tsp"; - -using TypeSpec.Http; -using TypeSpec.OpenAPI; - -namespace VoiceLive; - -@route("/audio") -interface Audio { - @route("speech") - @post - @operationId("createSpeech") - @tag("Audio") - @summary("Generates audio from the input text.") - createSpeech( - @header accept: "application/octet-stream", - @body requestBody: CreateSpeechRequest, - ): { - /** chunked */ - @header("Transfer-Encoding") transferEncoding?: string; - - @header contentType: "application/octet-stream"; - @body responseBody: bytes; - } | SseResponseOf | ErrorResponse; -} diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index c3fc257d391d..2a4ac6846a7d 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -41,19 +41,6 @@ model LogProbProperties { bytes: int32[]; } -@doc("Identifier for selecting a built-in voice.") -union VoiceIdsShared { - string, - "alloy", - "ash", - "ballad", - "coral", - "echo", - "sage", - "shimmer", - "verse", -} - @doc("Enumerates available service tier configurations.") union ServiceTier { "auto", diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp 
b/specification/ai/data-plane/VoiceLive/custom.tsp index 4ae45a60fc7c..0d9dcf25f010 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -11,7 +11,7 @@ model VoiceLiveRequestSession { `model`?: string; modalities?: VoiceLiveModality[]; animation?: VoiceLiveAnimation; - voice?: VoiceIdsShared; + voice?: Voice; instructions?: string; input_audio?: VoiceLiveInputAudio; input_audio_sampling_rate?: int32 = 24000; @@ -36,7 +36,7 @@ model VoiceLiveResponseSession { modalities: VoiceLiveModality[]; instructions: string; animation?: VoiceLiveAnimation; - voice: VoiceIdsShared; + voice: Voice; input_audio?: VoiceLiveInputAudio; input_audio_format: VoiceLiveAudioFormat; output_audio_format: VoiceLiveAudioFormat; @@ -53,6 +53,77 @@ model VoiceLiveResponseSession { agent?: AgentConfig; } +@doc("Voice configuration for Azure standard or platform voices.") +model AzureStandardVoice { + @doc("Name of the voice.") + name: string; + + @doc("Voice type identifier.") + type: "azure-standard" | "azure-platform"; + + @doc("Optional temperature for generation.") + temperature?: float32; +} + +@doc("Voice configuration for Azure custom voice.") +model AzureCustomVoice { + @doc("Name of the voice.") + name: string; + + @doc("Custom endpoint ID.") + endpoint_id: string; + + @doc("Voice type identifier.") + type: "azure-custom" | "custom"; + + @doc("Optional temperature for generation.") + temperature?: float32; + + @doc("Optional custom lexicon URL.") + custom_lexicon_url?: string; + + @doc("Preferred locale list for voice rendering.") + prefer_locales?: string[]; +} + +@doc("Voice configuration for Azure personal voice.") +model AzurePersonalVoice { + @doc("Name of the voice.") + name: string; + + @doc("Voice type identifier.") + type: "azure-personal" | "personal"; + + @doc("Personal voice model identifier.") + `model`: "DragonLatestNeural" | "PhoenixLatestNeural" | "PhoenixV2Neural"; +} + +@doc("Voice identifier for OpenAI-provided voices.") +union OAIVoice { + "alloy", + "ash", + "ballad", + "coral", + "echo", + "sage", + "shimmer", + "verse" +} + +@doc("Voice identifier for Phi4mm voices.") +union Phi4mmVoice { + "cosyvoice" +} + +@doc("Union of supported voice identifiers and configurations.") +union Voice { + OAIVoice, + AzureStandardVoice, + AzureCustomVoice, + AzurePersonalVoice, + Phi4mmVoice +} + union VoiceLiveAudioFormat { string, pcm16: "pcm16", diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 1ce8b1c9b6ac..2727df14eb14 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -262,11 +262,9 @@ model VoiceLiveResponse { conversation_id?: string; @doc(""" - The voice the model used to respond. - Current voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, - `onyx`, `nova`, `sage`, `shimmer`, and `verse`. + supported voice identifiers and configurations. """) - voice?: VoiceIdsShared; + voice?: Voice; @doc(""" The set of modalities the model used to respond. If there are multiple modalities, @@ -728,13 +726,6 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted /** The transcribed text. */ transcript: string; - - /** The log probabilities of the transcription. */ - logprobs?: LogProbProperties[] | null; - - // Tool customization: Substitute common discriminated type base - /** Usage statistics for the transcription. 
*/ - usage: TranscriptTextUsage; } // Tool customization (apply_discriminator): apply discriminated type @@ -1226,12 +1217,9 @@ model VoiceLiveResponseCreateParams { instructions?: string; @doc(""" - The voice the model uses to respond. Voice cannot be changed during the - session once the model has responded with audio at least once. Current - voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, - `onyx`, `nova`, `sage`, `shimmer`, and `verse`. + supported voice identifiers and configurations. """) - voice?: VoiceIdsShared; + voice?: Voice; // Tool customization: use extracted and reusable audio format definition @doc(""" @@ -1262,377 +1250,6 @@ model VoiceLiveResponseCreateParams { max_output_tokens?: int32 | "inf"; } -/** VoiceLive session object configuration. */ -model VoiceLiveSessionCreateRequest { - // Tool customization: Apply reusable modality representation - /** - * The set of modalities the model can respond with. To disable audio, - * set this to ["text"]. - */ - modalities?: VoiceLiveModality[]; - - /** The VoiceLive model used for this session. */ - `model`?: - | "gpt-4o-realtime-preview" - | "gpt-4o-realtime-preview-2024-10-01" - | "gpt-4o-realtime-preview-2024-12-17" - | "gpt-4o-realtime-preview-2025-06-03" - | "gpt-4o-mini-realtime-preview" - | "gpt-4o-mini-realtime-preview-2024-12-17"; - - @doc(""" - The default system instructions (i.e. system message) prepended to model calls. This field allows the client to guide the model on desired responses. The model can be instructed on response content and format, (e.g. "be extremely succinct", "act friendly", "here are examples of good responses") and on audio behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The instructions are not guaranteed to be followed by the model, but they provide guidance to the model on the desired behavior. - - Note that the server sets default instructions which will be used if this field is not set and are visible in the `session.created` event at the start of the session. - """) - instructions?: string; - - @doc(""" - The voice the model uses to respond. Voice cannot be changed during the - session once the model has responded with audio at least once. Current - voice options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `fable`, - `onyx`, `nova`, `sage`, `shimmer`, and `verse`. - """) - voice?: VoiceIdsShared; - - // Tool customization: use extracted and reusable audio format definition - @doc(""" - The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - For `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, - single channel (mono), and little-endian byte order. - """) - input_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; - - // Tool customization: use extracted and reusable audio format definition - @doc(""" - The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - For `pcm16`, output audio is sampled at a rate of 24kHz. - """) - output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; - - @doc(""" - Configuration for input audio transcription, defaults to off and can be set to `null` to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. 
Transcription runs asynchronously through [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service. - """) - input_audio_transcription?: { - @doc(""" - The model to use for transcription, current options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1`. - """) - `model`?: string; - - @doc(""" - The language of the input audio. Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) format - will improve accuracy and latency. - """) - language?: string; - - @doc(""" - An optional text to guide the model's style or continue a previous audio - segment. - For `whisper-1`, the [prompt is a list of keywords](/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology". - """) - prompt?: string; - }; - - @doc(""" - Configuration for turn detection, ether Server VAD or Semantic VAD. This can be set to `null` to turn off, in which case the client must manually trigger model response. - Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech. - Semantic VAD is more advanced and uses a turn detection model (in conjuction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability. For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking. This can be useful for more natural conversations, but may have a higher latency. - """) - turn_detection?: { - /** Type of turn detection. */ - type?: "server_vad" | "semantic_vad" = "server_vad"; - - @doc(""" - Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`. - """) - eagerness?: "low" | "medium" | "high" | "auto" = "auto"; - - @doc(""" - Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A - higher threshold will require louder audio to activate the model, and - thus might perform better in noisy environments. - """) - threshold?: float32; - - @doc(""" - Used only for `server_vad` mode. Amount of audio to include before the VAD detected speech (in - milliseconds). Defaults to 300ms. - """) - prefix_padding_ms?: int32; - - @doc(""" - Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds). Defaults - to 500ms. With shorter values the model will respond more quickly, - but may jump in on short pauses from the user. - """) - silence_duration_ms?: int32; - - /** Whether or not to automatically generate a response when a VAD stop event occurs. */ - create_response?: boolean = true; - - @doc(""" - Whether or not to automatically interrupt any ongoing response with output to the default - conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. - """) - interrupt_response?: boolean = true; - }; - - @doc(""" - Configuration for input audio noise reduction. This can be set to `null` to turn off. 
- Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. - Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. - """) - input_audio_noise_reduction?: { - @doc(""" - Type of noise reduction. `near_field` is for close-talking microphones such as headphones, `far_field` is for far-field microphones such as laptop or conference room microphones. - """) - type?: "near_field" | "far_field"; - } | null = null; - - /** - * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is - * the minimum speed. 1.5 is the maximum speed. This value can only be changed - * in between model turns, not while a response is in progress. - */ - @minValue(0.25) - @maxValue(1.5) - speed?: float32 = 1; - - @doc(""" - Configuration options for tracing. Set to null to disable tracing. Once - tracing is enabled for a session, the configuration cannot be modified. - - `auto` will create a trace for the session with default values for the - workflow name, group id, and metadata. - """) - tracing?: "auto" | { - /** - * The name of the workflow to attach to this trace. This is used to - * name the trace in the traces dashboard. - */ - workflow_name?: string; - - /** - * The group id to attach to this trace to enable filtering and - * grouping in the traces dashboard. - */ - group_id?: string; - - /** - * The arbitrary metadata to attach to this trace to enable - * filtering in the traces dashboard. - */ - metadata?: unknown; - }; - - // Tool customization: use enriched tool definition - /** Tools (functions) available to the model. */ - tools?: VoiceLiveTool[]; - - @doc(""" - How the model chooses tools. Options are `auto`, `none`, `required`, or - specify a function. - """) - tool_choice?: string = "auto"; - - /** Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a temperature of 0.8 is highly recommended for best performance. */ - temperature?: float32 = 0.8; - - @doc(""" - Maximum number of output tokens for a single assistant response, - inclusive of tool calls. Provide an integer between 1 and 4096 to - limit output tokens, or `inf` for the maximum available tokens for a - given model. Defaults to `inf`. - """) - max_response_output_tokens?: int32 | "inf"; - - /** Configuration options for the generated client secret. */ - client_secret?: { - /** Configuration for the ephemeral token expiration. */ - expires_after?: { - @doc(""" - The anchor point for the ephemeral token expiration. Only `created_at` is currently supported. - """) - anchor: "created_at"; - - @doc(""" - The number of seconds from the anchor point to the expiration. Select a value between `10` and `7200`. - """) - seconds?: int32 = 600; - }; - }; -} - -/** - * A new VoiceLive session configuration, with an ephermeral key. Default TTL - * for keys is one minute. - */ -model VoiceLiveSessionCreateResponse { - /** Ephemeral key returned by the API. */ - client_secret: { - /** - * Ephemeral key usable in client environments to authenticate connections - * to the VoiceLive API. Use this in client-side environments rather than - * a standard API token, which should only be used server-side. - */ - value: string; - - // Tool customization: 'created' and fields ending in '_at' are Unix encoded utcDateTime - /** - * Timestamp for when the token expires. Currently, all tokens expire - * after one minute. 
- */ - @encode("unixTimestamp", int32) - expires_at: utcDateTime; - }; - - // Tool customization: Apply reusable modality representation - /** - * The set of modalities the model can respond with. To disable audio, - * set this to ["text"]. - */ - modalities?: VoiceLiveModality[]; - - @doc(""" - The default system instructions (i.e. system message) prepended to model - calls. This field allows the client to guide the model on desired - responses. The model can be instructed on response content and format, - (e.g. "be extremely succinct", "act friendly", "here are examples of good - responses") and on audio behavior (e.g. "talk quickly", "inject emotion - into your voice", "laugh frequently"). The instructions are not guaranteed - to be followed by the model, but they provide guidance to the model on the - desired behavior. - - Note that the server sets default instructions which will be used if this - field is not set and are visible in the `session.created` event at the - start of the session. - """) - instructions?: string; - - @doc(""" - The voice the model uses to respond. Voice cannot be changed during the - session once the model has responded with audio at least once. Current - voice options are `alloy`, `ash`, `ballad`, `coral`, `echo` `sage`, - `shimmer` and `verse`. - """) - voice?: VoiceIdsShared; - - // Tool customization: use extracted and reusable audio format definition - @doc(""" - The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - """) - input_audio_format?: VoiceLiveAudioFormat; - - // Tool customization: use extracted and reusable audio format definition - @doc(""" - The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. - """) - output_audio_format?: VoiceLiveAudioFormat; - - @doc(""" - Configuration for input audio transcription, defaults to off and can be - set to `null` to turn off once on. Input audio transcription is not native - to the model, since the model consumes audio directly. Transcription runs - asynchronously and should be treated as rough guidance - rather than the representation understood by the model. - """) - input_audio_transcription?: { - /** The model to use for transcription. */ - `model`?: string; - }; - - /** - * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is - * the minimum speed. 1.5 is the maximum speed. This value can only be changed - * in between model turns, not while a response is in progress. - */ - @minValue(0.25) - @maxValue(1.5) - speed?: float32 = 1; - - @doc(""" - Configuration options for tracing. Set to null to disable tracing. Once - tracing is enabled for a session, the configuration cannot be modified. - - `auto` will create a trace for the session with default values for the - workflow name, group id, and metadata. - """) - tracing?: "auto" | { - /** - * The name of the workflow to attach to this trace. This is used to - * name the trace in the traces dashboard. - */ - workflow_name?: string; - - /** - * The group id to attach to this trace to enable filtering and - * grouping in the traces dashboard. - */ - group_id?: string; - - /** - * The arbitrary metadata to attach to this trace to enable - * filtering in the traces dashboard. - */ - metadata?: unknown; - }; - - @doc(""" - Configuration for turn detection. Can be set to `null` to turn off. Server - VAD means that the model will detect the start and end of speech based on - audio volume and respond at the end of user speech. 
- """) - turn_detection?: { - @doc(""" - Type of turn detection, only `server_vad` is currently supported. - """) - type?: string; - - /** - * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A - * higher threshold will require louder audio to activate the model, and - * thus might perform better in noisy environments. - */ - threshold?: float32; - - /** - * Amount of audio to include before the VAD detected speech (in - * milliseconds). Defaults to 300ms. - */ - prefix_padding_ms?: int32; - - /** - * Duration of silence to detect speech stop (in milliseconds). Defaults - * to 500ms. With shorter values the model will respond more quickly, - * but may jump in on short pauses from the user. - */ - silence_duration_ms?: int32; - }; - - // Tool customization: use enriched tool definition - /** Tools (functions) available to the model. */ - tools?: VoiceLiveTool[]; - - @doc(""" - How the model chooses tools. Options are `auto`, `none`, `required`, or - specify a function. - """) - tool_choice?: string; - - /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ - temperature?: float32; - - @doc(""" - Maximum number of output tokens for a single assistant response, - inclusive of tool calls. Provide an integer between 1 and 4096 to - limit output tokens, or `inf` for the maximum available tokens for a - given model. Defaults to `inf`. - """) - max_response_output_tokens?: int32 | "inf"; -} - /** The item to add to the conversation. */ model VoiceLiveConversationItemWithReference { @doc(""" diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp index 366fb68cf779..4978fb17fe46 100644 --- a/specification/ai/data-plane/VoiceLive/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -6,28 +6,6 @@ using TypeSpec.OpenAPI; namespace VoiceLive; -@route("voicelive") -@tag("VoiceLive") -interface VoiceLive { - @summary("Starts a real-time session for conversation or transcription.") - startVoiceLiveSession( - ...VoiceLiveBetaHeader, - @body requestMessages: VoiceLiveClientEvent[], - ): VoiceLiveServerEvent[]; - - @post - @route("sessions") - @operationId("create-voicelive-session") - @summary(""" - Create an ephemeral API token for use in client-side applications with the VoiceLive API. Can be configured with the same session parameters as the session.update client event. - - It responds with a session object, plus a client_secret key which contains a usable ephemeral API token that can be used to authenticate browser clients for the VoiceLive API. 
- """) - createEphemeralToken( - @body request: VoiceLiveSessionCreateRequest, - ): VoiceLiveSessionCreateResponse | ErrorResponse; - - alias VoiceLiveBetaHeader = { @header("VoiceLive-Beta") voiceLiveBeta: "voicelive=v1"; }; From ff392a788689031ec4db052a3710e70d968058a7 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 16 Jul 2025 17:52:31 -0700 Subject: [PATCH 11/48] fix missing output_audio_timestamp_types --- specification/ai/data-plane/VoiceLive/custom.tsp | 1 + 1 file changed, 1 insertion(+) diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 0d9dcf25f010..61275edfd1e1 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -46,6 +46,7 @@ model VoiceLiveResponseSession { input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; avatar?: VoiceLiveAvatarConfig; input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; + output_audio_timestamp_types?: VoiceLiveAudioTimestampType[]; tools: VoiceLiveTool[]; tool_choice: VoiceLiveToolChoice; temperature: float32; From 734f5be8d64bc16dc796db237675ef9fd6c0ff56 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 16 Jul 2025 17:57:49 -0700 Subject: [PATCH 12/48] make optional optional --- .../ai/data-plane/VoiceLive/custom.tsp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 61275edfd1e1..7e9a21038215 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -31,26 +31,26 @@ model VoiceLiveRequestSession { model VoiceLiveResponseSession { ...VoiceLiveSessionBase; - id: string; - `model`: string; - modalities: VoiceLiveModality[]; - instructions: string; + id?: string; + `model`?: string; + modalities?: VoiceLiveModality[]; + instructions?: string; animation?: VoiceLiveAnimation; - voice: Voice; + voice?: Voice; input_audio?: VoiceLiveInputAudio; - input_audio_format: VoiceLiveAudioFormat; - output_audio_format: VoiceLiveAudioFormat; + input_audio_format?: VoiceLiveAudioFormat; + output_audio_format?: VoiceLiveAudioFormat; input_audio_sampling_rate?: int32; - turn_detection: VoiceLiveTurnDetection; - input_audio_noise_reduction: VoiceLiveAudioNoiseReduction; + turn_detection?: VoiceLiveTurnDetection; + input_audio_noise_reduction?: VoiceLiveAudioNoiseReduction; input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; avatar?: VoiceLiveAvatarConfig; - input_audio_transcription: VoiceLiveAudioInputTranscriptionSettings | null; + input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings | null; output_audio_timestamp_types?: VoiceLiveAudioTimestampType[]; - tools: VoiceLiveTool[]; - tool_choice: VoiceLiveToolChoice; - temperature: float32; - max_response_output_tokens: int32 | "inf" | null; + tools?: VoiceLiveTool[]; + tool_choice?: VoiceLiveToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf" | null; agent?: AgentConfig; } From 79496d8c8abc6c9a158a3feb6df93e8975ae1d97 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Thu, 17 Jul 2025 10:43:52 -0700 Subject: [PATCH 13/48] Hide forced operation --- .../ai/data-plane/VoiceLive/client.tsp | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 
a47c9f377938..5c474b956899 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -3,7 +3,37 @@ import "./servers/websocket.tsp"; using Azure.ClientGenerator.Core; +@@access(VoiceLive.force_models, Access.internal, "python"); + @@access(VoiceLive.VoiceLiveServerEventSessionCreated, Access.public, "python"); @@access(VoiceLive.VoiceLiveServerEventResponseAudioDelta, Access.public, "python"); - +@@access(VoiceLive.VoiceLiveClientEventSessionUpdate, Access.public, "python"); +@@access(VoiceLive.VoiceLiveConversationResponseItem, Access.public, "python"); +@@access(VoiceLive.LogProbProperties, Access.public, "python"); +@@access(VoiceLive.VoiceLiveResponse, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationCreated, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemCreated, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemDeleted, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemRetrieved, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventConversationItemTruncated, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventError, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferCleared, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferCommitted, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferSpeechStarted, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferSpeechStopped, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseAudioDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseAudioTranscriptDelta, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseAudioTranscriptDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseContentPartAdded, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseContentPartDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseCreated, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseOutputItemAdded, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseOutputItemDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseTextDelta, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventResponseTextDone, Access.public, "python"); +@@access(VoiceLive.VoiceLiveServerEventSessionUpdated, Access.public, "python"); From 39d7165ef412f958235134ec3fafb94972e02884 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Fri, 18 Jul 2025 00:20:30 -0700 Subject: [PATCH 14/48] fix event mismatch between spec and voicelive --- .../ai/data-plane/VoiceLive/audio/client.tsp | 10 -- .../ai/data-plane/VoiceLive/audio/main.tsp | 1 - .../ai/data-plane/VoiceLive/audio/models.tsp | 48 -------- .../ai/data-plane/VoiceLive/client.tsp | 2 - .../ai/data-plane/VoiceLive/common/custom.tsp | 70 ------------ 
.../ai/data-plane/VoiceLive/common/main.tsp | 1 - .../ai/data-plane/VoiceLive/common/models.tsp | 19 +--- .../ai/data-plane/VoiceLive/custom.tsp | 3 + .../VoiceLive/custom/content_parts.tsp | 6 +- .../ai/data-plane/VoiceLive/custom/events.tsp | 1 - .../ai/data-plane/VoiceLive/custom/items.tsp | 50 +++++++- .../ai/data-plane/VoiceLive/models.tsp | 107 +++++------------- .../VoiceLive/servers/websocket.tsp | 1 - 13 files changed, 89 insertions(+), 230 deletions(-) delete mode 100644 specification/ai/data-plane/VoiceLive/audio/client.tsp delete mode 100644 specification/ai/data-plane/VoiceLive/audio/main.tsp delete mode 100644 specification/ai/data-plane/VoiceLive/audio/models.tsp delete mode 100644 specification/ai/data-plane/VoiceLive/common/custom.tsp diff --git a/specification/ai/data-plane/VoiceLive/audio/client.tsp b/specification/ai/data-plane/VoiceLive/audio/client.tsp deleted file mode 100644 index 2c877169e4c4..000000000000 --- a/specification/ai/data-plane/VoiceLive/audio/client.tsp +++ /dev/null @@ -1,10 +0,0 @@ -import "@azure-tools/typespec-client-generator-core"; -import "./models.tsp"; - -using Azure.ClientGenerator.Core; -using VoiceLive; - -// @@visibility(CreateTranscriptionResponseVerboseJson.words, "read"); -// @@visibility(CreateTranscriptionResponseVerboseJson.segments, "read"); - -// @@visibility(CreateTranslationResponseVerboseJson.segments, "read"); diff --git a/specification/ai/data-plane/VoiceLive/audio/main.tsp b/specification/ai/data-plane/VoiceLive/audio/main.tsp deleted file mode 100644 index 08c98a827656..000000000000 --- a/specification/ai/data-plane/VoiceLive/audio/main.tsp +++ /dev/null @@ -1 +0,0 @@ -import "./client.tsp"; \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/audio/models.tsp b/specification/ai/data-plane/VoiceLive/audio/models.tsp deleted file mode 100644 index 89ab83ad7ae9..000000000000 --- a/specification/ai/data-plane/VoiceLive/audio/models.tsp +++ /dev/null @@ -1,48 +0,0 @@ -// Cleaned TypeSpec file aligned with Python model definitions -// Removed models not found in Python code and adjusted field shapes to match Python baseline - -import "../common"; - -using TypeSpec.OpenAPI; - -namespace VoiceLive; - -@doc(""" - The format of the output, in one of these options: `json`, `text`, `srt`, `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, the only supported format is `json`. -""") -union AudioResponseFormat { - "json", - "text", - "srt", - "verbose_json", - "vtt", -} - -model VadConfig { - @doc(""" - Must be set to `server_vad` to enable manual chunking using server side VAD. - """) - type: "server_vad"; - - prefix_padding_ms?: int32 = 300; - silence_duration_ms?: int32 = 200; - threshold?: float32 = 0.5; -} - -@doc(""" - Controls how the audio is cut into chunks. When set to `"auto"`, the - server first normalizes loudness and then uses voice activity detection (VAD) to - choose boundaries. `server_vad` object can be provided to tweak VAD detection - parameters manually. If unset, the audio is transcribed as a single block. 
-""") -union TranscriptionChunkingStrategy { - "auto", - VadConfig, -} - -union TranscriptionInclude { - "logprobs", -} - -// Other models removed because they do not correspond to Python models or are redundant -// Please re-add as needed with proper alignment to the source Python definitions \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 5c474b956899..2c20a966ddb8 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -9,9 +9,7 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.VoiceLiveServerEventResponseAudioDelta, Access.public, "python"); @@access(VoiceLive.VoiceLiveClientEventSessionUpdate, Access.public, "python"); @@access(VoiceLive.VoiceLiveConversationResponseItem, Access.public, "python"); -@@access(VoiceLive.LogProbProperties, Access.public, "python"); @@access(VoiceLive.VoiceLiveResponse, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationCreated, Access.public, "python"); @@access(VoiceLive.VoiceLiveServerEventConversationItemCreated, Access.public, "python"); @@access(VoiceLive.VoiceLiveServerEventConversationItemDeleted, Access.public, "python"); @@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); diff --git a/specification/ai/data-plane/VoiceLive/common/custom.tsp b/specification/ai/data-plane/VoiceLive/common/custom.tsp deleted file mode 100644 index b1e3503e5d43..000000000000 --- a/specification/ai/data-plane/VoiceLive/common/custom.tsp +++ /dev/null @@ -1,70 +0,0 @@ -import "@typespec/http"; -import "@typespec/openapi"; - -using TypeSpec.Http; -using TypeSpec.OpenAPI; - -namespace VoiceLive; - -@discriminator("type") -model ResponseFormat { - type: "text" | "json_object" | "json_schema"; -} - -alias AcceptJsonHeader = { - @header accept: "application/json"; -}; - -alias AcceptJsonOrEventStreamHeader = { - @header accept: "application/json" | "text/event-stream"; -}; - -alias AssistantsBetaHeader = { - @header("VoiceLive-Beta") voiceLiveBeta: "assistants=v2"; -}; - -alias PageLimitQueryParameter = { - /** - * A limit on the number of objects to be returned. Limit can range between 1 and 100, and the - * default is 20. - */ - @query limit?: int32 = 20; -}; - -alias PageOrderQueryParameter = { - /** - * Sort order by the `created_at` timestamp of the objects. `asc` for ascending order and`desc` - * for descending order. - */ - @query order?: "asc" | "desc"; -}; - -alias PageAfterQueryParameter = { - /** - * A cursor for use in pagination. `after` is an object ID that defines your place in the list. - * For instance, if you make a list request and receive 100 objects, ending with obj_foo, your - * subsequent call can include after=obj_foo in order to fetch the next page of the list. - */ - @query after?: string; -}; - -alias PageBeforeQueryParameter = { - /** - * A cursor for use in pagination. `before` is an object ID that defines your place in the list. - * For instance, if you make a list request and receive 100 objects, ending with obj_foo, your - * subsequent call can include before=obj_foo in order to fetch the previous page of the list. 
- */ - @query before?: string; -}; - -alias CommonPageQueryParameters = { - ...PageLimitQueryParameter; - ...PageOrderQueryParameter; - ...PageAfterQueryParameter; - ...PageBeforeQueryParameter; -}; - -alias SseResponseOf = { - @header("Content-Type") contentType: "text/event-stream"; - @body responseBody: T; -}; diff --git a/specification/ai/data-plane/VoiceLive/common/main.tsp b/specification/ai/data-plane/VoiceLive/common/main.tsp index 223114fb0ebc..5ad1d3a2bec6 100644 --- a/specification/ai/data-plane/VoiceLive/common/main.tsp +++ b/specification/ai/data-plane/VoiceLive/common/main.tsp @@ -1,2 +1 @@ -import "./custom.tsp"; import "./models.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 2a4ac6846a7d..9c32523e893a 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -1,8 +1,7 @@ // Cleaned TypeSpec file aligned with Python model definitions // Removed models not defined or needed based on your Python code baseline - -import "./custom.tsp"; - +import "@typespec/http"; +import "@typespec/openapi"; using TypeSpec.OpenAPI; namespace VoiceLive; @@ -19,7 +18,10 @@ model Error { param?: string; @doc("Type or category of the error.") - type: string; + type?: string; + + @doc("Event id of the error.") + event_id?: string; } @error @@ -39,13 +41,4 @@ model LogProbProperties { @doc("The bytes that were used to generate the log probability.") bytes: int32[]; -} - -@doc("Enumerates available service tier configurations.") -union ServiceTier { - "auto", - "default", - "flex", - "scale", - "priority", } \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 7e9a21038215..d4fbf5b32b55 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -1,6 +1,9 @@ import "./custom/events.tsp"; import "./custom/items.tsp"; import "./custom/tools.tsp"; +import "@typespec/http"; +import "@typespec/openapi"; + using TypeSpec.OpenAPI; diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp index 4aa2e14c6724..34d4bc2bb2c1 100644 --- a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -17,7 +17,7 @@ model VoiceLiveContentPart { model VoiceLiveRequestTextContentPart extends VoiceLiveContentPart { type: VoiceLiveContentPartType.input_text; - text: string; + text?: string; } model VoiceLiveRequestAudioContentPart extends VoiceLiveContentPart { @@ -27,10 +27,10 @@ model VoiceLiveRequestAudioContentPart extends VoiceLiveContentPart { model VoiceLiveResponseTextContentPart extends VoiceLiveContentPart { type: VoiceLiveContentPartType.text; - text: string; + text?: string; } model VoiceLiveResponseAudioContentPart extends VoiceLiveContentPart { type: VoiceLiveContentPartType.audio; - transcript: string | null; + transcript?: string; } diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp index 9ec9165ce652..6926cf15aa90 100644 --- a/specification/ai/data-plane/VoiceLive/custom/events.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -30,7 +30,6 @@ union VoiceLiveServerEventType { session_avatar_connecting: "session.avatar.connecting", 
session_created: "session.created", session_updated: "session.updated", - conversation_created: "conversation.created", conversation_item_input_audio_transcription_completed: "conversation.item.input_audio_transcription.completed", conversation_item_input_audio_transcription_delta: "conversation.item.input_audio_transcription.delta", conversation_item_input_audio_transcription_failed: "conversation.item.input_audio_transcription.failed", diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index 9cdc0d9cbdb6..431e1d76c8d3 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -56,12 +56,26 @@ model VoiceLiveRequestMessageReferenceItem { // extends VoiceLiveConversationReq id: string; } +model VoiceLiveConversationItem { + ...VoiceLiveConversationItemBase; + id?: string; + arguments?: string; + call_id?: string; + content?: VoiceLiveConversationItemWithReferenceContent[]; + name?: string; + object?: "realtime.item"; + output?: string; + role?: "user" | "assistant" | "system"; + status?: "completed" | "incomplete" | "in_progress"; + type?: "message" | "function_call" | "function_call_output"; +} + @discriminator("type") model VoiceLiveConversationResponseItem { ...VoiceLiveConversationItemBase; - object: "voicelive.item"; - type: VoiceLiveItemType; - id: string | null; + object?: "realtime.item"; + type?: VoiceLiveItemType; + id?: string; } model VoiceLiveResponseMessageItem extends VoiceLiveConversationResponseItem { @@ -107,3 +121,33 @@ union VoiceLiveMessageRole { user: "user", assistant: "assistant", } + +// Tool generated type. Extracts from VoiceLiveConversationItemWithReference.content +alias VoiceLiveConversationItemWithReferenceContent = { + @doc(""" + The content type (`input_text`, `input_audio`, `item_reference`, `text`). + """) + type?: "input_audio" | "input_text" | "item_reference" | "text"; + + @doc(""" + The text content, used for `input_text` and `text` content types. + """) + text?: string; + + @doc(""" + ID of a previous conversation item to reference (for `item_reference` + content types in `response.create` events). These can reference both + client and server created items. + """) + id?: string; + + @doc(""" + Base64-encoded audio bytes, used for `input_audio` content type. + """) + audio?: string; + + @doc(""" + The transcript of the audio, used for `input_audio` content type. + """) + transcript?: string; +}; \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 2727df14eb14..ae38c5a6f7c3 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -4,7 +4,6 @@ */ import "./client.tsp"; -import "./audio"; import "./common"; import "./custom.tsp"; @@ -12,35 +11,6 @@ using TypeSpec.OpenAPI; namespace VoiceLive; -// Tool generated type. Extracts from VoiceLiveConversationItemWithReference.content -alias VoiceLiveConversationItemWithReferenceContent = { - @doc(""" - The content type (`input_text`, `input_audio`, `item_reference`, `text`). - """) - type?: "input_audio" | "input_text" | "item_reference" | "text"; - - @doc(""" - The text content, used for `input_text` and `text` content types. - """) - text?: string; - - @doc(""" - ID of a previous conversation item to reference (for `item_reference` - content types in `response.create` events). 
These can reference both - client and server created items. - """) - id?: string; - - @doc(""" - Base64-encoded audio bytes, used for `input_audio` content type. - """) - audio?: string; - - @doc(""" - The transcript of the audio, used for `input_audio` content type. - """) - transcript?: string; -}; // Tool customization: Adjust union to be a discriminated type base /** A voicelive client event. */ @@ -154,15 +124,15 @@ model VoiceLiveResponse { id?: string; @doc(""" - The object type, must be `voicelive.response`. + The object type, must be `realtime.response`. """) - object?: "voicelive.response"; + object?: "realtime.response"; @doc(""" The final status of the response (`completed`, `cancelled`, `failed`, or `incomplete`). """) - status?: "completed" | "cancelled" | "failed" | "incomplete"; + status?: "completed" | "cancelled" | "failed" | "incomplete" | "in_progress"; /** Additional details about the status. */ status_details?: { @@ -363,6 +333,11 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { """) type: VoiceLiveClientEventType.conversation_item_create; + @doc(""" + Optional client-generated ID used to identify this event. + """) + event_id?: string; + @doc(""" The ID of the preceding item after which the new item will be inserted. If not set, the new item will be appended to the end of the conversation. @@ -373,7 +348,7 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item: VoiceLiveConversationRequestItem; + item?: VoiceLiveConversationItem; } // Tool customization (apply_discriminator): apply discriminated type base @@ -455,6 +430,11 @@ model VoiceLiveClientEventResponseCreate extends VoiceLiveClientEvent { type: VoiceLiveClientEventType.response_create; response?: VoiceLiveResponseCreateParams; + + @doc(""" + additional instructions (system prompt) appended to the default instructions of the session. Only affects this response only. + """) + additional_instructions?: string } // Tool customization (apply_discriminator): apply discriminated type base @@ -561,26 +541,6 @@ model VoiceLiveServerEvent { event_id?: string; } -// Tool customization (apply_discriminator): apply discriminated type -/** Returned when a conversation is created. Emitted right after session creation. */ -model VoiceLiveServerEventConversationCreated extends VoiceLiveServerEvent { - @doc(""" - The event type, must be `conversation.created`. - """) - type: VoiceLiveServerEventType.conversation_created; - - /** The conversation resource. */ - conversation: { - /** The unique ID of the conversation. */ - id?: string; - - @doc(""" - The object type, must be `voicelive.conversation`. - """) - object?: string; - }; -} - // Tool customization (apply_discriminator): apply discriminated type @doc(""" Returned when an input audio buffer is committed, either by the client or @@ -595,7 +555,7 @@ model VoiceLiveServerEventInputAudioBufferCommitted extends VoiceLiveServerEvent type: VoiceLiveServerEventType.input_audio_buffer_committed; /** The ID of the preceding item after which the new item will be inserted. */ - previous_item_id: string; + previous_item_id?: string; /** The ID of the user message item that will be created. 
*/ item_id: string; @@ -694,7 +654,7 @@ model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { previous_item_id: string; // Tool customization: apply enriched item definition hierarchy - item: VoiceLiveConversationResponseItem; + item?: VoiceLiveConversationItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -749,19 +709,7 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed content_index: int32; /** Details of the transcription error. */ - error: { - /** The type of error. */ - type?: string; - - /** Error code, if any. */ - code?: string; - - /** A human-readable error message. */ - message?: string; - - /** Parameter related to the error, if any. */ - param?: string; - }; + error: Error; } // Tool customization (apply_discriminator): apply discriminated type @@ -787,6 +735,8 @@ model VoiceLiveServerEventConversationItemTruncated extends VoiceLiveServerEvent /** The duration up to which the audio was truncated, in milliseconds. */ audio_end_ms: int32; + + event_id?: string; } // Tool customization (apply_discriminator): apply discriminated type @@ -803,6 +753,8 @@ model VoiceLiveServerEventConversationItemDeleted extends VoiceLiveServerEvent { /** The ID of the item that was deleted. */ item_id: string; + + event_id?: string; } // Tool customization (apply_discriminator): apply discriminated type @@ -849,7 +801,7 @@ model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item: VoiceLiveConversationResponseItem; + item?: VoiceLiveConversationItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -870,7 +822,7 @@ model VoiceLiveServerEventResponseOutputItemDone extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item: VoiceLiveConversationResponseItem; + item?: VoiceLiveConversationResponseItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -1058,6 +1010,8 @@ model VoiceLiveServerEventResponseAudioDelta extends VoiceLiveServerEvent { /** Base64-encoded audio data delta. */ @encode("base64") delta: bytes; + + event_id?: string; } // Tool customization (apply_discriminator): apply discriminated type @@ -1116,7 +1070,7 @@ model ResponseEmotionHypothesis extends VoiceLiveServerEvent { candidates: EmotionCandidate[], audio_offset_ms: int32; audio_duration_ms: int32; - response_id: string; + response_id?: string; item_id: string; } @@ -1268,9 +1222,9 @@ model VoiceLiveConversationItemWithReference { type?: "message" | "function_call" | "function_call_output"; @doc(""" - Identifier for the API object being returned - always `voicelive.item`. + Identifier for the API object being returned - always `realtime.item`. """) - object?: "voicelive.item"; + object?: "realtime.item"; @doc(""" The status of the item (`completed`, `incomplete`). These have no effect @@ -1366,9 +1320,8 @@ model VoiceLiveServerEventConversationItemRetrieved extends VoiceLiveServerEvent The event type, must be `conversation.item.retrieved`. 
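For illustration, a minimal sketch of the client-side flow that produces the `input_audio_buffer.committed` event referenced above: append base64 audio, commit the buffer, then optionally request a response with per-response instructions. The wire shapes are inferred from the event models in this file; the silence bytes and the instruction text are placeholders, not part of the spec.

    import base64
    import json

    # Placeholder audio: 100 ms of silence at the default 24 kHz, 16-bit mono input rate.
    pcm16_silence = bytes(4800)

    append_event = {
        "type": "input_audio_buffer.append",
        "audio": base64.b64encode(pcm16_silence).decode("ascii"),  # base64-encoded audio bytes
    }
    commit_event = {"type": "input_audio_buffer.commit"}  # server replies with input_audio_buffer.committed
    response_create = {
        "type": "response.create",
        # Appended to the session's default instructions, affecting this response only.
        "additional_instructions": "Answer in one short sentence.",
    }

    for event in (append_event, commit_event, response_create):
        print(json.dumps(event)[:100])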
""") type: VoiceLiveServerEventType.conversation_item_retrieved; - - // Tool customization: apply enriched item definition hierarchy - item: VoiceLiveConversationResponseItem; + item_id?: string; + event_id?: string; } model EmotionCandidate { diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index e34bef9deb5d..f78f80c2ae8a 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -17,7 +17,6 @@ op force_models(session: VoiceLiveClientEventSessionUpdate ): VoiceLiveServerEventError | VoiceLiveServerEventResponseTextDelta | VoiceLiveServerEventResponseAudioDelta | - VoiceLiveServerEventConversationCreated| VoiceLiveServerEventConversationItemCreated| VoiceLiveServerEventConversationItemDeleted| VoiceLiveServerEventConversationItemRetrieved| From 9d47a478da3a023e51e90f4f8d2812eaff686628 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 23 Jul 2025 11:02:15 -0700 Subject: [PATCH 15/48] update ConversationItem --- .../ai/data-plane/VoiceLive/custom/items.tsp | 14 -------------- specification/ai/data-plane/VoiceLive/models.tsp | 6 +++--- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index 431e1d76c8d3..cf852f24e0f5 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -56,20 +56,6 @@ model VoiceLiveRequestMessageReferenceItem { // extends VoiceLiveConversationReq id: string; } -model VoiceLiveConversationItem { - ...VoiceLiveConversationItemBase; - id?: string; - arguments?: string; - call_id?: string; - content?: VoiceLiveConversationItemWithReferenceContent[]; - name?: string; - object?: "realtime.item"; - output?: string; - role?: "user" | "assistant" | "system"; - status?: "completed" | "incomplete" | "in_progress"; - type?: "message" | "function_call" | "function_call_output"; -} - @discriminator("type") model VoiceLiveConversationResponseItem { ...VoiceLiveConversationItemBase; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index ae38c5a6f7c3..284e9436c8e2 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -348,7 +348,7 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItem; + item?: VoiceLiveConversationItemWithReferenceContent; } // Tool customization (apply_discriminator): apply discriminated type base @@ -654,7 +654,7 @@ model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { previous_item_id: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItem; + item?: VoiceLiveConversationItemWithReferenceContent; } // Tool customization (apply_discriminator): apply discriminated type @@ -801,7 +801,7 @@ model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItem; + item?: VoiceLiveConversationItemWithReferenceContent; } // Tool customization (apply_discriminator): apply discriminated type From 
3ea64a958135069da4185465a466565f8d8ce2b2 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Wed, 23 Jul 2025 11:06:41 -0700 Subject: [PATCH 16/48] typo --- specification/ai/data-plane/VoiceLive/models.tsp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 284e9436c8e2..f4f54536902a 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -348,7 +348,7 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReferenceContent; + item?: VoiceLiveConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type base @@ -654,7 +654,7 @@ model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { previous_item_id: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReferenceContent; + item?: VoiceLiveConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type @@ -801,7 +801,7 @@ model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReferenceContent; + item?: VoiceLiveConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type From 4f1195118d8c51a34acdf299c56ecfb260071138 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Mon, 28 Jul 2025 16:36:55 -0700 Subject: [PATCH 17/48] Add EntraID and move some other models --- .../ai/data-plane/VoiceLive/client.tsp | 1 + .../ai/data-plane/VoiceLive/common/models.tsp | 4 +- .../ai/data-plane/VoiceLive/models.tsp | 2 +- .../ai/data-plane/VoiceLive/operations.tsp | 7 ++ .../VoiceLive/servers/websocket.tsp | 84 +++++++++++-------- 5 files changed, 62 insertions(+), 36 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 2c20a966ddb8..0caa9614a4bb 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -4,6 +4,7 @@ import "./servers/websocket.tsp"; using Azure.ClientGenerator.Core; @@access(VoiceLive.force_models, Access.internal, "python"); +@@access(VoiceLive.force_models, Access.internal, "csharp"); @@access(VoiceLive.VoiceLiveServerEventSessionCreated, Access.public, "python"); @@access(VoiceLive.VoiceLiveServerEventResponseAudioDelta, Access.public, "python"); diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 9c32523e893a..1bf792617f3d 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -7,7 +7,7 @@ using TypeSpec.OpenAPI; namespace VoiceLive; @doc("Error object returned in case of API failure.") -model Error { +model ErrorDetails { @doc("Error code, or null if unspecified.") code?: string; @@ -28,7 +28,7 @@ model Error { @doc("Standard error response envelope.") model ErrorResponse { @doc("Error object returned in case of API failure.") - error: Error; + error: ErrorDetails; } @doc("A single log probability entry for a token.") diff --git 
a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index f4f54536902a..4c7f249bb5e1 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -709,7 +709,7 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed content_index: int32; /** Details of the transcription error. */ - error: Error; + error: ErrorDetails; } // Tool customization (apply_discriminator): apply discriminated type diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp index 4978fb17fe46..7beb943459a7 100644 --- a/specification/ai/data-plane/VoiceLive/operations.tsp +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -1,11 +1,18 @@ import "./common"; import "./models.tsp"; +import "@azure-tools/typespec-azure-core"; using TypeSpec.Http; using TypeSpec.OpenAPI; +using TypeSpec.Versioning; namespace VoiceLive; alias VoiceLiveBetaHeader = { @header("VoiceLive-Beta") voiceLiveBeta: "voicelive=v1"; }; + +enum Versions { + @useDependency(Azure.Core.Versions.v1_0_Preview_2) + v2025_05_01_preview: "2025-05-01-preview", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index f78f80c2ae8a..b8a826c0168b 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -1,40 +1,58 @@ import "@typespec/http"; +import "@typespec/versioning"; +import "@azure-tools/typespec-azure-core"; + import "../models.tsp"; -using TypeSpec.Http; +import "../operations.tsp"; -@service(#{ - title: "VoiceLive API" +using TypeSpec.Http; +using TypeSpec.Versioning; +using Azure.Core; -}) -@server("wss://api.voicelive.com/v1", "VoiceLive Endpoint") +@service(#{ title: "VoiceLive"}) +@versioned(VoiceLive.Versions) +@useAuth( + ApiKeyAuth | AadOauth2Auth<[ + "https://cognitiveservices.azure.com/.default" + ]> +) +@server( + "{endpoint}/voice-agent/realtime", + "VoiceLive Endpoint", + { + @doc(""" + Azure AI VoiceLive endpoint. 
+ """) + endpoint: url, + } +) -@useAuth(BearerAuth) namespace VoiceLive; -op force_models(session: VoiceLiveClientEventSessionUpdate ): - VoiceLiveServerEventSessionUpdated | - VoiceLiveServerEventSessionCreated | - VoiceLiveServerEventError | - VoiceLiveServerEventResponseTextDelta | - VoiceLiveServerEventResponseAudioDelta | - VoiceLiveServerEventConversationItemCreated| - VoiceLiveServerEventConversationItemDeleted| - VoiceLiveServerEventConversationItemRetrieved| - VoiceLiveServerEventConversationItemTruncated| - VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted| - VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta| - VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed| - VoiceLiveServerEventInputAudioBufferCommitted| - VoiceLiveServerEventInputAudioBufferCleared| - VoiceLiveServerEventInputAudioBufferSpeechStarted| - VoiceLiveServerEventInputAudioBufferSpeechStopped| - VoiceLiveServerEventResponseCreated| - VoiceLiveServerEventResponseDone| - VoiceLiveServerEventResponseOutputItemAdded| - VoiceLiveServerEventResponseOutputItemDone| - VoiceLiveServerEventResponseContentPartAdded| - VoiceLiveServerEventResponseContentPartDone| - VoiceLiveServerEventResponseTextDone| - VoiceLiveServerEventResponseAudioTranscriptDelta| - VoiceLiveServerEventResponseAudioTranscriptDone| - VoiceLiveServerEventResponseAudioDone; +op force_models(session: VoiceLiveClientEventSessionUpdate): + VoiceLiveServerEventSessionUpdated + | VoiceLiveServerEventSessionCreated + | VoiceLiveServerEventError + | VoiceLiveServerEventResponseTextDelta + | VoiceLiveServerEventResponseAudioDelta + | VoiceLiveServerEventConversationItemCreated + | VoiceLiveServerEventConversationItemDeleted + | VoiceLiveServerEventConversationItemRetrieved + | VoiceLiveServerEventConversationItemTruncated + | VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted + | VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta + | VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed + | VoiceLiveServerEventInputAudioBufferCommitted + | VoiceLiveServerEventInputAudioBufferCleared + | VoiceLiveServerEventInputAudioBufferSpeechStarted + | VoiceLiveServerEventInputAudioBufferSpeechStopped + | VoiceLiveServerEventResponseCreated + | VoiceLiveServerEventResponseDone + | VoiceLiveServerEventResponseOutputItemAdded + | VoiceLiveServerEventResponseOutputItemDone + | VoiceLiveServerEventResponseContentPartAdded + | VoiceLiveServerEventResponseContentPartDone + | VoiceLiveServerEventResponseTextDone + | VoiceLiveServerEventResponseAudioTranscriptDelta + | VoiceLiveServerEventResponseAudioTranscriptDone + | VoiceLiveServerEventResponseAudioDone; \ No newline at end of file From 0e2bf6090c89df079da0a2c35dcda98ca89bff99 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Mon, 28 Jul 2025 18:05:35 -0700 Subject: [PATCH 18/48] remove voice live prefix in model names --- .../ai/data-plane/VoiceLive/client.tsp | 58 ++-- .../ai/data-plane/VoiceLive/custom.tsp | 92 +++--- .../VoiceLive/custom/content_parts.tsp | 22 +- .../ai/data-plane/VoiceLive/custom/events.tsp | 4 +- .../ai/data-plane/VoiceLive/custom/items.tsp | 88 +++--- .../ai/data-plane/VoiceLive/custom/tools.tsp | 26 +- .../ai/data-plane/VoiceLive/models.tsp | 272 +++++++++--------- .../VoiceLive/servers/websocket.tsp | 54 ++-- 8 files changed, 308 insertions(+), 308 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 0caa9614a4bb..33d0a005c8b8 100644 
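As a usage sketch for the WebSocket server and versioning defined above: the connection URL combines the caller's Azure AI endpoint with the `/voice-agent/realtime` path and the `2025-05-01-preview` version, and the first client event is typically `session.update`. The `api-version` query parameter name and the `api-key` header name are assumptions based on common Azure data-plane conventions (only the auth schemes, not their wire names, are visible in this diff); the endpoint, key, and instruction text are placeholders.

    import json

    endpoint = "https://my-resource.cognitiveservices.azure.com"  # placeholder resource endpoint
    url = endpoint.replace("https://", "wss://") + "/voice-agent/realtime?api-version=2025-05-01-preview"
    headers = {"api-key": "<key>"}  # or an Entra ID bearer token for the cognitiveservices scope

    # Typical first client event after the socket opens.
    session_update = {
        "type": "session.update",
        "session": {
            "modalities": ["text", "audio"],
            "instructions": "You are a helpful voice assistant.",
            "input_audio_format": "pcm16",
            "output_audio_format": "pcm16",
            "turn_detection": {"type": "server_vad"},
        },
    }
    print(url)
    print(json.dumps(session_update, indent=2))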
--- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -6,33 +6,33 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.force_models, Access.internal, "python"); @@access(VoiceLive.force_models, Access.internal, "csharp"); -@@access(VoiceLive.VoiceLiveServerEventSessionCreated, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseAudioDelta, Access.public, "python"); -@@access(VoiceLive.VoiceLiveClientEventSessionUpdate, Access.public, "python"); -@@access(VoiceLive.VoiceLiveConversationResponseItem, Access.public, "python"); -@@access(VoiceLive.VoiceLiveResponse, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemCreated, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemDeleted, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemRetrieved, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventConversationItemTruncated, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventError, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferCleared, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferCommitted, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferSpeechStarted, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventInputAudioBufferSpeechStopped, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseAudioDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseAudioTranscriptDelta, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseAudioTranscriptDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseContentPartAdded, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseContentPartDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseCreated, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseOutputItemAdded, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseOutputItemDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseTextDelta, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventResponseTextDone, Access.public, "python"); -@@access(VoiceLive.VoiceLiveServerEventSessionUpdated, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDelta, Access.public, "python"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "python"); +@@access(VoiceLive.ConversationResponseItem, Access.public, "python"); +@@access(VoiceLive.Response, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemDeleted, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); 
+@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemRetrieved, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemTruncated, Access.public, "python"); +@@access(VoiceLive.ServerEventError, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCleared, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCommitted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStarted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStopped, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDone, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index d4fbf5b32b55..9c921ff0099d 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -9,49 +9,49 @@ using TypeSpec.OpenAPI; namespace VoiceLive; -model VoiceLiveRequestSession { - ...VoiceLiveSessionBase; +model RequestSession { + ...SessionBase; `model`?: string; - modalities?: VoiceLiveModality[]; - animation?: VoiceLiveAnimation; + modalities?: Modality[]; + animation?: Animation; voice?: Voice; instructions?: string; - input_audio?: VoiceLiveInputAudio; + input_audio?: InputAudio; input_audio_sampling_rate?: int32 = 24000; - input_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; - output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; - turn_detection?: VoiceLiveTurnDetection | null; - input_audio_noise_reduction?: VoiceLiveAudioNoiseReduction; - input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; - avatar?: VoiceLiveAvatarConfig; - input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings; - output_audio_timestamp_types?: VoiceLiveAudioTimestampType[]; - tools?: VoiceLiveTool[]; - tool_choice?: VoiceLiveToolChoice; + input_audio_format?: AudioFormat = AudioFormat.pcm16; + output_audio_format?: AudioFormat = AudioFormat.pcm16; + turn_detection?: TurnDetection | null; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; temperature?: float32; max_response_output_tokens?: int32 | "inf"; } -model 
VoiceLiveResponseSession { - ...VoiceLiveSessionBase; +model ResponseSession { + ...SessionBase; id?: string; `model`?: string; - modalities?: VoiceLiveModality[]; + modalities?: Modality[]; instructions?: string; - animation?: VoiceLiveAnimation; + animation?: Animation; voice?: Voice; - input_audio?: VoiceLiveInputAudio; - input_audio_format?: VoiceLiveAudioFormat; - output_audio_format?: VoiceLiveAudioFormat; + input_audio?: InputAudio; + input_audio_format?: AudioFormat; + output_audio_format?: AudioFormat; input_audio_sampling_rate?: int32; - turn_detection?: VoiceLiveTurnDetection; - input_audio_noise_reduction?: VoiceLiveAudioNoiseReduction; - input_audio_echo_cancellation?: VoiceLiveAudioEchoCancellation; - avatar?: VoiceLiveAvatarConfig; - input_audio_transcription?: VoiceLiveAudioInputTranscriptionSettings | null; - output_audio_timestamp_types?: VoiceLiveAudioTimestampType[]; - tools?: VoiceLiveTool[]; - tool_choice?: VoiceLiveToolChoice; + turn_detection?: TurnDetection; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings | null; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; temperature?: float32; max_response_output_tokens?: int32 | "inf" | null; agent?: AgentConfig; @@ -128,20 +128,20 @@ union Voice { Phi4mmVoice } -union VoiceLiveAudioFormat { +union AudioFormat { string, pcm16: "pcm16", g711_ulaw: "g711_ulaw", g711_alaw: "g711_alaw", } -union VoiceLiveAudioInputTranscriptionModel { +union AudioInputTranscriptionModel { string, whisper_1: "whisper-1", } @doc("Configuration for input audio transcription.") -model VoiceLiveAudioInputTranscriptionSettings { +model AudioInputTranscriptionSettings { @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") `model`: "whisper-1" | "azure-fast-transcription" | "s2s-ingraph"; @@ -155,7 +155,7 @@ model VoiceLiveAudioInputTranscriptionSettings { custom_model: boolean; } -union VoiceLiveModality { +union Modality { string, text: "text", audio: "audio", @@ -165,17 +165,17 @@ union VoiceLiveModality { @discriminator("type") @doc("Top-level union for turn detection configuration.") -model VoiceLiveTurnDetection { +model TurnDetection { type: "none" | "server_vad" | "azure_semantic_vad"; } @doc("Disables turn detection.") -model VoiceLiveNoTurnDetection extends VoiceLiveTurnDetection { +model NoTurnDetection extends TurnDetection { type: "none"; } @doc("Base model for VAD-based turn detection.") -model VoiceLiveServerVad extends VoiceLiveTurnDetection { +model ServerVad extends TurnDetection { type: "server_vad"; threshold?: float32; prefix_padding_ms?: int32; @@ -184,7 +184,7 @@ model VoiceLiveServerVad extends VoiceLiveTurnDetection { } @doc("Semantic VAD settings based on Azure SDK features.") -model VoiceLiveAzureSemanticVad extends VoiceLiveTurnDetection { +model AzureSemanticVad extends TurnDetection { type: "azure_semantic_vad"; neg_threshold?: float32; window_size?: int32; @@ -194,13 +194,13 @@ model VoiceLiveAzureSemanticVad extends VoiceLiveTurnDetection { } @doc("Configuration for input audio noise reduction.") -model VoiceLiveAudioNoiseReduction { +model AudioNoiseReduction { @doc("The type of noise reduction model.") type: "azure_deep_noise_suppression"; } @doc("Configuration for client audio input. 
Used to specify the audio model and optional phrase list.") -model VoiceLiveInputAudio { +model InputAudio { @doc("The name of the model to use for input audio (currently only 'azure-standard' is supported).") `model`: "azure-standard"; @@ -209,39 +209,39 @@ model VoiceLiveInputAudio { } @doc("Echo cancellation configuration for server-side audio processing.") -model VoiceLiveAudioEchoCancellation { +model AudioEchoCancellation { @doc("The type of echo cancellation model to use.") type: "server_echo_cancellation"; } @doc("Output timestamp types supported in audio response content.") -union VoiceLiveAudioTimestampType { +union AudioTimestampType { string, @doc("Timestamps per word in the output audio.") word: "word", } @doc("Specifies the types of animation data to output.") -union VoiceLiveAnimationOutputType { +union AnimationOutputType { blendshapes: "blendshapes", viseme_id: "viseme_id", emotion: "emotion", } @doc("Configuration for animation outputs including blendshapes, visemes, and emotion metadata.") -model VoiceLiveAnimation { +model Animation { @doc("The name of the animation model to use.") model_name?: string = "default"; @doc("Set of output data types requested from the animation system.") - outputs?: VoiceLiveAnimationOutputType[] = #[VoiceLiveAnimationOutputType.blendshapes]; + outputs?: AnimationOutputType[] = #[AnimationOutputType.blendshapes]; @doc("Interval for emotion detection in milliseconds. If not set, emotion detection is disabled.") emotion_detection_interval_ms?: int32; } @doc("Configuration for avatar streaming and behavior during the session.") -model VoiceLiveAvatarConfig { +model AvatarConfig { @doc("Optional list of ICE servers to use for WebRTC connection establishment.") ice_servers?: IceServer[]; diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp index 34d4bc2bb2c1..ff4c86acd705 100644 --- a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -2,7 +2,7 @@ using TypeSpec.OpenAPI; namespace VoiceLive; -union VoiceLiveContentPartType { +union ContentPartType { string, input_text: "input_text", input_audio: "input_audio", @@ -11,26 +11,26 @@ union VoiceLiveContentPartType { } @discriminator("type") -model VoiceLiveContentPart { - type: VoiceLiveContentPartType; +model ContentPart { + type: ContentPartType; } -model VoiceLiveRequestTextContentPart extends VoiceLiveContentPart { - type: VoiceLiveContentPartType.input_text; +model RequestTextContentPart extends ContentPart { + type: ContentPartType.input_text; text?: string; } -model VoiceLiveRequestAudioContentPart extends VoiceLiveContentPart { - type: VoiceLiveContentPartType.input_audio; +model RequestAudioContentPart extends ContentPart { + type: ContentPartType.input_audio; transcript?: string; } -model VoiceLiveResponseTextContentPart extends VoiceLiveContentPart { - type: VoiceLiveContentPartType.text; +model ResponseTextContentPart extends ContentPart { + type: ContentPartType.text; text?: string; } -model VoiceLiveResponseAudioContentPart extends VoiceLiveContentPart { - type: VoiceLiveContentPartType.audio; +model ResponseAudioContentPart extends ContentPart { + type: ContentPartType.audio; transcript?: string; } diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp index 6926cf15aa90..048697ad32b3 100644 --- 
a/specification/ai/data-plane/VoiceLive/custom/events.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -3,7 +3,7 @@ using TypeSpec.OpenAPI; namespace VoiceLive; @doc("Client event types used in VoiceLive protocol.") -union VoiceLiveClientEventType { +union ClientEventType { string, session_update: "session.update", input_audio_buffer_append: "input_audio_buffer.append", @@ -24,7 +24,7 @@ union VoiceLiveClientEventType { } @doc("Server event types used in VoiceLive protocol.") -union VoiceLiveServerEventType { +union ServerEventType { string, error: "error", session_avatar_connecting: "session.avatar.connecting", diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index cf852f24e0f5..e4e3cf0a8c99 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -5,45 +5,45 @@ using TypeSpec.OpenAPI; namespace VoiceLive; @discriminator("type") -model VoiceLiveConversationRequestItem { - ...VoiceLiveConversationItemBase; - type: VoiceLiveItemType; +model ConversationRequestItem { + ...ConversationItemBase; + type: ItemType; id?: string; } @discriminator("role") -model VoiceLiveRequestMessageItem extends VoiceLiveConversationRequestItem { - type: VoiceLiveItemType.message; - role: VoiceLiveMessageRole; - status?: VoiceLiveItemStatus; +model RequestMessageItem extends ConversationRequestItem { + type: ItemType.message; + role: MessageRole; + status?: ItemStatus; } -model VoiceLiveRequestSystemMessageItem extends VoiceLiveRequestMessageItem { - role: VoiceLiveMessageRole.system; - content: VoiceLiveRequestTextContentPart[]; +model RequestSystemMessageItem extends RequestMessageItem { + role: MessageRole.system; + content: RequestTextContentPart[]; } -model VoiceLiveRequestUserMessageItem extends VoiceLiveRequestMessageItem { - role: VoiceLiveMessageRole.user; - content: (VoiceLiveRequestTextContentPart | VoiceLiveRequestAudioContentPart)[]; +model RequestUserMessageItem extends RequestMessageItem { + role: MessageRole.user; + content: (RequestTextContentPart | RequestAudioContentPart)[]; } -model VoiceLiveRequestAssistantMessageItem extends VoiceLiveRequestMessageItem { - role: VoiceLiveMessageRole.assistant; - content: VoiceLiveRequestTextContentPart[]; +model RequestAssistantMessageItem extends RequestMessageItem { + role: MessageRole.assistant; + content: RequestTextContentPart[]; } -model VoiceLiveRequestFunctionCallItem extends VoiceLiveConversationRequestItem { - type: VoiceLiveItemType.function_call; +model RequestFunctionCallItem extends ConversationRequestItem { + type: ItemType.function_call; name: string; call_id: string; arguments: string; - status?: VoiceLiveItemStatus; + status?: ItemStatus; } -model VoiceLiveRequestFunctionCallOutputItem - extends VoiceLiveConversationRequestItem { - type: VoiceLiveItemType.function_call_output; +model RequestFunctionCallOutputItem + extends ConversationRequestItem { + type: ItemType.function_call_output; call_id: string; output: string; } @@ -51,65 +51,65 @@ model VoiceLiveRequestFunctionCallOutputItem // TODO: representation of a doubly-discriminated type with an absent second discriminator // (first discriminator: type = message; second discriminator: no role present) -model VoiceLiveRequestMessageReferenceItem { // extends VoiceLiveConversationRequestItem { - type: VoiceLiveItemType.message; +model RequestMessageReferenceItem { // extends ConversationRequestItem { + type: 
ItemType.message; id: string; } @discriminator("type") -model VoiceLiveConversationResponseItem { - ...VoiceLiveConversationItemBase; +model ConversationResponseItem { + ...ConversationItemBase; object?: "realtime.item"; - type?: VoiceLiveItemType; + type?: ItemType; id?: string; } -model VoiceLiveResponseMessageItem extends VoiceLiveConversationResponseItem { - type: VoiceLiveItemType.message; - role: VoiceLiveMessageRole; - content: VoiceLiveContentPart[]; - status: VoiceLiveItemStatus; +model ResponseMessageItem extends ConversationResponseItem { + type: ItemType.message; + role: MessageRole; + content: ContentPart[]; + status: ItemStatus; } -model VoiceLiveResponseFunctionCallItem - extends VoiceLiveConversationResponseItem { - type: VoiceLiveItemType.function_call; +model ResponseFunctionCallItem + extends ConversationResponseItem { + type: ItemType.function_call; name: string; call_id: string; arguments: string; - status: VoiceLiveItemStatus; + status: ItemStatus; } -model VoiceLiveResponseFunctionCallOutputItem - extends VoiceLiveConversationResponseItem { - type: VoiceLiveItemType.function_call_output; +model ResponseFunctionCallOutputItem + extends ConversationResponseItem { + type: ItemType.function_call_output; call_id: string; output: string; } -union VoiceLiveItemType { +union ItemType { string, message: "message", function_call: "function_call", function_call_output: "function_call_output", } -union VoiceLiveItemStatus { +union ItemStatus { string, in_progress: "in_progress", completed: "completed", incomplete: "incomplete", } -union VoiceLiveMessageRole { +union MessageRole { string, system: "system", user: "user", assistant: "assistant", } -// Tool generated type. Extracts from VoiceLiveConversationItemWithReference.content -alias VoiceLiveConversationItemWithReferenceContent = { +// Tool generated type. Extracts from ConversationItemWithReference.content +alias ConversationItemWithReferenceContent = { @doc(""" The content type (`input_text`, `input_audio`, `item_reference`, `text`). """) diff --git a/specification/ai/data-plane/VoiceLive/custom/tools.tsp b/specification/ai/data-plane/VoiceLive/custom/tools.tsp index d7e24cd8f1ee..b346377da821 100644 --- a/specification/ai/data-plane/VoiceLive/custom/tools.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/tools.tsp @@ -6,7 +6,7 @@ namespace VoiceLive; * The supported tool type discriminators for voicelive tools. * Currently, only 'function' tools are supported. */ -union VoiceLiveToolType { +union ToolType { string, function: "function", } @@ -15,15 +15,15 @@ union VoiceLiveToolType { * The base representation of a voicelive tool definition. */ @discriminator("type") -model VoiceLiveTool { - type: VoiceLiveToolType; +model Tool { + type: ToolType; } /** * The definition of a function tool as used by the voicelive endpoint. */ -model VoiceLiveFunctionTool extends VoiceLiveTool { - type: VoiceLiveToolType.function; +model FunctionTool extends Tool { + type: ToolType.function; name: string; description?: string; parameters?: unknown; @@ -33,15 +33,15 @@ model VoiceLiveFunctionTool extends VoiceLiveTool { * The combined set of available representations for a voicelive tool_choice parameter, encompassing both string * literal options like 'auto' as well as structured references to defined tools. 
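For illustration, a sketch of how the function tool and tool_choice shapes in this file would appear in a session configuration. Only the field names come from the spec; the tool name, description, and JSON-schema parameters are hypothetical.

    import json

    tools_config = {
        "tools": [
            {
                "type": "function",
                "name": "get_weather",  # hypothetical tool name
                "description": "Look up the current weather for a city.",
                "parameters": {  # free-form JSON schema; modeled as `unknown` in the spec
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            }
        ],
        # Either a string literal such as "auto", or an object naming a specific function.
        "tool_choice": {"type": "function", "function": {"name": "get_weather"}},
    }
    print(json.dumps(tools_config, indent=2))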
*/ -union VoiceLiveToolChoice { - VoiceLiveToolChoiceLiteral, - VoiceLiveToolChoiceObject, +union ToolChoice { + ToolChoiceLiteral, + ToolChoiceObject, } /** * The available set of mode-level, string literal tool_choice options for the voicelive endpoint. */ -union VoiceLiveToolChoiceLiteral { +union ToolChoiceLiteral { string, /** Specifies that the model should freely determine which tool or tools, if any, to call. */ @@ -58,15 +58,15 @@ union VoiceLiveToolChoiceLiteral { * A base representation for a voicelive tool_choice selecting a named tool. */ @discriminator("type") -model VoiceLiveToolChoiceObject { - type: VoiceLiveToolType; +model ToolChoiceObject { + type: ToolType; } /** * The representation of a voicelive tool_choice selecting a named function tool. */ -model VoiceLiveToolChoiceFunctionObject extends VoiceLiveToolChoiceObject { - type: VoiceLiveToolType.function; +model ToolChoiceFunctionObject extends ToolChoiceObject { + type: ToolType.function; function: { name: string; }; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 4c7f249bb5e1..c8f80dd8cef8 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -15,9 +15,9 @@ namespace VoiceLive; // Tool customization: Adjust union to be a discriminated type base /** A voicelive client event. */ @discriminator("type") -model VoiceLiveClientEvent { +model ClientEvent { /** The type of event. */ - type: VoiceLiveClientEventType; + type: ClientEventType; event_id?: string; } @@ -35,23 +35,23 @@ model VoiceLiveClientEvent { Only the fields that are present are updated. To clear a field like `instructions`, pass an empty string. """) -model VoiceLiveClientEventSessionUpdate extends VoiceLiveClientEvent { +model ClientEventSessionUpdate extends ClientEvent { @doc(""" The event type, must be `session.update`. """) - type: VoiceLiveClientEventType.session_update; + type: ClientEventType.session_update; // Tool customization: apply enriched request-specific model - session: VoiceLiveRequestSession; + session: RequestSession; } @doc(""" Sent when the client connects and provides its SDP (Session Description Protocol) for avatar-related media negotiation. """) -model VoiceLiveClientEventSessionAvatarConnect extends VoiceLiveClientEvent { +model ClientEventSessionAvatarConnect extends ClientEvent { @doc("The event type, must be 'session.avatar.connect'.") - type: VoiceLiveClientEventType.session_avatar_connect; + type: ClientEventType.session_avatar_connect; @doc("The client's SDP offer.") client_sdp: string; @@ -60,9 +60,9 @@ model VoiceLiveClientEventSessionAvatarConnect extends VoiceLiveClientEvent { @doc(""" Indicates the start of a new audio input turn. """) -model VoiceLiveClientEventInputAudioTurnStart extends VoiceLiveClientEvent { +model ClientEventInputAudioTurnStart extends ClientEvent { @doc("The event type, must be 'input_audio.turn.start'.") - type: VoiceLiveClientEventType.input_audio_turn_start; + type: ClientEventType.input_audio_turn_start; @doc("Unique identifier for the input audio turn.") turn_id: string; @@ -71,9 +71,9 @@ model VoiceLiveClientEventInputAudioTurnStart extends VoiceLiveClientEvent { @doc(""" Appends audio data to an ongoing input turn. 
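A minimal sketch of the turn-scoped input events defined here, limited to the fields visible in this diff (the event type and `turn_id`); the audio itself travels in `input_audio.turn.append` events between the two, and the identifier is a client-chosen placeholder.

    import json
    import uuid

    turn_id = str(uuid.uuid4())  # client-chosen identifier for this input turn

    turn_start = {"type": "input_audio.turn.start", "turn_id": turn_id}
    # ... input_audio.turn.append events carrying the audio for this turn_id go here ...
    turn_end = {"type": "input_audio.turn.end", "turn_id": turn_id}

    print(json.dumps(turn_start))
    print(json.dumps(turn_end))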
""") -model VoiceLiveClientEventInputAudioTurnAppend extends VoiceLiveClientEvent { +model ClientEventInputAudioTurnAppend extends ClientEvent { @doc("The event type, must be 'input_audio.turn.append'.") - type: VoiceLiveClientEventType.input_audio_turn_append; + type: ClientEventType.input_audio_turn_append; @doc("The ID of the turn this audio is part of.") turn_id: string; @@ -85,9 +85,9 @@ model VoiceLiveClientEventInputAudioTurnAppend extends VoiceLiveClientEvent { @doc(""" Marks the end of an audio input turn. """) -model VoiceLiveClientEventInputAudioTurnEnd extends VoiceLiveClientEvent { +model ClientEventInputAudioTurnEnd extends ClientEvent { @doc("The event type, must be 'input_audio.turn.end'.") - type: VoiceLiveClientEventType.input_audio_turn_end; + type: ClientEventType.input_audio_turn_end; @doc("The ID of the audio turn being ended.") turn_id: string; @@ -96,9 +96,9 @@ model VoiceLiveClientEventInputAudioTurnEnd extends VoiceLiveClientEvent { @doc(""" Cancels an in-progress input audio turn. """) -model VoiceLiveClientEventInputAudioTurnCancel extends VoiceLiveClientEvent { +model ClientEventInputAudioTurnCancel extends ClientEvent { @doc("The event type, must be 'input_audio.turn.cancel'.") - type: VoiceLiveClientEventType.input_audio_turn_cancel; + type: ClientEventType.input_audio_turn_cancel; @doc("The ID of the turn to cancel.") turn_id: string; @@ -107,19 +107,19 @@ model VoiceLiveClientEventInputAudioTurnCancel extends VoiceLiveClientEvent { @doc(""" Clears all input audio currently being streamed. """) -model VoiceLiveClientEventInputAudioClear extends VoiceLiveClientEvent { +model ClientEventInputAudioClear extends ClientEvent { @doc("The event type, must be 'input_audio.clear'.") - type: VoiceLiveClientEventType.input_audio_clear; + type: ClientEventType.input_audio_clear; } // Tool customization: establish custom, enriched discriminated type hierarchy /** The item to add to the conversation. */ -model VoiceLiveConversationItemBase { - /** Customized to enriched VoiceLiveConversation{Request,Response}Item models */ +model ConversationItemBase { + /** Customized to enriched Conversation{Request,Response}Item models */ } /** The response resource. */ -model VoiceLiveResponse { +model Response { /** The unique ID of the response. */ id?: string; @@ -171,7 +171,7 @@ model VoiceLiveResponse { // Tool customization: apply enriched response-specific type /** The list of output items generated by the response. */ - output?: VoiceLiveConversationResponseItem[]; + output?: ConversationResponseItem[]; /** * Usage statistics for the Response, this will correspond to billing. A @@ -271,11 +271,11 @@ model VoiceLiveResponse { * VAD to be more responsive. Unlike made other client events, the server will * not send a confirmation response to this event. */ -model VoiceLiveClientEventInputAudioBufferAppend extends VoiceLiveClientEvent { +model ClientEventInputAudioBufferAppend extends ClientEvent { @doc(""" The event type, must be `input_audio_buffer.append`. """) - type: VoiceLiveClientEventType.input_audio_buffer_append; + type: ClientEventType.input_audio_buffer_append; // Tool customization: use encoded type for audio data @doc(""" @@ -298,11 +298,11 @@ model VoiceLiveClientEventInputAudioBufferAppend extends VoiceLiveClientEvent { from the model. The server will respond with an `input_audio_buffer.committed` event. 
""") -model VoiceLiveClientEventInputAudioBufferCommit extends VoiceLiveClientEvent { +model ClientEventInputAudioBufferCommit extends ClientEvent { @doc(""" The event type, must be `input_audio_buffer.commit`. """) - type: VoiceLiveClientEventType.input_audio_buffer_commit; + type: ClientEventType.input_audio_buffer_commit; } // Tool customization (apply_discriminator): apply discriminated type base @@ -310,11 +310,11 @@ model VoiceLiveClientEventInputAudioBufferCommit extends VoiceLiveClientEvent { Send this event to clear the audio bytes in the buffer. The server will respond with an `input_audio_buffer.cleared` event. """) -model VoiceLiveClientEventInputAudioBufferClear extends VoiceLiveClientEvent { +model ClientEventInputAudioBufferClear extends ClientEvent { @doc(""" The event type, must be `input_audio_buffer.clear`. """) - type: VoiceLiveClientEventType.input_audio_buffer_clear; + type: ClientEventType.input_audio_buffer_clear; } // Tool customization (apply_discriminator): apply discriminated type base @@ -327,11 +327,11 @@ model VoiceLiveClientEventInputAudioBufferClear extends VoiceLiveClientEvent { If successful, the server will respond with a `conversation.item.created` event, otherwise an `error` event will be sent. """) -model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { +model ClientEventConversationItemCreate extends ClientEvent { @doc(""" The event type, must be `conversation.item.create`. """) - type: VoiceLiveClientEventType.conversation_item_create; + type: ClientEventType.conversation_item_create; @doc(""" Optional client-generated ID used to identify this event. @@ -348,7 +348,7 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReference; + item?: ConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type base @@ -365,11 +365,11 @@ model VoiceLiveClientEventConversationItemCreate extends VoiceLiveClientEvent { If successful, the server will respond with a `conversation.item.truncated` event. """) -model VoiceLiveClientEventConversationItemTruncate extends VoiceLiveClientEvent { +model ClientEventConversationItemTruncate extends ClientEvent { @doc(""" The event type, must be `conversation.item.truncate`. """) - type: VoiceLiveClientEventType.conversation_item_truncate; + type: ClientEventType.conversation_item_truncate; /** * The ID of the assistant message item to truncate. Only assistant message @@ -395,11 +395,11 @@ model VoiceLiveClientEventConversationItemTruncate extends VoiceLiveClientEvent unless the item does not exist in the conversation history, in which case the server will respond with an error. """) -model VoiceLiveClientEventConversationItemDelete extends VoiceLiveClientEvent { +model ClientEventConversationItemDelete extends ClientEvent { @doc(""" The event type, must be `conversation.item.delete`. """) - type: VoiceLiveClientEventType.conversation_item_delete; + type: ClientEventType.conversation_item_delete; /** The ID of the item to delete. */ item_id: string; @@ -423,13 +423,13 @@ model VoiceLiveClientEventConversationItemDelete extends VoiceLiveClientEvent { `instructions`, and `temperature`. These fields will override the Session's configuration for this Response only. 
""") -model VoiceLiveClientEventResponseCreate extends VoiceLiveClientEvent { +model ClientEventResponseCreate extends ClientEvent { @doc(""" The event type, must be `response.create`. """) - type: VoiceLiveClientEventType.response_create; + type: ClientEventType.response_create; - response?: VoiceLiveResponseCreateParams; + response?: ResponseCreateParams; @doc(""" additional instructions (system prompt) appended to the default instructions of the session. Only affects this response only. @@ -443,11 +443,11 @@ model VoiceLiveClientEventResponseCreate extends VoiceLiveClientEvent { with a `response.cancelled` event or an error if there is no response to cancel. """) -model VoiceLiveClientEventResponseCancel extends VoiceLiveClientEvent { +model ClientEventResponseCancel extends ClientEvent { @doc(""" The event type, must be `response.cancel`. """) - type: VoiceLiveClientEventType.response_cancel; + type: ClientEventType.response_cancel; /** * A specific response ID to cancel - if not provided, will cancel an @@ -462,11 +462,11 @@ model VoiceLiveClientEventResponseCancel extends VoiceLiveClientEvent { * problem. Most errors are recoverable and the session will stay open, we * recommend to implementors to monitor and log error messages by default. */ -model VoiceLiveServerEventError extends VoiceLiveServerEvent { +model ServerEventError extends ServerEvent { @doc(""" The event type, must be `error`. """) - type: VoiceLiveServerEventType.error; + type: ServerEventType.error; /** Details of the error. */ error: { @@ -493,14 +493,14 @@ model VoiceLiveServerEventError extends VoiceLiveServerEvent { * connection is established as the first server event. This event will contain * the default Session configuration. */ -model VoiceLiveServerEventSessionCreated extends VoiceLiveServerEvent { +model ServerEventSessionCreated extends ServerEvent { @doc(""" The event type, must be `session.created`. """) - type: VoiceLiveServerEventType.session_created; + type: ServerEventType.session_created; // Tool customization: apply enriched response-specific model - session: VoiceLiveResponseSession; + session: ResponseSession; } // Tool customization (apply_discriminator): apply discriminated type @@ -508,20 +508,20 @@ model VoiceLiveServerEventSessionCreated extends VoiceLiveServerEvent { Returned when a session is updated with a `session.update` event, unless there is an error. """) -model VoiceLiveServerEventSessionUpdated extends VoiceLiveServerEvent { +model ServerEventSessionUpdated extends ServerEvent { @doc(""" The event type, must be `session.updated`. """) - type: VoiceLiveServerEventType.session_updated; + type: ServerEventType.session_updated; // Tool customization: apply enriched response-specific model - session: VoiceLiveResponseSession; + session: ResponseSession; } @doc("Sent when the server is in the process of establishing an avatar media connection and provides its SDP answer.") -model VoiceLiveServerEventSessionAvatarConnecting extends VoiceLiveServerEvent { +model ServerEventSessionAvatarConnecting extends ServerEvent { @doc("The event type, must be 'session.avatar.connecting'.") - type: VoiceLiveServerEventType.session_avatar_connecting; + type: ServerEventType.session_avatar_connecting; @doc("The server's SDP answer for the avatar connection.") server_sdp: string; @@ -529,14 +529,14 @@ model VoiceLiveServerEventSessionAvatarConnecting extends VoiceLiveServerEvent { // Tool customization: establish base for enriched request/response split models /** VoiceLive session object configuration. 
*/ -model VoiceLiveSessionBase {} +model SessionBase {} // Tool customization: Adjust union to be a discriminated type base /** A voicelive server event. */ @discriminator("type") -model VoiceLiveServerEvent { +model ServerEvent { /** The type of event. */ - type: VoiceLiveServerEventType; + type: ServerEventType; event_id?: string; } @@ -548,11 +548,11 @@ model VoiceLiveServerEvent { message item that will be created, thus a `conversation.item.created` event will also be sent to the client. """) -model VoiceLiveServerEventInputAudioBufferCommitted extends VoiceLiveServerEvent { +model ServerEventInputAudioBufferCommitted extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.committed`. """) - type: VoiceLiveServerEventType.input_audio_buffer_committed; + type: ServerEventType.input_audio_buffer_committed; /** The ID of the preceding item after which the new item will be inserted. */ previous_item_id?: string; @@ -566,11 +566,11 @@ model VoiceLiveServerEventInputAudioBufferCommitted extends VoiceLiveServerEvent Returned when the input audio buffer is cleared by the client with a `input_audio_buffer.clear` event. """) -model VoiceLiveServerEventInputAudioBufferCleared extends VoiceLiveServerEvent { +model ServerEventInputAudioBufferCleared extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.cleared`. """) - type: VoiceLiveServerEventType.input_audio_buffer_cleared; + type: ServerEventType.input_audio_buffer_cleared; } // Tool customization (apply_discriminator): apply discriminated type @@ -586,12 +586,12 @@ model VoiceLiveServerEventInputAudioBufferCleared extends VoiceLiveServerEvent { `input_audio_buffer.speech_stopped` event (unless the client manually commits the audio buffer during VAD activation). """) -model VoiceLiveServerEventInputAudioBufferSpeechStarted - extends VoiceLiveServerEvent { +model ServerEventInputAudioBufferSpeechStarted + extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.speech_started`. """) - type: VoiceLiveServerEventType.input_audio_buffer_speech_started; + type: ServerEventType.input_audio_buffer_speech_started; @doc(""" Milliseconds from the start of all audio written to the buffer during the @@ -611,12 +611,12 @@ model VoiceLiveServerEventInputAudioBufferSpeechStarted the audio buffer. The server will also send an `conversation.item.created` event with the user message item that is created from the audio buffer. """) -model VoiceLiveServerEventInputAudioBufferSpeechStopped - extends VoiceLiveServerEvent { +model ServerEventInputAudioBufferSpeechStopped + extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.speech_stopped`. """) - type: VoiceLiveServerEventType.input_audio_buffer_speech_stopped; + type: ServerEventType.input_audio_buffer_speech_stopped; @doc(""" Milliseconds since the session started when speech stopped. This will @@ -641,11 +641,11 @@ model VoiceLiveServerEventInputAudioBufferSpeechStopped - The client has sent a `conversation.item.create` event to add a new Item to the Conversation. """) -model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { +model ServerEventConversationItemCreated extends ServerEvent { @doc(""" The event type, must be `conversation.item.created`. 
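A sketch pairing the `conversation.item.create` client event with the `conversation.item.created` acknowledgement described here. The item ids, text, and `previous_item_id` values are placeholders; the item shape follows the conversation item and `input_text` content part models earlier in this patch.

    import json

    # Client -> server: insert a user text message at the end of the conversation.
    item_create = {
        "type": "conversation.item.create",
        "item": {
            "type": "message",
            "role": "user",
            "content": [{"type": "input_text", "text": "What is on my calendar today?"}],
        },
    }

    # Server -> client acknowledgement (shape only).
    item_created = {
        "type": "conversation.item.created",
        "previous_item_id": "item_001",
        "item": {
            "id": "item_002",
            "object": "realtime.item",
            "type": "message",
            "role": "user",
            "status": "completed",
        },
    }
    print(json.dumps(item_create))
    print(json.dumps(item_created))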
""") - type: VoiceLiveServerEventType.conversation_item_created; + type: ServerEventType.conversation_item_created; /** * The ID of the preceding item in the Conversation context, allows the @@ -654,7 +654,7 @@ model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { previous_item_id: string; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReference; + item?: ConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type @@ -670,13 +670,13 @@ model VoiceLiveServerEventConversationItemCreated extends VoiceLiveServerEvent { The transcript may diverge somewhat from the model's interpretation, and should be treated as a rough guide. """) -model VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted - extends VoiceLiveServerEvent { +model ServerEventConversationItemInputAudioTranscriptionCompleted + extends ServerEvent { @doc(""" The event type, must be `conversation.item.input_audio_transcription.completed`. """) - type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_completed; + type: ServerEventType.conversation_item_input_audio_transcription_completed; /** The ID of the user message item containing the audio. */ item_id: string; @@ -694,13 +694,13 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted request for a user message failed. These events are separate from other `error` events so that the client can identify the related Item. """) -model VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed - extends VoiceLiveServerEvent { +model ServerEventConversationItemInputAudioTranscriptionFailed + extends ServerEvent { @doc(""" The event type, must be `conversation.item.input_audio_transcription.failed`. """) - type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_failed; + type: ServerEventType.conversation_item_input_audio_transcription_failed; /** The ID of the user message item. */ item_id: string; @@ -721,11 +721,11 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed This action will truncate the audio and remove the server-side text transcript to ensure there is no text in the context that hasn't been heard by the user. """) -model VoiceLiveServerEventConversationItemTruncated extends VoiceLiveServerEvent { +model ServerEventConversationItemTruncated extends ServerEvent { @doc(""" The event type, must be `conversation.item.truncated`. """) - type: VoiceLiveServerEventType.conversation_item_truncated; + type: ServerEventType.conversation_item_truncated; /** The ID of the assistant message item that was truncated. */ item_id: string; @@ -745,11 +745,11 @@ model VoiceLiveServerEventConversationItemTruncated extends VoiceLiveServerEvent `conversation.item.delete` event. This event is used to synchronize the server's understanding of the conversation history with the client's view. """) -model VoiceLiveServerEventConversationItemDeleted extends VoiceLiveServerEvent { +model ServerEventConversationItemDeleted extends ServerEvent { @doc(""" The event type, must be `conversation.item.deleted`. """) - type: VoiceLiveServerEventType.conversation_item_deleted; + type: ServerEventType.conversation_item_deleted; /** The ID of the item that was deleted. */ item_id: string; @@ -762,13 +762,13 @@ model VoiceLiveServerEventConversationItemDeleted extends VoiceLiveServerEvent { Returned when a new Response is created. 
The first event of response creation, where the response is in an initial state of `in_progress`. """) -model VoiceLiveServerEventResponseCreated extends VoiceLiveServerEvent { +model ServerEventResponseCreated extends ServerEvent { @doc(""" The event type, must be `response.created`. """) - type: VoiceLiveServerEventType.response_created; + type: ServerEventType.response_created; - response: VoiceLiveResponse; + response: Response; } // Tool customization (apply_discriminator): apply discriminated type @@ -777,22 +777,22 @@ model VoiceLiveServerEventResponseCreated extends VoiceLiveServerEvent { final state. The Response object included in the `response.done` event will include all output Items in the Response but will omit the raw audio data. """) -model VoiceLiveServerEventResponseDone extends VoiceLiveServerEvent { +model ServerEventResponseDone extends ServerEvent { @doc(""" The event type, must be `response.done`. """) - type: VoiceLiveServerEventType.response_done; + type: ServerEventType.response_done; - response: VoiceLiveResponse; + response: Response; } // Tool customization (apply_discriminator): apply discriminated type /** Returned when a new Item is created during Response generation. */ -model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { +model ServerEventResponseOutputItemAdded extends ServerEvent { @doc(""" The event type, must be `response.output_item.added`. """) - type: VoiceLiveServerEventType.response_output_item_added; + type: ServerEventType.response_output_item_added; /** The ID of the Response to which the item belongs. */ response_id: string; @@ -801,7 +801,7 @@ model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationItemWithReference; + item?: ConversationItemWithReference; } // Tool customization (apply_discriminator): apply discriminated type @@ -809,11 +809,11 @@ model VoiceLiveServerEventResponseOutputItemAdded extends VoiceLiveServerEvent { * Returned when an Item is done streaming. Also emitted when a Response is * interrupted, incomplete, or cancelled. */ -model VoiceLiveServerEventResponseOutputItemDone extends VoiceLiveServerEvent { +model ServerEventResponseOutputItemDone extends ServerEvent { @doc(""" The event type, must be `response.output_item.done`. """) - type: VoiceLiveServerEventType.response_output_item_done; + type: ServerEventType.response_output_item_done; /** The ID of the Response to which the item belongs. */ response_id: string; @@ -822,7 +822,7 @@ model VoiceLiveServerEventResponseOutputItemDone extends VoiceLiveServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: VoiceLiveConversationResponseItem; + item?: ConversationResponseItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -830,11 +830,11 @@ model VoiceLiveServerEventResponseOutputItemDone extends VoiceLiveServerEvent { * Returned when a new content part is added to an assistant message item during * response generation. */ -model VoiceLiveServerEventResponseContentPartAdded extends VoiceLiveServerEvent { +model ServerEventResponseContentPartAdded extends ServerEvent { @doc(""" The event type, must be `response.content_part.added`. """) - type: VoiceLiveServerEventType.response_content_part_added; + type: ServerEventType.response_content_part_added; /** The ID of the response. 
*/ response_id: string; @@ -850,7 +850,7 @@ model VoiceLiveServerEventResponseContentPartAdded extends VoiceLiveServerEvent // Tool customization: apply detailed content part type /** The content part that was added. */ - part: VoiceLiveContentPart; + part: ContentPart; } // Tool customization (apply_discriminator): apply discriminated type @@ -858,11 +858,11 @@ model VoiceLiveServerEventResponseContentPartAdded extends VoiceLiveServerEvent * Returned when a content part is done streaming in an assistant message item. * Also emitted when a Response is interrupted, incomplete, or cancelled. */ -model VoiceLiveServerEventResponseContentPartDone extends VoiceLiveServerEvent { +model ServerEventResponseContentPartDone extends ServerEvent { @doc(""" The event type, must be `response.content_part.done`. """) - type: VoiceLiveServerEventType.response_content_part_done; + type: ServerEventType.response_content_part_done; /** The ID of the response. */ response_id: string; @@ -878,16 +878,16 @@ model VoiceLiveServerEventResponseContentPartDone extends VoiceLiveServerEvent { // Tool customization: apply detailed content part type /** The content part that is done. */ - part: VoiceLiveContentPart; + part: ContentPart; } // Tool customization (apply_discriminator): apply discriminated type /** Returned when the text value of a "text" content part is updated. */ -model VoiceLiveServerEventResponseTextDelta extends VoiceLiveServerEvent { +model ServerEventResponseTextDelta extends ServerEvent { @doc(""" The event type, must be `response.text.delta`. """) - type: VoiceLiveServerEventType.response_text_delta; + type: ServerEventType.response_text_delta; /** The ID of the response. */ response_id: string; @@ -910,11 +910,11 @@ model VoiceLiveServerEventResponseTextDelta extends VoiceLiveServerEvent { * Returned when the text value of a "text" content part is done streaming. Also * emitted when a Response is interrupted, incomplete, or cancelled. */ -model VoiceLiveServerEventResponseTextDone extends VoiceLiveServerEvent { +model ServerEventResponseTextDone extends ServerEvent { @doc(""" The event type, must be `response.text.done`. """) - type: VoiceLiveServerEventType.response_text_done; + type: ServerEventType.response_text_done; /** The ID of the response. */ response_id: string; @@ -934,12 +934,12 @@ model VoiceLiveServerEventResponseTextDone extends VoiceLiveServerEvent { // Tool customization (apply_discriminator): apply discriminated type /** Returned when the model-generated transcription of audio output is updated. */ -model VoiceLiveServerEventResponseAudioTranscriptDelta - extends VoiceLiveServerEvent { +model ServerEventResponseAudioTranscriptDelta + extends ServerEvent { @doc(""" The event type, must be `response.audio_transcript.delta`. """) - type: VoiceLiveServerEventType.response_audio_transcript_delta; + type: ServerEventType.response_audio_transcript_delta; /** The ID of the response. */ response_id: string; @@ -963,12 +963,12 @@ model VoiceLiveServerEventResponseAudioTranscriptDelta * streaming. Also emitted when a Response is interrupted, incomplete, or * cancelled. */ -model VoiceLiveServerEventResponseAudioTranscriptDone - extends VoiceLiveServerEvent { +model ServerEventResponseAudioTranscriptDone + extends ServerEvent { @doc(""" The event type, must be `response.audio_transcript.done`. """) - type: VoiceLiveServerEventType.response_audio_transcript_done; + type: ServerEventType.response_audio_transcript_done; /** The ID of the response. 
*/ response_id: string; @@ -988,11 +988,11 @@ model VoiceLiveServerEventResponseAudioTranscriptDone // Tool customization (apply_discriminator): apply discriminated type /** Returned when the model-generated audio is updated. */ -model VoiceLiveServerEventResponseAudioDelta extends VoiceLiveServerEvent { +model ServerEventResponseAudioDelta extends ServerEvent { @doc(""" The event type, must be `response.audio.delta`. """) - type: VoiceLiveServerEventType.response_audio_delta; + type: ServerEventType.response_audio_delta; /** The ID of the response. */ response_id: string; @@ -1019,11 +1019,11 @@ model VoiceLiveServerEventResponseAudioDelta extends VoiceLiveServerEvent { * Returned when the model-generated audio is done. Also emitted when a Response * is interrupted, incomplete, or cancelled. */ -model VoiceLiveServerEventResponseAudioDone extends VoiceLiveServerEvent { +model ServerEventResponseAudioDone extends ServerEvent { @doc(""" The event type, must be `response.audio.done`. """) - type: VoiceLiveServerEventType.response_audio_done; + type: ServerEventType.response_audio_done; /** The ID of the response. */ response_id: string; @@ -1041,8 +1041,8 @@ model VoiceLiveServerEventResponseAudioDone extends VoiceLiveServerEvent { @doc(""" Represents a delta update of blendshape animation frames for a specific output of a response. """) -model ResponseAnimationBlendshapeDeltaEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_animation_blendshapes_delta; +model ResponseAnimationBlendshapeDeltaEvent extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_delta; response_id: string; item_id: string; output_index: int32; @@ -1054,8 +1054,8 @@ model ResponseAnimationBlendshapeDeltaEvent extends VoiceLiveServerEvent { @doc(""" Indicates the completion of blendshape animation processing for a specific output of a response. """) -model ResponseAnimationBlendshapeDoneEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_animation_blendshapes_done; +model ResponseAnimationBlendshapeDoneEvent extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_done; response_id: string; item_id: string; output_index: int32; @@ -1064,8 +1064,8 @@ model ResponseAnimationBlendshapeDoneEvent extends VoiceLiveServerEvent { @doc(""" Represents an emotion hypothesis detected from response audio with multiple candidates. """) -model ResponseEmotionHypothesis extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_emotion_hypothesis; +model ResponseEmotionHypothesis extends ServerEvent { + type: ServerEventType.response_emotion_hypothesis; emotion: string; candidates: EmotionCandidate[], audio_offset_ms: int32; @@ -1077,8 +1077,8 @@ model ResponseEmotionHypothesis extends VoiceLiveServerEvent { @doc(""" Represents a word-level audio timestamp delta for a response. """) -model ResponseAudioTimestampDeltaEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_audio_timestamp_delta; +model ResponseAudioTimestampDeltaEvent extends ServerEvent { + type: ServerEventType.response_audio_timestamp_delta; response_id: string; item_id: string; output_index: int32; @@ -1092,8 +1092,8 @@ model ResponseAudioTimestampDeltaEvent extends VoiceLiveServerEvent { @doc(""" Indicates completion of audio timestamp delivery for a response. 
""") -model ResponseAudioTimestampDoneEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_audio_timestamp_done; +model ResponseAudioTimestampDoneEvent extends ServerEvent { + type: ServerEventType.response_audio_timestamp_done; response_id: string; item_id: string; output_index: int32; @@ -1103,8 +1103,8 @@ model ResponseAudioTimestampDoneEvent extends VoiceLiveServerEvent { @doc(""" Represents a viseme ID delta update for animation based on audio. """) -model ResponseAnimationVisemeDeltaEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_animation_viseme_delta; +model ResponseAnimationVisemeDeltaEvent extends ServerEvent { + type: ServerEventType.response_animation_viseme_delta; response_id: string; item_id: string; output_index: int32; @@ -1116,8 +1116,8 @@ model ResponseAnimationVisemeDeltaEvent extends VoiceLiveServerEvent { @doc(""" Indicates completion of viseme animation delivery for a response. """) -model ResponseAnimationVisemeDoneEvent extends VoiceLiveServerEvent { - type: VoiceLiveServerEventType.response_animation_viseme_done; +model ResponseAnimationVisemeDoneEvent extends ServerEvent { + type: ServerEventType.response_animation_viseme_done; response_id: string; item_id: string; output_index: int32; @@ -1125,7 +1125,7 @@ model ResponseAnimationVisemeDoneEvent extends VoiceLiveServerEvent { } /** Create a new VoiceLive response with these parameters */ -model VoiceLiveResponseCreateParams { +model ResponseCreateParams { @doc(""" Whether to commit the response to the conversation. Defaults to true. """) @@ -1139,20 +1139,20 @@ model VoiceLiveResponseCreateParams { @doc(""" Input items to append to the conversation context before generating a response. """) - append_input_items?: VoiceLiveConversationRequestItem[]; + append_input_items?: ConversationRequestItem[]; @doc(""" Input items to be used as the context for this response. An empty array clears previous context. """) - input_items?: VoiceLiveConversationRequestItem[]; + input_items?: ConversationRequestItem[]; // Tool customization: Apply reusable modality representation /** * The set of modalities the model can respond with. To disable audio, * set this to ["text"]. */ - modalities?: VoiceLiveModality[]; + modalities?: Modality[]; @doc(""" The default system instructions (i.e. system message) prepended to model @@ -1179,11 +1179,11 @@ model VoiceLiveResponseCreateParams { @doc(""" The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. """) - output_audio_format?: VoiceLiveAudioFormat = VoiceLiveAudioFormat.pcm16; + output_audio_format?: AudioFormat = AudioFormat.pcm16; // Tool customization: use enriched tool definition /** Tools (functions) available to the model. */ - tools?: VoiceLiveTool[]; + tools?: Tool[]; @doc(""" How the model chooses tools. Options are `auto`, `none`, `required`, or @@ -1205,7 +1205,7 @@ model VoiceLiveResponseCreateParams { } /** The item to add to the conversation. */ -model VoiceLiveConversationItemWithReference { +model ConversationItemWithReference { @doc(""" For an item of type (`message` | `function_call` | `function_call_output`) this field allows the client to assign the unique ID of the item. It is @@ -1246,7 +1246,7 @@ model VoiceLiveConversationItemWithReference { content - Message items of role `assistant` support `text` content. 
""") - content?: VoiceLiveConversationItemWithReferenceContent[]; + content?: ConversationItemWithReferenceContent[]; @doc(""" The ID of the function call (for `function_call` and @@ -1279,11 +1279,11 @@ model VoiceLiveConversationItemWithReference { unless the item does not exist in the conversation history, in which case the server will respond with an error. """) -model VoiceLiveClientEventConversationItemRetrieve extends VoiceLiveClientEvent { +model ClientEventConversationItemRetrieve extends ClientEvent { @doc(""" The event type, must be `conversation.item.retrieve`. """) - type: VoiceLiveClientEventType.conversation_item_retrieve; + type: ClientEventType.conversation_item_retrieve; /** The ID of the item to retrieve. */ item_id: string; @@ -1291,12 +1291,12 @@ model VoiceLiveClientEventConversationItemRetrieve extends VoiceLiveClientEvent // Tool customization (apply_discriminator): apply discriminated type /** Returned when the text value of an input audio transcription content part is updated. */ -model VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta - extends VoiceLiveServerEvent { +model ServerEventConversationItemInputAudioTranscriptionDelta + extends ServerEvent { @doc(""" The event type, must be `conversation.item.input_audio_transcription.delta`. """) - type: VoiceLiveServerEventType.conversation_item_input_audio_transcription_delta; + type: ServerEventType.conversation_item_input_audio_transcription_delta; /** The ID of the item. */ item_id: string; @@ -1315,11 +1315,11 @@ model VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta @doc(""" Returned when a conversation item is retrieved with `conversation.item.retrieve`. """) -model VoiceLiveServerEventConversationItemRetrieved extends VoiceLiveServerEvent { +model ServerEventConversationItemRetrieved extends ServerEvent { @doc(""" The event type, must be `conversation.item.retrieved`. 
""") - type: VoiceLiveServerEventType.conversation_item_retrieved; + type: ServerEventType.conversation_item_retrieved; item_id?: string; event_id?: string; } diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index b8a826c0168b..7046ceb2b3a3 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -29,30 +29,30 @@ using Azure.Core; namespace VoiceLive; -op force_models(session: VoiceLiveClientEventSessionUpdate): - VoiceLiveServerEventSessionUpdated - | VoiceLiveServerEventSessionCreated - | VoiceLiveServerEventError - | VoiceLiveServerEventResponseTextDelta - | VoiceLiveServerEventResponseAudioDelta - | VoiceLiveServerEventConversationItemCreated - | VoiceLiveServerEventConversationItemDeleted - | VoiceLiveServerEventConversationItemRetrieved - | VoiceLiveServerEventConversationItemTruncated - | VoiceLiveServerEventConversationItemInputAudioTranscriptionCompleted - | VoiceLiveServerEventConversationItemInputAudioTranscriptionDelta - | VoiceLiveServerEventConversationItemInputAudioTranscriptionFailed - | VoiceLiveServerEventInputAudioBufferCommitted - | VoiceLiveServerEventInputAudioBufferCleared - | VoiceLiveServerEventInputAudioBufferSpeechStarted - | VoiceLiveServerEventInputAudioBufferSpeechStopped - | VoiceLiveServerEventResponseCreated - | VoiceLiveServerEventResponseDone - | VoiceLiveServerEventResponseOutputItemAdded - | VoiceLiveServerEventResponseOutputItemDone - | VoiceLiveServerEventResponseContentPartAdded - | VoiceLiveServerEventResponseContentPartDone - | VoiceLiveServerEventResponseTextDone - | VoiceLiveServerEventResponseAudioTranscriptDelta - | VoiceLiveServerEventResponseAudioTranscriptDone - | VoiceLiveServerEventResponseAudioDone; \ No newline at end of file +op force_models(session: ClientEventSessionUpdate): + ServerEventSessionUpdated + | ServerEventSessionCreated + | ServerEventError + | ServerEventResponseTextDelta + | ServerEventResponseAudioDelta + | ServerEventConversationItemCreated + | ServerEventConversationItemDeleted + | ServerEventConversationItemRetrieved + | ServerEventConversationItemTruncated + | ServerEventConversationItemInputAudioTranscriptionCompleted + | ServerEventConversationItemInputAudioTranscriptionDelta + | ServerEventConversationItemInputAudioTranscriptionFailed + | ServerEventInputAudioBufferCommitted + | ServerEventInputAudioBufferCleared + | ServerEventInputAudioBufferSpeechStarted + | ServerEventInputAudioBufferSpeechStopped + | ServerEventResponseCreated + | ServerEventResponseDone + | ServerEventResponseOutputItemAdded + | ServerEventResponseOutputItemDone + | ServerEventResponseContentPartAdded + | ServerEventResponseContentPartDone + | ServerEventResponseTextDone + | ServerEventResponseAudioTranscriptDelta + | ServerEventResponseAudioTranscriptDone + | ServerEventResponseAudioDone; \ No newline at end of file From b5efaf3bb01549dfbab77c0a45a6fa249f81c97d Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 30 Jul 2025 13:16:37 -0700 Subject: [PATCH 19/48] Add client mapping for Csharp --- specification/ai/data-plane/VoiceLive/client.tsp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 33d0a005c8b8..cf6b3d6ff581 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -36,3 +36,8 @@ using 
Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEventResponseTextDone, Access.public, "python"); @@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); +@@access(VoiceLive.ServerEvent, Access.public, "csharp"); +@@access(VoiceLive.ClientEvent, Access.public, "csharp"); +@@clientName(VoiceLive.Modality, "InputModality", "csharp"); +@@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); +@@clientName(VoiceLive.Tool, "ToolCall", "csharp"); \ No newline at end of file From 86a5753a79b8c9f1224ec7965b2fa17f517a8933 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 30 Jul 2025 19:27:53 -0700 Subject: [PATCH 20/48] Add ClientEventSessionUpdate --- specification/ai/data-plane/VoiceLive/client.tsp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index cf6b3d6ff581..bf37efc3acf4 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -40,4 +40,5 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ClientEvent, Access.public, "csharp"); @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); -@@clientName(VoiceLive.Tool, "ToolCall", "csharp"); \ No newline at end of file +@@clientName(VoiceLive.Tool, "ToolCall", "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); \ No newline at end of file From cb8eb7e0b9f3b99d3a3fd8db25a5325cc7e6b4d9 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Tue, 5 Aug 2025 11:32:57 -0700 Subject: [PATCH 21/48] add client event to force_model --- .../ai/data-plane/VoiceLive/client.tsp | 20 ++++- .../VoiceLive/servers/websocket.tsp | 79 ++++++++++++------- 2 files changed, 71 insertions(+), 28 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index bf37efc3acf4..bc09936e6964 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -6,9 +6,27 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.force_models, Access.internal, "python"); @@access(VoiceLive.force_models, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.public, 
"python"); + + +@@access(VoiceLive.ServerEventSessionAvatarConnecting, Access.public, "python"); @@access(VoiceLive.ServerEventSessionCreated, Access.public, "python"); @@access(VoiceLive.ServerEventResponseAudioDelta, Access.public, "python"); -@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "python"); @@access(VoiceLive.ConversationResponseItem, Access.public, "python"); @@access(VoiceLive.Response, Access.public, "python"); @@access(VoiceLive.ServerEventConversationItemCreated, Access.public, "python"); diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index 7046ceb2b3a3..4e722f4caa69 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -29,30 +29,55 @@ using Azure.Core; namespace VoiceLive; -op force_models(session: ClientEventSessionUpdate): - ServerEventSessionUpdated - | ServerEventSessionCreated - | ServerEventError - | ServerEventResponseTextDelta - | ServerEventResponseAudioDelta - | ServerEventConversationItemCreated - | ServerEventConversationItemDeleted - | ServerEventConversationItemRetrieved - | ServerEventConversationItemTruncated - | ServerEventConversationItemInputAudioTranscriptionCompleted - | ServerEventConversationItemInputAudioTranscriptionDelta - | ServerEventConversationItemInputAudioTranscriptionFailed - | ServerEventInputAudioBufferCommitted - | ServerEventInputAudioBufferCleared - | ServerEventInputAudioBufferSpeechStarted - | ServerEventInputAudioBufferSpeechStopped - | ServerEventResponseCreated - | ServerEventResponseDone - | ServerEventResponseOutputItemAdded - | ServerEventResponseOutputItemDone - | ServerEventResponseContentPartAdded - | ServerEventResponseContentPartDone - | ServerEventResponseTextDone - | ServerEventResponseAudioTranscriptDelta - | ServerEventResponseAudioTranscriptDone - | ServerEventResponseAudioDone; \ No newline at end of file + +// Union of all client events that can be passed into `force_models` +alias ForceModelClientEvent = + ClientEventSessionUpdate | + ClientEventInputAudioBufferAppend | + ClientEventInputAudioBufferCommit | + ClientEventInputAudioBufferClear | + ClientEventInputAudioTurnStart | + ClientEventInputAudioTurnAppend | + ClientEventInputAudioTurnEnd | + ClientEventInputAudioTurnCancel | + ClientEventInputAudioClear | + ClientEventConversationItemCreate | + ClientEventConversationItemRetrieve | + ClientEventConversationItemTruncate | + ClientEventConversationItemDelete | + ClientEventResponseCreate | + ClientEventResponseCancel | + ClientEventSessionAvatarConnect; + +// Union of all server events that can be returned from `force_models` +alias ForceModelServerEvent = + ServerEventSessionAvatarConnecting | + ServerEventSessionCreated | + ServerEventSessionUpdated | + ServerEventError | + ServerEventResponseTextDelta | + ServerEventResponseAudioDelta | + ServerEventConversationItemCreated | + ServerEventConversationItemDeleted | + ServerEventConversationItemRetrieved | + ServerEventConversationItemTruncated | + ServerEventConversationItemInputAudioTranscriptionCompleted | + ServerEventConversationItemInputAudioTranscriptionDelta | + ServerEventConversationItemInputAudioTranscriptionFailed | + ServerEventInputAudioBufferCommitted | + ServerEventInputAudioBufferCleared | + ServerEventInputAudioBufferSpeechStarted | + ServerEventInputAudioBufferSpeechStopped | + ServerEventResponseCreated | + ServerEventResponseDone | + 
ServerEventResponseOutputItemAdded | + ServerEventResponseOutputItemDone | + ServerEventResponseContentPartAdded | + ServerEventResponseContentPartDone | + ServerEventResponseTextDone | + ServerEventResponseAudioTranscriptDelta | + ServerEventResponseAudioTranscriptDone | + ServerEventResponseAudioDone; + +// Operation definition +op force_models(event: ForceModelClientEvent): ForceModelServerEvent; \ No newline at end of file From 54d5cb31ea36f6156c6f661e3ed093d9b5d1cbfd Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 6 Aug 2025 18:17:25 -0700 Subject: [PATCH 22/48] Review feedback --- specification/ai/data-plane/VoiceLive/client.tsp | 11 ++++++++++- .../ai/data-plane/VoiceLive/common/models.tsp | 4 ++-- specification/ai/data-plane/VoiceLive/custom.tsp | 4 ++-- specification/ai/data-plane/VoiceLive/models.tsp | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index bf37efc3acf4..6bd45d1e1638 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -41,4 +41,13 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); @@clientName(VoiceLive.Tool, "ToolCall", "csharp"); -@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); \ No newline at end of file +@@clientName(VoiceLive.AzureCustomVoice.custom_lexicon_url, "CustomLexiconUri", "csharp"); +@@clientName(VoiceLive.IceServer.urls, "Uris", "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); +@@access(VoiceLive.Point2D, Access.internal, "csharp"); + +@@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); +@@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); + +@@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp index 1bf792617f3d..3b1209a3301f 100644 --- a/specification/ai/data-plane/VoiceLive/common/models.tsp +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -7,7 +7,7 @@ using TypeSpec.OpenAPI; namespace VoiceLive; @doc("Error object returned in case of API failure.") -model ErrorDetails { +model VoiceLiveErrorDetails { @doc("Error code, or null if unspecified.") code?: string; @@ -28,7 +28,7 @@ model ErrorDetails { @doc("Standard error response envelope.") model ErrorResponse { @doc("Error object returned in case of API failure.") - error: ErrorDetails; + error: VoiceLiveErrorDetails; } @doc("A single log probability entry for a token.") diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 9c921ff0099d..a928f2b074dc 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -84,7 +84,7 @@ model AzureCustomVoice { temperature?: float32; @doc("Optional custom lexicon URL.") - custom_lexicon_url?: string; + custom_lexicon_url?: url; @doc("Preferred locale list for voice rendering.") prefer_locales?: string[]; @@ -261,7 +261,7 @@ model AvatarConfig { @doc("ICE server configuration for WebRTC connection negotiation.") model IceServer { @doc("List of ICE server URLs (e.g., TURN or STUN endpoints).") - urls: string[]; 
+ urls: url[]; @doc("Optional username used for authentication with the ICE server.") username?: string; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index c8f80dd8cef8..f33069f9e759 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -709,7 +709,7 @@ model ServerEventConversationItemInputAudioTranscriptionFailed content_index: int32; /** Details of the transcription error. */ - error: ErrorDetails; + error: VoiceLiveErrorDetails; } // Tool customization (apply_discriminator): apply discriminated type From 80cdf6b9c6ecd1939f2c0b770cc9daffa705e462 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Thu, 7 Aug 2025 20:15:55 -0700 Subject: [PATCH 23/48] Add function calls --- .../ai/data-plane/VoiceLive/custom/events.tsp | 2 + .../ai/data-plane/VoiceLive/models.tsp | 54 +++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp index 048697ad32b3..5b38866ee7bf 100644 --- a/specification/ai/data-plane/VoiceLive/custom/events.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -60,4 +60,6 @@ union ServerEventType { response_audio_timestamp_done: "response.audio_timestamp.done", response_animation_viseme_delta: "response.animation_viseme.delta", response_animation_viseme_done: "response.animation_viseme.done", + response_function_call_arguments_delta: "response.function_call_arguments.delta", + response_function_call_arguments_done: "response.function_call_arguments.done", } \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index f33069f9e759..e05d19554bc3 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -1328,3 +1328,57 @@ model EmotionCandidate { emotion: string; confidence: float32; } + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated function call arguments are updated. */ +model ServerEventResponseFunctionCallArgumentsDelta + extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.delta`. + """) + type: ServerEventType.response_function_call_arguments_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. */ + call_id: string; + + /** The arguments delta as a JSON string. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseFunctionCallArgumentsDone + extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.done`. + """) + type: ServerEventType.response_function_call_arguments_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. */ + call_id: string; + + /** The final arguments as a JSON string. 
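+      For illustration only: a hypothetical `get_weather` call might finish with
+      `{"location": "Seattle", "unit": "celsius"}`, delivered as a single JSON-encoded
+      string value (the field is a string containing JSON, not a nested object).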
*/ + arguments: string; +} + From 50979766cc928310a047e4ca2d8266561db7f4a7 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Sun, 10 Aug 2025 10:58:26 -0700 Subject: [PATCH 24/48] Rename some things --- specification/ai/data-plane/VoiceLive/client.tsp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 833330feca9e..3975e0f6e333 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -56,16 +56,16 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEvent, Access.public, "csharp"); @@access(VoiceLive.ClientEvent, Access.public, "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); +@@access(VoiceLive.Point2D, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); + @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); @@clientName(VoiceLive.Tool, "ToolCall", "csharp"); @@clientName(VoiceLive.AzureCustomVoice.custom_lexicon_url, "CustomLexiconUri", "csharp"); @@clientName(VoiceLive.IceServer.urls, "Uris", "csharp"); -@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); -@@access(VoiceLive.Point2D, Access.internal, "csharp"); - @@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); @@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); - -@@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); -@@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); +@@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); From 39b7cd3df99dcf9f4abcd8989eb84068d06d4034 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Sun, 10 Aug 2025 11:00:39 -0700 Subject: [PATCH 25/48] Add function name --- specification/ai/data-plane/VoiceLive/models.tsp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index e05d19554bc3..78e4c0d30bd6 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -1380,5 +1380,8 @@ model ServerEventResponseFunctionCallArgumentsDone /** The final arguments as a JSON string. */ arguments: string; + + /** The name of the function call. 
*/ + name: string; } From 668f11ee7b169e2a3fec52158aa5f67fda3190e4 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Mon, 11 Aug 2025 13:43:58 -0700 Subject: [PATCH 26/48] remove Point2D and update force_models --- .../ai/data-plane/VoiceLive/client.tsp | 1 - .../ai/data-plane/VoiceLive/custom.tsp | 22 +++++++++---------- .../VoiceLive/servers/websocket.tsp | 4 +++- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 3975e0f6e333..0c1027293f43 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -57,7 +57,6 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEvent, Access.public, "csharp"); @@access(VoiceLive.ClientEvent, Access.public, "csharp"); @@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); -@@access(VoiceLive.Point2D, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index a928f2b074dc..567619529070 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -293,19 +293,17 @@ model VideoParams { resolution?: VideoResolution; } -@doc("A 2D point with x and y coordinates.") -model Point2D { - x: int32; - y: int32; -} - -@doc("Defines a video crop rectangle.") +@doc("Defines a video crop rectangle using top-left and bottom-right coordinates.") model VideoCrop { - @doc("Top-left corner of the crop region.") - top_left: Point2D; - - @doc("Bottom-right corner of the crop region.") - bottom_right: Point2D; + @doc("Top-left corner of the crop region. Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + top_left: int32[]; + + @doc("Bottom-right corner of the crop region. 
Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + bottom_right: int32[]; } @doc("Resolution of the video feed in pixels.") diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index 4e722f4caa69..f3067c08812e 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -77,7 +77,9 @@ alias ForceModelServerEvent = ServerEventResponseTextDone | ServerEventResponseAudioTranscriptDelta | ServerEventResponseAudioTranscriptDone | - ServerEventResponseAudioDone; + ServerEventResponseAudioDone | + ServerEventResponseFunctionCallArgumentsDelta | + ServerEventResponseFunctionCallArgumentsDone; // Operation definition op force_models(event: ForceModelClientEvent): ForceModelServerEvent; \ No newline at end of file From 751d284f2fa11ebea30b6aefffede6ec7c4d881c Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Tue, 12 Aug 2025 10:42:52 -0700 Subject: [PATCH 27/48] Update C# generator --- specification/ai/data-plane/VoiceLive/tspconfig.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml index d40eca7fd621..a96a6cd87f9a 100644 --- a/specification/ai/data-plane/VoiceLive/tspconfig.yaml +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -27,6 +27,7 @@ options: model-namespace: false namespace: "{package-dir}" flavor: azure + emitterPackageJsonPath: eng/azure-typespec-http-client-csharp-emitter-package.json "@azure-typespec/http-client-csharp": namespace: Azure.AI.VoiceLive model-namespace: false From 793b7ae0d26bc78a8ddd78e8e2b049c3906fbdf2 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 09:48:50 -0700 Subject: [PATCH 28/48] Add more response models --- specification/ai/data-plane/VoiceLive/client.tsp | 1 + .../ai/data-plane/VoiceLive/servers/websocket.tsp | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 0c1027293f43..4f2da8898754 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -68,3 +68,4 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); @@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); @@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); +@@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index f3067c08812e..180d2d8377cd 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -79,7 +79,15 @@ alias ForceModelServerEvent = ServerEventResponseAudioTranscriptDone | ServerEventResponseAudioDone | ServerEventResponseFunctionCallArgumentsDelta | - ServerEventResponseFunctionCallArgumentsDone; + ServerEventResponseFunctionCallArgumentsDone | + ResponseAnimationBlendshapeDeltaEvent | + ResponseAnimationBlendshapeDoneEvent | + ResponseEmotionHypothesis | + ResponseAudioTimestampDeltaEvent | + ResponseAudioTimestampDoneEvent | + ResponseAnimationVisemeDeltaEvent | + ResponseAnimationVisemeDoneEvent; + // Operation definition op force_models(event: 
ForceModelClientEvent): ForceModelServerEvent; \ No newline at end of file From cdb99b9f7e7868ad23a33cb29ba54d58829a7e15 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 09:50:54 -0700 Subject: [PATCH 29/48] Rename models for consistency --- .../ai/data-plane/VoiceLive/models.tsp | 39 ++++++++----------- .../VoiceLive/servers/websocket.tsp | 14 +++---- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 78e4c0d30bd6..058b011b37cf 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -11,7 +11,6 @@ using TypeSpec.OpenAPI; namespace VoiceLive; - // Tool customization: Adjust union to be a discriminated type base /** A voicelive client event. */ @discriminator("type") @@ -434,7 +433,7 @@ model ClientEventResponseCreate extends ClientEvent { @doc(""" additional instructions (system prompt) appended to the default instructions of the session. Only affects this response only. """) - additional_instructions?: string + additional_instructions?: string; } // Tool customization (apply_discriminator): apply discriminated type base @@ -586,8 +585,7 @@ model ServerEventInputAudioBufferCleared extends ServerEvent { `input_audio_buffer.speech_stopped` event (unless the client manually commits the audio buffer during VAD activation). """) -model ServerEventInputAudioBufferSpeechStarted - extends ServerEvent { +model ServerEventInputAudioBufferSpeechStarted extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.speech_started`. """) @@ -611,8 +609,7 @@ model ServerEventInputAudioBufferSpeechStarted the audio buffer. The server will also send an `conversation.item.created` event with the user message item that is created from the audio buffer. """) -model ServerEventInputAudioBufferSpeechStopped - extends ServerEvent { +model ServerEventInputAudioBufferSpeechStopped extends ServerEvent { @doc(""" The event type, must be `input_audio_buffer.speech_stopped`. """) @@ -934,8 +931,7 @@ model ServerEventResponseTextDone extends ServerEvent { // Tool customization (apply_discriminator): apply discriminated type /** Returned when the model-generated transcription of audio output is updated. */ -model ServerEventResponseAudioTranscriptDelta - extends ServerEvent { +model ServerEventResponseAudioTranscriptDelta extends ServerEvent { @doc(""" The event type, must be `response.audio_transcript.delta`. """) @@ -963,8 +959,7 @@ model ServerEventResponseAudioTranscriptDelta * streaming. Also emitted when a Response is interrupted, incomplete, or * cancelled. */ -model ServerEventResponseAudioTranscriptDone - extends ServerEvent { +model ServerEventResponseAudioTranscriptDone extends ServerEvent { @doc(""" The event type, must be `response.audio_transcript.done`. """) @@ -1041,7 +1036,7 @@ model ServerEventResponseAudioDone extends ServerEvent { @doc(""" Represents a delta update of blendshape animation frames for a specific output of a response. """) -model ResponseAnimationBlendshapeDeltaEvent extends ServerEvent { +model ServerEventAnimationBlendshapeDelta extends ServerEvent { type: ServerEventType.response_animation_blendshapes_delta; response_id: string; item_id: string; @@ -1054,7 +1049,7 @@ model ResponseAnimationBlendshapeDeltaEvent extends ServerEvent { @doc(""" Indicates the completion of blendshape animation processing for a specific output of a response. 
""") -model ResponseAnimationBlendshapeDoneEvent extends ServerEvent { +model ServerEventAnimationBlendshapeDone extends ServerEvent { type: ServerEventType.response_animation_blendshapes_done; response_id: string; item_id: string; @@ -1064,10 +1059,10 @@ model ResponseAnimationBlendshapeDoneEvent extends ServerEvent { @doc(""" Represents an emotion hypothesis detected from response audio with multiple candidates. """) -model ResponseEmotionHypothesis extends ServerEvent { +model ServerEventEmotionHypothesis extends ServerEvent { type: ServerEventType.response_emotion_hypothesis; emotion: string; - candidates: EmotionCandidate[], + candidates: EmotionCandidate[]; audio_offset_ms: int32; audio_duration_ms: int32; response_id?: string; @@ -1077,7 +1072,7 @@ model ResponseEmotionHypothesis extends ServerEvent { @doc(""" Represents a word-level audio timestamp delta for a response. """) -model ResponseAudioTimestampDeltaEvent extends ServerEvent { +model ServerEventAudioTimestampDelta extends ServerEvent { type: ServerEventType.response_audio_timestamp_delta; response_id: string; item_id: string; @@ -1092,7 +1087,7 @@ model ResponseAudioTimestampDeltaEvent extends ServerEvent { @doc(""" Indicates completion of audio timestamp delivery for a response. """) -model ResponseAudioTimestampDoneEvent extends ServerEvent { +model ServerEventAudioTimestampDone extends ServerEvent { type: ServerEventType.response_audio_timestamp_done; response_id: string; item_id: string; @@ -1103,7 +1098,7 @@ model ResponseAudioTimestampDoneEvent extends ServerEvent { @doc(""" Represents a viseme ID delta update for animation based on audio. """) -model ResponseAnimationVisemeDeltaEvent extends ServerEvent { +model ServerEventAnimationVisemeDelta extends ServerEvent { type: ServerEventType.response_animation_viseme_delta; response_id: string; item_id: string; @@ -1116,7 +1111,7 @@ model ResponseAnimationVisemeDeltaEvent extends ServerEvent { @doc(""" Indicates completion of viseme animation delivery for a response. """) -model ResponseAnimationVisemeDoneEvent extends ServerEvent { +model ServerEventAnimationVisemeDone extends ServerEvent { type: ServerEventType.response_animation_viseme_done; response_id: string; item_id: string; @@ -1320,6 +1315,7 @@ model ServerEventConversationItemRetrieved extends ServerEvent { The event type, must be `conversation.item.retrieved`. """) type: ServerEventType.conversation_item_retrieved; + item_id?: string; event_id?: string; } @@ -1331,8 +1327,7 @@ model EmotionCandidate { // Tool customization (apply_discriminator): apply discriminated type /** Returned when the model-generated function call arguments are updated. */ -model ServerEventResponseFunctionCallArgumentsDelta - extends ServerEvent { +model ServerEventResponseFunctionCallArgumentsDelta extends ServerEvent { @doc(""" The event type, must be `response.function_call_arguments.delta`. """) @@ -1359,8 +1354,7 @@ model ServerEventResponseFunctionCallArgumentsDelta * Returned when the model-generated function call arguments are done streaming. * Also emitted when a Response is interrupted, incomplete, or cancelled. */ -model ServerEventResponseFunctionCallArgumentsDone - extends ServerEvent { +model ServerEventResponseFunctionCallArgumentsDone extends ServerEvent { @doc(""" The event type, must be `response.function_call_arguments.done`. """) @@ -1384,4 +1378,3 @@ model ServerEventResponseFunctionCallArgumentsDone /** The name of the function call. 
*/ name: string; } - diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp index 180d2d8377cd..f54e1bcf5709 100644 --- a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -80,13 +80,13 @@ alias ForceModelServerEvent = ServerEventResponseAudioDone | ServerEventResponseFunctionCallArgumentsDelta | ServerEventResponseFunctionCallArgumentsDone | - ResponseAnimationBlendshapeDeltaEvent | - ResponseAnimationBlendshapeDoneEvent | - ResponseEmotionHypothesis | - ResponseAudioTimestampDeltaEvent | - ResponseAudioTimestampDoneEvent | - ResponseAnimationVisemeDeltaEvent | - ResponseAnimationVisemeDoneEvent; + ServerEventResponseAnimationBlendshapeDelta | + ServerEventResponseAnimationBlendshapeDone | + ServerEventResponseEmotionHypothesis | + ServerEventResponseAudioTimestampDelta | + ServerEventResponseAudioTimestampDone | + ServerEventResponseAnimationVisemeDelta | + ServerEventResponseAnimationVisemeDone; // Operation definition From 045e9029f69a14894cec7355d755bd82168e37f2 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 09:57:22 -0700 Subject: [PATCH 30/48] Missed a word --- specification/ai/data-plane/VoiceLive/models.tsp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 058b011b37cf..f77e199aefa9 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -1036,7 +1036,7 @@ model ServerEventResponseAudioDone extends ServerEvent { @doc(""" Represents a delta update of blendshape animation frames for a specific output of a response. """) -model ServerEventAnimationBlendshapeDelta extends ServerEvent { +model ServerEventResponseAnimationBlendshapeDelta extends ServerEvent { type: ServerEventType.response_animation_blendshapes_delta; response_id: string; item_id: string; @@ -1049,7 +1049,7 @@ model ServerEventAnimationBlendshapeDelta extends ServerEvent { @doc(""" Indicates the completion of blendshape animation processing for a specific output of a response. """) -model ServerEventAnimationBlendshapeDone extends ServerEvent { +model ServerEventResponseAnimationBlendshapeDone extends ServerEvent { type: ServerEventType.response_animation_blendshapes_done; response_id: string; item_id: string; @@ -1059,7 +1059,7 @@ model ServerEventAnimationBlendshapeDone extends ServerEvent { @doc(""" Represents an emotion hypothesis detected from response audio with multiple candidates. """) -model ServerEventEmotionHypothesis extends ServerEvent { +model ServerEventResponseEmotionHypothesis extends ServerEvent { type: ServerEventType.response_emotion_hypothesis; emotion: string; candidates: EmotionCandidate[]; @@ -1072,7 +1072,7 @@ model ServerEventEmotionHypothesis extends ServerEvent { @doc(""" Represents a word-level audio timestamp delta for a response. """) -model ServerEventAudioTimestampDelta extends ServerEvent { +model ServerEventResponseAudioTimestampDelta extends ServerEvent { type: ServerEventType.response_audio_timestamp_delta; response_id: string; item_id: string; @@ -1087,7 +1087,7 @@ model ServerEventAudioTimestampDelta extends ServerEvent { @doc(""" Indicates completion of audio timestamp delivery for a response. 
""") -model ServerEventAudioTimestampDone extends ServerEvent { +model ServerEventResponseAudioTimestampDone extends ServerEvent { type: ServerEventType.response_audio_timestamp_done; response_id: string; item_id: string; @@ -1098,7 +1098,7 @@ model ServerEventAudioTimestampDone extends ServerEvent { @doc(""" Represents a viseme ID delta update for animation based on audio. """) -model ServerEventAnimationVisemeDelta extends ServerEvent { +model ServerEventResponseAnimationVisemeDelta extends ServerEvent { type: ServerEventType.response_animation_viseme_delta; response_id: string; item_id: string; @@ -1111,7 +1111,7 @@ model ServerEventAnimationVisemeDelta extends ServerEvent { @doc(""" Indicates completion of viseme animation delivery for a response. """) -model ServerEventAnimationVisemeDone extends ServerEvent { +model ServerEventResponseAnimationVisemeDone extends ServerEvent { type: ServerEventType.response_animation_viseme_done; response_id: string; item_id: string; From afe7f766a40ffcd66aaa9154bd0234df6a0f063c Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 10:04:43 -0700 Subject: [PATCH 31/48] Type not optional --- specification/ai/data-plane/VoiceLive/custom/items.tsp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index e4e3cf0a8c99..0ff24970022a 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -60,7 +60,7 @@ model RequestMessageReferenceItem { // extends ConversationRequestItem { model ConversationResponseItem { ...ConversationItemBase; object?: "realtime.item"; - type?: ItemType; + type: ItemType; id?: string; } From 81e397ca63778f38b132d0c361d93840dfd725db Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 14:34:04 -0700 Subject: [PATCH 32/48] Hide cleint classes in C# --- .../ai/data-plane/VoiceLive/client.tsp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 4f2da8898754..d5166cad77d4 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -55,8 +55,24 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); @@access(VoiceLive.ServerEvent, Access.public, "csharp"); -@@access(VoiceLive.ClientEvent, Access.public, "csharp"); -@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "csharp"); +@@access(VoiceLive.ClientEvent, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.internal, "csharp"); 
+@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCancel, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.internal, "csharp"); +@@access(VoiceLive.UnknownClientEvent, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); From 72bd35443981e5364e6f696b5b0c4e78f9240aec Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 15 Aug 2025 14:38:23 -0700 Subject: [PATCH 33/48] Bad class --- specification/ai/data-plane/VoiceLive/client.tsp | 1 - 1 file changed, 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index d5166cad77d4..1eb3dcd63cdf 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -72,7 +72,6 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ClientEventResponseCreate, Access.internal, "csharp"); @@access(VoiceLive.ClientEventSessionAvatarConnect, Access.internal, "csharp"); @@access(VoiceLive.ClientEventSessionUpdate, Access.internal, "csharp"); -@@access(VoiceLive.UnknownClientEvent, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); From b11774f86ac1fa104d27df7157112e579da9ca1f Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Sun, 24 Aug 2025 21:58:52 -0700 Subject: [PATCH 34/48] Fix issues found by tests: 1. Fix semantic vad config and add AzureMultilingualSemanticVAD 2. add model of EOUDetection 3. Fix mismatch in models of voices 4. Fix conversation response 5. Fix conversation request 6. 
Fix mismatch in models of Voice between service and sdk --- .../ai/data-plane/VoiceLive/client.tsp | 2 +- .../ai/data-plane/VoiceLive/custom.tsp | 249 ++++++++++++++---- .../ai/data-plane/VoiceLive/custom/items.tsp | 200 +++++++++----- .../ai/data-plane/VoiceLive/models.tsp | 166 +----------- 4 files changed, 352 insertions(+), 265 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 1eb3dcd63cdf..9abd88a575cf 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -27,7 +27,7 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEventSessionAvatarConnecting, Access.public, "python"); @@access(VoiceLive.ServerEventSessionCreated, Access.public, "python"); @@access(VoiceLive.ServerEventResponseAudioDelta, Access.public, "python"); -@@access(VoiceLive.ConversationResponseItem, Access.public, "python"); +@@access(VoiceLive.ResponseItem, Access.public, "python"); @@access(VoiceLive.Response, Access.public, "python"); @@access(VoiceLive.ServerEventConversationItemCreated, Access.public, "python"); @@access(VoiceLive.ServerEventConversationItemDeleted, Access.public, "python"); diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 567619529070..782a7266f98c 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -57,74 +57,121 @@ model ResponseSession { agent?: AgentConfig; } -@doc("Voice configuration for Azure standard or platform voices.") -model AzureStandardVoice { - @doc("Name of the voice.") - name: string; +@doc("Supported OpenAI voice names (string enum).") +enum OAIVoice { + alloy: "alloy", + ash: "ash", + ballad: "ballad", + coral: "coral", + echo: "echo", + sage: "sage", + shimmer: "shimmer", + verse: "verse", +} - @doc("Voice type identifier.") - type: "azure-standard" | "azure-platform"; +@doc(""" +OpenAI voice configuration with explicit type field. - @doc("Optional temperature for generation.") - temperature?: float32; +This provides a unified interface for OpenAI voices, complementing the +existing string-based OAIVoice for backward compatibility. 
+""") +model OpenAIVoice { + type: "openai"; + name: OAIVoice; } -@doc("Voice configuration for Azure custom voice.") -model AzureCustomVoice { - @doc("Name of the voice.") - name: string; +// --- Azure voices ---------------------------------------------------------- - @doc("Custom endpoint ID.") - endpoint_id: string; +@doc("Base for Azure voice configurations.") +@discriminator("type") +model AzureVoice { + type: string; +} + +@doc("Azure custom voice configuration (preferred).") +model AzureCustomVoice extends AzureVoice { + type: "azure-custom"; - @doc("Voice type identifier.") - type: "azure-custom" | "custom"; + @minLength(1) @doc("Voice name cannot be empty.") name: string; + @minLength(1) @doc("Endpoint ID cannot be empty.") endpoint_id: string; - @doc("Optional temperature for generation.") + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; - @doc("Optional custom lexicon URL.") - custom_lexicon_url?: url; + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure standard voice configuration.") +model AzureStandardVoice extends AzureVoice { + type: "azure-standard"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; - @doc("Preferred locale list for voice rendering.") + custom_lexicon_url?: string; prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; } -@doc("Voice configuration for Azure personal voice.") -model AzurePersonalVoice { - @doc("Name of the voice.") - name: string; +@doc("Azure platform voice configuration (variant of standard).") +model AzurePlatformVoice extends AzureVoice { + type: "azure-platform"; - @doc("Voice type identifier.") - type: "azure-personal" | "personal"; + @minLength(1) @doc("Voice name cannot be empty.") name: string; - @doc("Personal voice model identifier.") - `model`: "DragonLatestNeural" | "PhoenixLatestNeural" | "PhoenixV2Neural"; + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; } -@doc("Voice identifier for OpenAI-provided voices.") -union OAIVoice { - "alloy", - "ash", - "ballad", - "coral", - "echo", - "sage", - "shimmer", - "verse" +@doc("Azure personal voice configuration.") +model AzurePersonalVoice extends AzureVoice { + type: "azure-personal"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; + + @doc("Underlying neural model to use for personal voice.") + `model`: "DragonLatestNeural" | "PhoenixLatestNeural" | "PhoenixV2Neural"; } +// --- Phi4mm voices --------------------------------------------------------- + @doc("Voice identifier for Phi4mm voices.") union Phi4mmVoice { "cosyvoice" } -@doc("Union of supported voice identifiers and configurations.") +// --- Top-level Voice union ------------------------------------------------- + +@doc("Union of all supported voice configurations.") union Voice { OAIVoice, - AzureStandardVoice, - AzureCustomVoice, - AzurePersonalVoice, + OpenAIVoice, + AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice 
Phi4mmVoice } @@ -163,10 +210,61 @@ union Modality { avatar: "avatar", } +@discriminator("model") +@doc("Top-level union for end-of-utterance (EOU) semantic detection configuration.") +model EOUDetection { + `model`: "semantic_detection_v1" | "semantic_detection_v1_en" | "semantic_detection_v1_multilingual"; +} + +@doc("Azure semantic end-of-utterance detection (default).") +model AzureSemanticDetection extends EOUDetection { + `model`: "semantic_detection_v1"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (English-optimized).") +model AzureSemanticDetectionEn extends EOUDetection { + `model`: "semantic_detection_v1_en"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (multilingual).") +model AzureSemanticDetectionMultilingual extends EOUDetection { + `model`: "semantic_detection_v1_multilingual"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + @discriminator("type") @doc("Top-level union for turn detection configuration.") model TurnDetection { - type: "none" | "server_vad" | "azure_semantic_vad"; + type: + | "none" + | "server_vad" + | "azure_semantic_vad" + | "azure_semantic_vad_en" + | "server_sd" + | "azure_semantic_vad_multilingual"; } @doc("Disables turn detection.") @@ -180,17 +278,76 @@ model ServerVad extends TurnDetection { threshold?: float32; prefix_padding_ms?: int32; silence_duration_ms?: int32; - end_of_utterance_detection?: unknown; + end_of_utterance_detection?: EOUDetection; + auto_truncate?: boolean = false; } -@doc("Semantic VAD settings based on Azure SDK features.") +@doc("Server Speech Detection (Azure semantic VAD, default variant).") model AzureSemanticVad extends TurnDetection { type: "azure_semantic_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD, English-only).") +model AzureSemanticVadEn extends TurnDetection { + type: "azure_semantic_vad_en"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (legacy `server_sd` alias).") +model AzureSemanticVadServer extends TurnDetection { + type: "server_sd"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: 
boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD).") +model AzureMultilingualSemanticVad extends TurnDetection { + type: "azure_semantic_vad_multilingual"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; neg_threshold?: float32; + speech_duration_ms?: int32; window_size?: int32; distinct_ci_phones?: int32; require_vowel?: boolean; - remove_filler_words?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; } @doc("Configuration for input audio noise reduction.") diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index 0ff24970022a..4568b4651bf2 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -4,97 +4,121 @@ using TypeSpec.OpenAPI; namespace VoiceLive; +union ItemType { + string, + message: "message", + function_call: "function_call", + function_call_output: "function_call_output", +} + +// Base for user content parts +@discriminator("type") +model UserContentPart { + type: string; +} + +// Variants +model InputTextContentPart extends UserContentPart { + type: "input_text"; + text: string; +} + +model InputAudioContentPart extends UserContentPart { + type: "input_audio"; + audio: string; + transcript?: string; +} + +@doc("Output text content part.") +model OutputTextContentPart { + type: "text"; + text: string; +} + +// Status enum +enum ItemParamStatus { + completed: "completed", + incomplete: "incomplete", +} + +@doc("Base for any response item; discriminated by `type`.") @discriminator("type") model ConversationRequestItem { - ...ConversationItemBase; type: ItemType; id?: string; } +// ----- Message Items ----- @discriminator("role") -model RequestMessageItem extends ConversationRequestItem { +model MessageItem extends ConversationRequestItem { type: ItemType.message; - role: MessageRole; - status?: ItemStatus; + role: string; + status?: ItemParamStatus; } -model RequestSystemMessageItem extends RequestMessageItem { - role: MessageRole.system; - content: RequestTextContentPart[]; +model SystemMessageItem extends MessageItem { + role: "system"; + content: InputTextContentPart[]; } -model RequestUserMessageItem extends RequestMessageItem { - role: MessageRole.user; - content: (RequestTextContentPart | RequestAudioContentPart)[]; +model UserMessageItem extends MessageItem { + role: "user"; + content: UserContentPart[]; } -model RequestAssistantMessageItem extends RequestMessageItem { - role: MessageRole.assistant; - content: RequestTextContentPart[]; +model AssistantMessageItem extends MessageItem { + role: "assistant"; + content: OutputTextContentPart[]; } -model RequestFunctionCallItem extends ConversationRequestItem { +// ----- Function Call Items ----- +model FunctionCallItem extends ConversationRequestItem { type: ItemType.function_call; name: string; call_id: string; arguments: string; - status?: ItemStatus; + status?: ItemParamStatus; } -model RequestFunctionCallOutputItem - extends ConversationRequestItem { +model FunctionCallOutputItem extends ConversationRequestItem { type: ItemType.function_call_output; call_id: string; output: string; -} - -// TODO: representation of a doubly-discriminated type with an absent second discriminator -// (first discriminator: type = message; 
second discriminator: no role present) - -model RequestMessageReferenceItem { // extends ConversationRequestItem { - type: ItemType.message; - id: string; + status?: ItemParamStatus; } @discriminator("type") -model ConversationResponseItem { - ...ConversationItemBase; - object?: "realtime.item"; +model ResponseItem { + // must stay here, required, broad type type: ItemType; id?: string; + object?: "realtime.item"; } -model ResponseMessageItem extends ConversationResponseItem { +model ResponseMessageItem extends ResponseItem { type: ItemType.message; role: MessageRole; content: ContentPart[]; - status: ItemStatus; + status: ResponseItemStatus; } model ResponseFunctionCallItem - extends ConversationResponseItem { + extends ResponseItem { type: ItemType.function_call; name: string; call_id: string; arguments: string; - status: ItemStatus; + status: ResponseItemStatus; } model ResponseFunctionCallOutputItem - extends ConversationResponseItem { + extends ResponseItem { type: ItemType.function_call_output; call_id: string; output: string; } -union ItemType { - string, - message: "message", - function_call: "function_call", - function_call_output: "function_call_output", -} - -union ItemStatus { +union ResponseItemStatus { string, in_progress: "in_progress", completed: "completed", @@ -108,32 +132,76 @@ union MessageRole { assistant: "assistant", } -// Tool generated type. Extracts from ConversationItemWithReference.content -alias ConversationItemWithReferenceContent = { - @doc(""" - The content type (`input_text`, `input_audio`, `item_reference`, `text`). - """) - type?: "input_audio" | "input_text" | "item_reference" | "text"; +@doc("Terminal status of a response.") +enum ResponseStatus { + completed: "completed", + cancelled: "cancelled", + failed: "failed", + incomplete: "incomplete", + in_progress: "in_progress", +} - @doc(""" - The text content, used for `input_text` and `text` content types. - """) - text?: string; +@doc("Base for all non-success response details.") +@discriminator("type") // or just @discriminator("type") if imported unqualified +model ResponseStatusDetails { + // Required discriminator key on the base; keep it as a broad string. + type: string; +} - @doc(""" - ID of a previous conversation item to reference (for `item_reference` - content types in `response.create` events). These can reference both - client and server created items. - """) - id?: string; +@doc("Details for a cancelled response.") +model ResponseCancelledDetails extends ResponseStatusDetails { + // Narrow the discriminator to a literal in each child: + type: "cancelled"; + reason: "turn_detected" | "client_cancelled"; +} - @doc(""" - Base64-encoded audio bytes, used for `input_audio` content type. - """) - audio?: string; +@doc("Details for an incomplete response.") +model ResponseIncompleteDetails extends ResponseStatusDetails { + type: "incomplete"; + reason: "max_output_tokens" | "content_filter"; +} - @doc(""" - The transcript of the audio, used for `input_audio` content type. 
- """) - transcript?: string; -}; \ No newline at end of file +@doc("Details for a failed response.") +model ResponseFailedDetails extends ResponseStatusDetails { + type: "failed"; + error: unknown; +} + +@doc("Details of input token usage.") +model InputTokenDetails { + @doc("Number of cached tokens used in the input.") + cached_tokens: int32; + + @doc("Number of text tokens used in the input.") + text_tokens: int32; + + @doc("Number of audio tokens used in the input.") + audio_tokens: int32; +} + +@doc("Details of output token usage.") +model OutputTokenDetails { + @doc("Number of text tokens generated in the output.") + text_tokens: int32; + + @doc("Number of audio tokens generated in the output.") + audio_tokens: int32; +} + +@doc("Overall usage statistics for a response.") +model Usage { + @doc("Total number of tokens (input + output).") + total_tokens: int32; + + @doc("Number of input tokens.") + input_tokens: int32; + + @doc("Number of output tokens.") + output_tokens: int32; + + @doc("Detailed breakdown of input tokens.") + input_token_details: InputTokenDetails; + + @doc("Detailed breakdown of output tokens.") + output_token_details: OutputTokenDetails; +} diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index f77e199aefa9..5387eb699557 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -128,49 +128,17 @@ model Response { object?: "realtime.response"; @doc(""" - The final status of the response (`completed`, `cancelled`, `failed`, or - `incomplete`). - """) - status?: "completed" | "cancelled" | "failed" | "incomplete" | "in_progress"; + The final status of the response. + One of: `completed`, `cancelled`, `failed`, `incomplete`, or `in_progress`. + """) + status?: ResponseStatus; /** Additional details about the status. */ - status_details?: { - @doc(""" - The type of error that caused the response to fail, corresponding - with the `status` field (`completed`, `cancelled`, `incomplete`, - `failed`). - """) - type?: "completed" | "cancelled" | "failed" | "incomplete"; - - @doc(""" - The reason the Response did not complete. For a `cancelled` Response, - one of `turn_detected` (the server VAD detected a new start of speech) - or `client_cancelled` (the client sent a cancel event). For an - `incomplete` Response, one of `max_output_tokens` or `content_filter` - (the server-side safety filter activated and cut off the response). - """) - reason?: - | "turn_detected" - | "client_cancelled" - | "max_output_tokens" - | "content_filter"; - - @doc(""" - A description of the error that caused the response to fail, - populated when the `status` is `failed`. - """) - error?: { - /** The type of error. */ - type?: string; - - /** Error code, if any. */ - code?: string; - }; - }; + status_details?: ResponseStatusDetails; // Tool customization: apply enriched response-specific type /** The list of output items generated by the response. */ - output?: ConversationResponseItem[]; + output?: ResponseItem[]; /** * Usage statistics for the Response, this will correspond to billing. A @@ -178,46 +146,7 @@ model Response { * Items to the Conversation, thus output from previous turns (text and * audio tokens) will become the input for later turns. */ - usage?: { - /** - * The total number of tokens in the Response including input and output - * text and audio tokens. 
- */ - total_tokens?: int32; - - /** - * The number of input tokens used in the Response, including text and - * audio tokens. - */ - input_tokens?: int32; - - /** - * The number of output tokens sent in the Response, including text and - * audio tokens. - */ - output_tokens?: int32; - - /** Details about the input tokens used in the Response. */ - input_token_details?: { - /** The number of cached tokens used in the Response. */ - cached_tokens?: int32; - - /** The number of text tokens used in the Response. */ - text_tokens?: int32; - - /** The number of audio tokens used in the Response. */ - audio_tokens?: int32; - }; - - /** Details about the output tokens used in the Response. */ - output_token_details?: { - /** The number of text tokens used in the Response. */ - text_tokens?: int32; - - /** The number of audio tokens used in the Response. */ - audio_tokens?: int32; - }; - }; + usage?: Usage; @doc(""" Which conversation the response is added to, determined by the `conversation` @@ -347,7 +276,7 @@ model ClientEventConversationItemCreate extends ClientEvent { previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item?: ConversationItemWithReference; + item?: ConversationRequestItem; } // Tool customization (apply_discriminator): apply discriminated type base @@ -648,10 +577,10 @@ model ServerEventConversationItemCreated extends ServerEvent { * The ID of the preceding item in the Conversation context, allows the * client to understand the order of the conversation. */ - previous_item_id: string; + previous_item_id?: string; // Tool customization: apply enriched item definition hierarchy - item?: ConversationItemWithReference; + item?: ResponseItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -798,7 +727,7 @@ model ServerEventResponseOutputItemAdded extends ServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: ConversationItemWithReference; + item?: ResponseItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -819,7 +748,7 @@ model ServerEventResponseOutputItemDone extends ServerEvent { output_index: int32; // Tool customization: apply enriched item definition hierarchy - item?: ConversationResponseItem; + item?: ResponseItem; } // Tool customization (apply_discriminator): apply discriminated type @@ -1199,74 +1128,6 @@ model ResponseCreateParams { max_output_tokens?: int32 | "inf"; } -/** The item to add to the conversation. */ -model ConversationItemWithReference { - @doc(""" - For an item of type (`message` | `function_call` | `function_call_output`) - this field allows the client to assign the unique ID of the item. It is - not required because the server will generate one if not provided. - - For an item of type `item_reference`, this field is required and is a - reference to any item that has previously existed in the conversation. - """) - id?: string; - - @doc(""" - The type of the item (`message`, `function_call`, `function_call_output`, `item_reference`). - """) - type?: "message" | "function_call" | "function_call_output"; - - @doc(""" - Identifier for the API object being returned - always `realtime.item`. - """) - object?: "realtime.item"; - - @doc(""" - The status of the item (`completed`, `incomplete`). These have no effect - on the conversation, but are accepted for consistency with the - `conversation.item.created` event. 
- """) - status?: "completed" | "incomplete"; - - @doc(""" - The role of the message sender (`user`, `assistant`, `system`), only - applicable for `message` items. - """) - role?: "user" | "assistant" | "system"; - - @doc(""" - The content of the message, applicable for `message` items. - - Message items of role `system` support only `input_text` content - - Message items of role `user` support `input_text` and `input_audio` - content - - Message items of role `assistant` support `text` content. - """) - content?: ConversationItemWithReferenceContent[]; - - @doc(""" - The ID of the function call (for `function_call` and - `function_call_output` items). If passed on a `function_call_output` - item, the server will check that a `function_call` item with the same - ID exists in the conversation history. - """) - call_id?: string; - - @doc(""" - The name of the function being called (for `function_call` items). - """) - name?: string; - - @doc(""" - The arguments of the function call (for `function_call` items). - """) - arguments?: string; - - @doc(""" - The output of the function call (for `function_call_output` items). - """) - output?: string; -} - // Tool customization (apply_discriminator): apply discriminated type base @doc(""" Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. @@ -1316,7 +1177,8 @@ model ServerEventConversationItemRetrieved extends ServerEvent { """) type: ServerEventType.conversation_item_retrieved; - item_id?: string; + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; event_id?: string; } From e46295d7a88b2b2cc16b1ae4014ce4f8400ff8a7 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Mon, 25 Aug 2025 18:27:00 -0700 Subject: [PATCH 35/48] Change some names for C# --- specification/ai/data-plane/VoiceLive/client.tsp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 9abd88a575cf..89129bf61fe6 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -77,10 +77,18 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); -@@clientName(VoiceLive.Tool, "ToolCall", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolInvocation", "csharp"); @@clientName(VoiceLive.AzureCustomVoice.custom_lexicon_url, "CustomLexiconUri", "csharp"); @@clientName(VoiceLive.IceServer.urls, "Uris", "csharp"); @@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); @@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); @@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); @@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); +@@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); +@@clientName(VoiceLive.ToolCall, "VoiceLiveToolDefinition", "csharp"); + +@@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); +@@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); +@@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); +@@clientName(VoiceLive.ServerEvent, "ServerEventBase", "csharp"); + From 1b1be21651b7bcf4b6164b51eaca478f6d56d7a4 Mon Sep 17 00:00:00 2001 From: Ryan Hurey 
Date: Mon, 25 Aug 2025 18:33:08 -0700 Subject: [PATCH 36/48] Wrong name --- specification/ai/data-plane/VoiceLive/client.tsp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 89129bf61fe6..f937fe8dd4d5 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -85,7 +85,7 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); @@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); @@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); -@@clientName(VoiceLive.ToolCall, "VoiceLiveToolDefinition", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); @@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); @@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); From d1251146f6ca3b94c53d70da28dfa7e3b36b1a42 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Mon, 25 Aug 2025 18:49:43 -0700 Subject: [PATCH 37/48] Rename Usage in C# --- specification/ai/data-plane/VoiceLive/client.tsp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index f937fe8dd4d5..61e1d3bff15e 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -91,4 +91,4 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); @@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); @@clientName(VoiceLive.ServerEvent, "ServerEventBase", "csharp"); - +@@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); From 7249bb5b7c59f872706b60b79ff6bec843cd95dd Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 27 Aug 2025 13:37:24 -0700 Subject: [PATCH 38/48] Update voice possibilities --- specification/ai/data-plane/VoiceLive/client.tsp | 2 +- specification/ai/data-plane/VoiceLive/custom.tsp | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 61e1d3bff15e..d701ad2d0e40 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -86,9 +86,9 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); @@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); @@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); - @@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); @@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); @@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); @@clientName(VoiceLive.ServerEvent, "ServerEventBase", "csharp"); @@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); +@@clientName(VoiceLive.Phi4mmVoice, "LLMVoiceName", "csharp"); \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 782a7266f98c..79341d5f7388 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -58,7 +58,8 @@ model ResponseSession { } @doc("Supported OpenAI voice names (string 
enum).") -enum OAIVoice { +union OAIVoice { + string, alloy: "alloy", ash: "ash", ballad: "ballad", @@ -162,9 +163,18 @@ model AzurePersonalVoice extends AzureVoice { @doc("Voice identifier for Phi4mm voices.") union Phi4mmVoice { + string, "cosyvoice" } +@doc(""" +Voice configuration for LLM (Large Language Model) voices. +""") +model LLMVoice { + type: "llm"; + name: Phi4mmVoice; +} + // --- Top-level Voice union ------------------------------------------------- @doc("Union of all supported voice configurations.") @@ -172,7 +182,8 @@ union Voice { OAIVoice, OpenAIVoice, AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice - Phi4mmVoice + Phi4mmVoice, + LLMVoice } union AudioFormat { From 1326fbbc623cd572bd3bb266f48d56ca15363845 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 27 Aug 2025 13:56:24 -0700 Subject: [PATCH 39/48] C# customizations to make Voice internal when binary --- specification/ai/data-plane/VoiceLive/client.tsp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index d701ad2d0e40..4f98214c47e0 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -74,6 +74,7 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ClientEventSessionUpdate, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); +@@access(VoiceLive.Response.voice, Access.internal, "csharp"); @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); @@ -83,6 +84,8 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); @@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); @@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); +@@clientName(VoiceLive.Response.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.ResponseSession.voice, "VoiceInternal", "csharp"); @@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); @@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); @@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); From c9cd8f0cd92a9b186ad86e561500fb7be3872dc2 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Thu, 28 Aug 2025 15:56:37 -0700 Subject: [PATCH 40/48] Update names in C#, again --- .../ai/data-plane/VoiceLive/client.tsp | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 4f98214c47e0..5d43f140cba1 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -55,6 +55,7 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); @@access(VoiceLive.ServerEvent, Access.public, "csharp"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "csharp"); @@access(VoiceLive.ClientEvent, Access.internal, "csharp"); @@access(VoiceLive.ClientEventConversationItemCreate, Access.internal, "csharp"); @@access(VoiceLive.ClientEventConversationItemDelete, Access.internal, "csharp"); @@ -92,6 +93,42 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); @@clientName(VoiceLive.MessageRole, 
"ResponseMessageRole", "csharp"); @@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); -@@clientName(VoiceLive.ServerEvent, "ServerEventBase", "csharp"); +@@clientName(VoiceLive.ServerEvent, "SessionUpdate", "csharp"); +@@clientName(VoiceLive.ServerEventSessionCreated, "SessionUpdateSessionCreated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionUpdated, "SessionUpdateSessionUpdated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionAvatarConnecting, "SessionUpdateSessionAvatarConnecting", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStarted, "SessionUpdateInputAudioBufferSpeechStarted", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped, "SessionUpdateInputAudioBufferSpeechStopped", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCleared, "SessionUpdateInputAudioBufferCleared", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCommitted, "SessionUpdateInputAudioBufferCommitted", "csharp"); +@@clientName(VoiceLive.ServerEventResponseCreated, "SessionUpdateResponseCreated", "csharp"); +@@clientName(VoiceLive.ServerEventResponseDone, "SessionUpdateResponseDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDelta, "SessionUpdateResponseAudioDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDone, "SessionUpdateResponseAudioDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDelta, "SessionUpdateResponseAudioTranscriptDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDone, "SessionUpdateResponseAudioTranscriptDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDelta, "SessionUpdateResponseAudioTimestampDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDone, "SessionUpdateResponseAudioTimestampDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDelta, "SessionUpdateResponseTextDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDone, "SessionUpdateResponseTextDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartAdded, "SessionUpdateResponseContentPartAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartDone, "SessionUpdateResponseContentPartDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemAdded, "SessionUpdateResponseOutputItemAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemDone, "SessionUpdateResponseOutputItemDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDelta, "SessionUpdateResponseFunctionCallArgumentsDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDone, "SessionUpdateResponseFunctionCallArgumentsDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDelta, "SessionUpdateResponseAnimationBlendshapeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDone, "SessionUpdateResponseAnimationBlendshapeDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseEmotionHypothesis, "SessionUpdateResponseEmotionHypothesis", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDelta, "SessionUpdateResponseAnimationVisemeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDone, "SessionUpdateResponseAnimationVisemeDone", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemCreated, "SessionUpdateConversationItemCreated", "csharp"); 
+@@clientName(VoiceLive.ServerEventConversationItemDeleted, "SessionUpdateConversationItemDeleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemRetrieved, "SessionUpdateConversationItemRetrieved", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemTruncated, "SessionUpdateConversationItemTruncated", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, "SessionUpdateConversationItemInputAudioTranscriptionCompleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, "SessionUpdateConversationItemInputAudioTranscriptionDelta", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, "SessionUpdateConversationItemInputAudioTranscriptionFailed", "csharp"); +@@clientName(VoiceLive.ServerEventError, "SessionUpdateError", "csharp"); @@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); @@clientName(VoiceLive.Phi4mmVoice, "LLMVoiceName", "csharp"); \ No newline at end of file From 4ebceafcd54714d15d9c56ac6b203881e2b86ee3 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 3 Sep 2025 18:51:15 -0700 Subject: [PATCH 41/48] More C# name updates --- specification/ai/data-plane/VoiceLive/client.tsp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 5d43f140cba1..fdc1798f0a8c 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -87,6 +87,7 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); @@clientName(VoiceLive.Response.voice, "VoiceInternal", "csharp"); @@clientName(VoiceLive.ResponseSession.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.Response.modalities, "ModalitiesInternal", "csharp"); @@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); @@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); @@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); @@ -131,4 +132,6 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, "SessionUpdateConversationItemInputAudioTranscriptionFailed", "csharp"); @@clientName(VoiceLive.ServerEventError, "SessionUpdateError", "csharp"); @@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); -@@clientName(VoiceLive.Phi4mmVoice, "LLMVoiceName", "csharp"); \ No newline at end of file +@@clientName(VoiceLive.Phi4mmVoice, "LlmVoiceName", "csharp"); +@@clientName(VoiceLive.EOUDetection, "EouDetection", "csharp"); +@@clientName(VoiceLive.LLMVoice, "LlmVoice", "csharp"); From 83ee88e956df84574d5a592f074216a24be707fe Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Thu, 4 Sep 2025 19:01:25 -0700 Subject: [PATCH 42/48] Make the audio transcription model open ended. --- specification/ai/data-plane/VoiceLive/custom.tsp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index 79341d5f7388..bbd4782ce234 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -201,7 +201,7 @@ union AudioInputTranscriptionModel { @doc("Configuration for input audio transcription.") model AudioInputTranscriptionSettings { @doc("The model used for transcription. 
E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") - `model`: "whisper-1" | "azure-fast-transcription" | "s2s-ingraph"; + `model`: string | "whisper-1" | "azure-fast-transcription" | "s2s-ingraph" | "azure-speech"; @doc("The language code to use for transcription, if specified.") language?: string; From 4790dcf70a7903da2d21aa1585c04fea0e942e1d Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Wed, 10 Sep 2025 11:29:56 -0700 Subject: [PATCH 43/48] Mid update --- specification/ai/data-plane/VoiceLive/client.tsp | 4 +++- specification/ai/data-plane/VoiceLive/models.tsp | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index fdc1798f0a8c..cf03fabc50d0 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -88,7 +88,7 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Response.voice, "VoiceInternal", "csharp"); @@clientName(VoiceLive.ResponseSession.voice, "VoiceInternal", "csharp"); @@clientName(VoiceLive.Response.modalities, "ModalitiesInternal", "csharp"); -@@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); +@@clientName(VoiceLive.AgentConfig, "RespondingAgentOptions", "csharp"); @@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); @@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); @@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); @@ -131,7 +131,9 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, "SessionUpdateConversationItemInputAudioTranscriptionDelta", "csharp"); @@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, "SessionUpdateConversationItemInputAudioTranscriptionFailed", "csharp"); @@clientName(VoiceLive.ServerEventError, "SessionUpdateError", "csharp"); +@@clientName(VoiceLive.ServerEventErrorDetails, "SessionUpdateErrorDetails", "csharp"); @@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); @@clientName(VoiceLive.Phi4mmVoice, "LlmVoiceName", "csharp"); @@clientName(VoiceLive.EOUDetection, "EouDetection", "csharp"); @@clientName(VoiceLive.LLMVoice, "LlmVoice", "csharp"); +@@clientName(VoiceLive.AvatarConfig, "AvatarConfiguration", "csharp"); \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 5387eb699557..282c233f9cf3 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -397,7 +397,11 @@ model ServerEventError extends ServerEvent { type: ServerEventType.error; /** Details of the error. */ - error: { + error: ServerEventErrorDetails; +} + +/** Details of the error. */ +model ServerEventErrorDetails { /** The type of error (e.g., "invalid_request_error", "server_error"). */ type: string; @@ -413,7 +417,6 @@ model ServerEventError extends ServerEvent { /** The event_id of the client event that caused the error, if applicable. 
*/ event_id?: string | null; }; -} // Tool customization (apply_discriminator): apply discriminated type /** From 61fcb141205bc642805f1f8ded6ca65cd5763de6 Mon Sep 17 00:00:00 2001 From: Xiting Zhang Date: Fri, 12 Sep 2025 12:28:36 -0700 Subject: [PATCH 44/48] update AudioInputTranscriptionSettings --- .../ai/data-plane/VoiceLive/custom.tsp | 78 ++++++++++++------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index bbd4782ce234..f54e34dfd3d9 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -4,7 +4,6 @@ import "./custom/tools.tsp"; import "@typespec/http"; import "@typespec/openapi"; - using TypeSpec.OpenAPI; namespace VoiceLive; @@ -71,11 +70,11 @@ union OAIVoice { } @doc(""" -OpenAI voice configuration with explicit type field. - -This provides a unified interface for OpenAI voices, complementing the -existing string-based OAIVoice for backward compatibility. -""") + OpenAI voice configuration with explicit type field. + + This provides a unified interface for OpenAI voices, complementing the + existing string-based OAIVoice for backward compatibility. + """) model OpenAIVoice { type: "openai"; name: OAIVoice; @@ -92,11 +91,11 @@ model AzureVoice { @doc("Azure custom voice configuration (preferred).") model AzureCustomVoice extends AzureVoice { type: "azure-custom"; - @minLength(1) @doc("Voice name cannot be empty.") name: string; @minLength(1) @doc("Endpoint ID cannot be empty.") endpoint_id: string; - @minValue(0) @maxValue(1) + @minValue(0) + @maxValue(1) @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; @@ -112,11 +111,12 @@ model AzureCustomVoice extends AzureVoice { @doc("Azure standard voice configuration.") model AzureStandardVoice extends AzureVoice { type: "azure-standard"; - @minLength(1) @doc("Voice name cannot be empty.") name: string; - @minValue(0) @maxValue(1) - @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + @minValue(0) + @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; custom_lexicon_url?: string; prefer_locales?: string[]; @@ -130,11 +130,12 @@ model AzureStandardVoice extends AzureVoice { @doc("Azure platform voice configuration (variant of standard).") model AzurePlatformVoice extends AzureVoice { type: "azure-platform"; - @minLength(1) @doc("Voice name cannot be empty.") name: string; - @minValue(0) @maxValue(1) - @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + @minValue(0) + @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; custom_lexicon_url?: string; prefer_locales?: string[]; @@ -148,10 +149,10 @@ model AzurePlatformVoice extends AzureVoice { @doc("Azure personal voice configuration.") model AzurePersonalVoice extends AzureVoice { type: "azure-personal"; - @minLength(1) @doc("Voice name cannot be empty.") name: string; - @minValue(0) @maxValue(1) + @minValue(0) + @maxValue(1) @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; @@ -164,12 +165,12 @@ model AzurePersonalVoice extends AzureVoice { @doc("Voice identifier for Phi4mm voices.") union Phi4mmVoice { string, - "cosyvoice" + "cosyvoice", } @doc(""" -Voice configuration for LLM (Large Language Model) voices. -""") + Voice configuration for LLM (Large Language Model) voices. 
+ """) model LLMVoice { type: "llm"; name: Phi4mmVoice; @@ -181,9 +182,9 @@ model LLMVoice { union Voice { OAIVoice, OpenAIVoice, - AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice + AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice Phi4mmVoice, - LLMVoice + LLMVoice, } union AudioFormat { @@ -200,17 +201,27 @@ union AudioInputTranscriptionModel { @doc("Configuration for input audio transcription.") model AudioInputTranscriptionSettings { - @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") - `model`: string | "whisper-1" | "azure-fast-transcription" | "s2s-ingraph" | "azure-speech"; - - @doc("The language code to use for transcription, if specified.") + @doc(""" + The transcription model to use. Supported values: + 'whisper-1', 'gpt-4o-transcribe', 'gpt-4o-mini-transcribe', + 'azure-fast-transcription', 'azure-speech'. + """) + `model`: + | string + | "whisper-1" + | "gpt-4o-transcribe" + | "gpt-4o-mini-transcribe" + | "azure-fast-transcription" + | "azure-speech"; + + @doc("Optional BCP-47 language code (e.g., 'en-US').") language?: string; - @doc("Whether transcription is enabled.") - enabled: boolean; + @doc("Optional configuration for custom speech models.") + custom_speech?: Record; - @doc("Whether a custom model is being used.") - custom_model: boolean; + @doc("Optional list of phrase hints to bias recognition.") + phrase_list?: string[]; } union Modality { @@ -235,8 +246,10 @@ model AzureSemanticDetection extends EOUDetection { secondary_threshold?: float32; secondary_timeout?: float32; disable_rules?: boolean; + // developer options sr_boost?: float32; + extra_imend_check?: boolean; } @@ -248,8 +261,10 @@ model AzureSemanticDetectionEn extends EOUDetection { secondary_threshold?: float32; secondary_timeout?: float32; disable_rules?: boolean; + // developer options sr_boost?: float32; + extra_imend_check?: boolean; } @@ -261,8 +276,10 @@ model AzureSemanticDetectionMultilingual extends EOUDetection { secondary_threshold?: float32; secondary_timeout?: float32; disable_rules?: boolean; + // developer options sr_boost?: float32; + extra_imend_check?: boolean; } @@ -385,6 +402,7 @@ model AudioEchoCancellation { @doc("Output timestamp types supported in audio response content.") union AudioTimestampType { string, + @doc("Timestamps per word in the output audio.") word: "word", } @@ -481,4 +499,4 @@ model VideoResolution { @doc("Height of the video in pixels. 
Must be greater than 0.") height: int32; -} \ No newline at end of file +} From ed41202e42599caa36119be9a5e2815532d72dea Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Fri, 12 Sep 2025 19:11:33 -0700 Subject: [PATCH 45/48] more updates --- .../ai/data-plane/VoiceLive/client.tsp | 2 ++ .../ai/data-plane/VoiceLive/custom.tsp | 23 ++----------------- .../ai/data-plane/VoiceLive/models.tsp | 2 -- 3 files changed, 4 insertions(+), 23 deletions(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index fdc1798f0a8c..56981411f1fe 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -135,3 +135,5 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.Phi4mmVoice, "LlmVoiceName", "csharp"); @@clientName(VoiceLive.EOUDetection, "EouDetection", "csharp"); @@clientName(VoiceLive.LLMVoice, "LlmVoice", "csharp"); +@@clientName(VoiceLive.RequestSession, "VoiceLiveSessionOptions", "csharp"); +@@clientName(VoiceLive.ResponseSession, "VoiceLiveSessionResponse", "csharp"); \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp index bbd4782ce234..8d59e62c9120 100644 --- a/specification/ai/data-plane/VoiceLive/custom.tsp +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -30,31 +30,12 @@ model RequestSession { tool_choice?: ToolChoice; temperature?: float32; max_response_output_tokens?: int32 | "inf"; + agent?: AgentConfig; } -model ResponseSession { +model ResponseSession extends RequestSession{ ...SessionBase; id?: string; - `model`?: string; - modalities?: Modality[]; - instructions?: string; - animation?: Animation; - voice?: Voice; - input_audio?: InputAudio; - input_audio_format?: AudioFormat; - output_audio_format?: AudioFormat; - input_audio_sampling_rate?: int32; - turn_detection?: TurnDetection; - input_audio_noise_reduction?: AudioNoiseReduction; - input_audio_echo_cancellation?: AudioEchoCancellation; - avatar?: AvatarConfig; - input_audio_transcription?: AudioInputTranscriptionSettings | null; - output_audio_timestamp_types?: AudioTimestampType[]; - tools?: Tool[]; - tool_choice?: ToolChoice; - temperature?: float32; - max_response_output_tokens?: int32 | "inf" | null; - agent?: AgentConfig; } @doc("Supported OpenAI voice names (string enum).") diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 5387eb699557..5e0b1dceec2d 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -934,8 +934,6 @@ model ServerEventResponseAudioDelta extends ServerEvent { /** Base64-encoded audio data delta. 
*/ @encode("base64") delta: bytes; - - event_id?: string; } // Tool customization (apply_discriminator): apply discriminated type From 6467537fd117652391ef6f0c653db8958822ca31 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Sat, 13 Sep 2025 19:03:03 -0700 Subject: [PATCH 46/48] Updates --- specification/ai/data-plane/VoiceLive/client.tsp | 3 +++ .../ai/data-plane/VoiceLive/custom/items.tsp | 12 ++++++++++++ specification/ai/data-plane/VoiceLive/models.tsp | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 54345285dab5..6c31638178cd 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -94,6 +94,7 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); @@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); @@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); +@@clientName(VoiceLive.ResponseItemStatus, "VoiceLiveResponseItemStatus", "csharp"); @@clientName(VoiceLive.ServerEvent, "SessionUpdate", "csharp"); @@clientName(VoiceLive.ServerEventSessionCreated, "SessionUpdateSessionCreated", "csharp"); @@clientName(VoiceLive.ServerEventSessionUpdated, "SessionUpdateSessionUpdated", "csharp"); @@ -139,3 +140,5 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.RequestSession, "VoiceLiveSessionOptions", "csharp"); @@clientName(VoiceLive.ResponseSession, "VoiceLiveSessionResponse", "csharp"); @@clientName(VoiceLive.AvatarConfig, "AvatarConfiguration", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStarted.audio_start_ms, "AudioStartMs", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped.audio_end_ms, "AudioEndMs", "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp index 4568b4651bf2..a4208e27356c 100644 --- a/specification/ai/data-plane/VoiceLive/custom/items.tsp +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -177,6 +177,9 @@ model InputTokenDetails { @doc("Number of audio tokens used in the input.") audio_tokens: int32; + + @doc("Details of cached token usage.") + cached_tokens_details: CachedTokenDetails; } @doc("Details of output token usage.") @@ -188,6 +191,15 @@ model OutputTokenDetails { audio_tokens: int32; } +@doc("Details of output token usage.") +model CachedTokenDetails { + @doc("Number of cached text tokens.") + text_tokens: int32; + + @doc("Number of cached audio tokens.") + audio_tokens: int32; +} + @doc("Overall usage statistics for a response.") model Usage { @doc("Total number of tokens (input + output).") diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp index 519e06d40b9b..8d6ee0a09913 100644 --- a/specification/ai/data-plane/VoiceLive/models.tsp +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -174,7 +174,7 @@ model Response { @doc(""" The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. """) - output_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw"; + output_audio_format?: AudioFormat = AudioFormat.pcm16; /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. 
*/ temperature?: float32; From 213f075856754927c9924eed5e8ae1c0ecd7d5d3 Mon Sep 17 00:00:00 2001 From: Ryan Hurey Date: Sun, 14 Sep 2025 17:18:44 -0700 Subject: [PATCH 47/48] Tool work for C# --- specification/ai/data-plane/VoiceLive/client.tsp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp index 6c31638178cd..10596079ce54 100644 --- a/specification/ai/data-plane/VoiceLive/client.tsp +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -76,6 +76,9 @@ using Azure.ClientGenerator.Core; @@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); @@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); @@access(VoiceLive.Response.voice, Access.internal, "csharp"); +@@access(VoiceLive.ToolChoiceFunctionObject, Access.internal, "csharp"); +@@access(VoiceLive.ToolChoiceObject, Access.internal, "csharp"); + @@clientName(VoiceLive.Modality, "InputModality", "csharp"); @@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); @@ -141,4 +144,4 @@ using Azure.ClientGenerator.Core; @@clientName(VoiceLive.ResponseSession, "VoiceLiveSessionResponse", "csharp"); @@clientName(VoiceLive.AvatarConfig, "AvatarConfiguration", "csharp"); @@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStarted.audio_start_ms, "AudioStartMs", "csharp"); -@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped.audio_end_ms, "AudioEndMs", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped.audio_end_ms, "AudioEndMs", "csharp"); \ No newline at end of file From f99d32b66214505357e68b3523abd2ad209da678 Mon Sep 17 00:00:00 2001 From: JoshLove-msft <54595583+JoshLove-msft@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:15:57 -0700 Subject: [PATCH 48/48] add emitter-output-dir --- specification/ai/data-plane/VoiceLive/tspconfig.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml index a96a6cd87f9a..a01bd2bc0fb9 100644 --- a/specification/ai/data-plane/VoiceLive/tspconfig.yaml +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -31,6 +31,7 @@ options: "@azure-typespec/http-client-csharp": namespace: Azure.AI.VoiceLive model-namespace: false + emitter-output-dir: "{output-dir}/{service-dir}/{namespace}" "@azure-tools/typespec-ts": package-dir: "azure-ai-voicelive" package-details: @@ -52,4 +53,4 @@ options: flavor: azure "@azure-tools/typespec-client-generator-cli": additionalDirectories: - - "specification/ai/data-plane/VoiceLive/" \ No newline at end of file + - "specification/ai/data-plane/VoiceLive/"