diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp new file mode 100644 index 000000000000..fdc1798f0a8c --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -0,0 +1,137 @@ +import "@azure-tools/typespec-client-generator-core"; +import "./servers/websocket.tsp"; + +using Azure.ClientGenerator.Core; + +@@access(VoiceLive.force_models, Access.internal, "python"); +@@access(VoiceLive.force_models, Access.internal, "csharp"); + +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.public, "python"); + + +@@access(VoiceLive.ServerEventSessionAvatarConnecting, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDelta, Access.public, "python"); +@@access(VoiceLive.ResponseItem, Access.public, "python"); +@@access(VoiceLive.Response, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemDeleted, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemRetrieved, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemTruncated, Access.public, "python"); +@@access(VoiceLive.ServerEventError, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCleared, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCommitted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStarted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStopped, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartDone, 
Access.public, "python"); +@@access(VoiceLive.ServerEventResponseCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDone, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); + +@@access(VoiceLive.ServerEvent, Access.public, "csharp"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "csharp"); +@@access(VoiceLive.ClientEvent, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCancel, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); +@@access(VoiceLive.Response.voice, Access.internal, "csharp"); + +@@clientName(VoiceLive.Modality, "InputModality", "csharp"); +@@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolInvocation", "csharp"); +@@clientName(VoiceLive.AzureCustomVoice.custom_lexicon_url, "CustomLexiconUri", "csharp"); +@@clientName(VoiceLive.IceServer.urls, "Uris", "csharp"); +@@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); +@@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); +@@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); +@@clientName(VoiceLive.Response.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.ResponseSession.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.Response.modalities, "ModalitiesInternal", "csharp"); +@@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); +@@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); +@@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); +@@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); +@@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); +@@clientName(VoiceLive.ServerEvent, "SessionUpdate", "csharp"); 
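+// ServerEvent-derived types are surfaced to C# callers under a "SessionUpdate" prefix; the renames below apply that convention (ServerEventXxx -> SessionUpdateXxx), mirroring the base rename above.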
+@@clientName(VoiceLive.ServerEventSessionCreated, "SessionUpdateSessionCreated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionUpdated, "SessionUpdateSessionUpdated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionAvatarConnecting, "SessionUpdateSessionAvatarConnecting", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStarted, "SessionUpdateInputAudioBufferSpeechStarted", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped, "SessionUpdateInputAudioBufferSpeechStopped", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCleared, "SessionUpdateInputAudioBufferCleared", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCommitted, "SessionUpdateInputAudioBufferCommitted", "csharp"); +@@clientName(VoiceLive.ServerEventResponseCreated, "SessionUpdateResponseCreated", "csharp"); +@@clientName(VoiceLive.ServerEventResponseDone, "SessionUpdateResponseDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDelta, "SessionUpdateResponseAudioDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDone, "SessionUpdateResponseAudioDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDelta, "SessionUpdateResponseAudioTranscriptDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDone, "SessionUpdateResponseAudioTranscriptDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDelta, "SessionUpdateResponseAudioTimestampDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDone, "SessionUpdateResponseAudioTimestampDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDelta, "SessionUpdateResponseTextDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDone, "SessionUpdateResponseTextDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartAdded, "SessionUpdateResponseContentPartAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartDone, "SessionUpdateResponseContentPartDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemAdded, "SessionUpdateResponseOutputItemAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemDone, "SessionUpdateResponseOutputItemDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDelta, "SessionUpdateResponseFunctionCallArgumentsDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDone, "SessionUpdateResponseFunctionCallArgumentsDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDelta, "SessionUpdateResponseAnimationBlendshapeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDone, "SessionUpdateResponseAnimationBlendshapeDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseEmotionHypothesis, "SessionUpdateResponseEmotionHypothesis", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDelta, "SessionUpdateResponseAnimationVisemeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDone, "SessionUpdateResponseAnimationVisemeDone", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemCreated, "SessionUpdateConversationItemCreated", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemDeleted, "SessionUpdateConversationItemDeleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemRetrieved, "SessionUpdateConversationItemRetrieved", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemTruncated, 
"SessionUpdateConversationItemTruncated", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, "SessionUpdateConversationItemInputAudioTranscriptionCompleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, "SessionUpdateConversationItemInputAudioTranscriptionDelta", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, "SessionUpdateConversationItemInputAudioTranscriptionFailed", "csharp"); +@@clientName(VoiceLive.ServerEventError, "SessionUpdateError", "csharp"); +@@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); +@@clientName(VoiceLive.Phi4mmVoice, "LlmVoiceName", "csharp"); +@@clientName(VoiceLive.EOUDetection, "EouDetection", "csharp"); +@@clientName(VoiceLive.LLMVoice, "LlmVoice", "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/common/main.tsp b/specification/ai/data-plane/VoiceLive/common/main.tsp new file mode 100644 index 000000000000..5ad1d3a2bec6 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/main.tsp @@ -0,0 +1 @@ +import "./models.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp new file mode 100644 index 000000000000..3b1209a3301f --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -0,0 +1,44 @@ +// Cleaned TypeSpec file aligned with Python model definitions +// Removed models not defined or needed based on your Python code baseline +import "@typespec/http"; +import "@typespec/openapi"; +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +@doc("Error object returned in case of API failure.") +model VoiceLiveErrorDetails { + @doc("Error code, or null if unspecified.") + code?: string; + + @doc("Human-readable error message.") + message: string; + + @doc("Parameter name related to the error, if applicable.") + param?: string; + + @doc("Type or category of the error.") + type?: string; + + @doc("Event id of the error.") + event_id?: string; +} + +@error +@doc("Standard error response envelope.") +model ErrorResponse { + @doc("Error object returned in case of API failure.") + error: VoiceLiveErrorDetails; +} + +@doc("A single log probability entry for a token.") +model LogProbProperties { + @doc("The token that was used to generate the log probability.") + token: string; + + @doc("The log probability of the token.") + logprob: float32; + + @doc("The bytes that were used to generate the log probability.") + bytes: int32[]; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp new file mode 100644 index 000000000000..bbd4782ce234 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -0,0 +1,484 @@ +import "./custom/events.tsp"; +import "./custom/items.tsp"; +import "./custom/tools.tsp"; +import "@typespec/http"; +import "@typespec/openapi"; + + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +model RequestSession { + ...SessionBase; + `model`?: string; + modalities?: Modality[]; + animation?: Animation; + voice?: Voice; + instructions?: string; + input_audio?: InputAudio; + input_audio_sampling_rate?: int32 = 24000; + input_audio_format?: AudioFormat = AudioFormat.pcm16; + output_audio_format?: AudioFormat = AudioFormat.pcm16; + turn_detection?: TurnDetection | null; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: 
AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf"; +} + +model ResponseSession { + ...SessionBase; + id?: string; + `model`?: string; + modalities?: Modality[]; + instructions?: string; + animation?: Animation; + voice?: Voice; + input_audio?: InputAudio; + input_audio_format?: AudioFormat; + output_audio_format?: AudioFormat; + input_audio_sampling_rate?: int32; + turn_detection?: TurnDetection; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings | null; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf" | null; + agent?: AgentConfig; +} + +@doc("Supported OpenAI voice names (string enum).") +union OAIVoice { + string, + alloy: "alloy", + ash: "ash", + ballad: "ballad", + coral: "coral", + echo: "echo", + sage: "sage", + shimmer: "shimmer", + verse: "verse", +} + +@doc(""" +OpenAI voice configuration with explicit type field. + +This provides a unified interface for OpenAI voices, complementing the +existing string-based OAIVoice for backward compatibility. +""") +model OpenAIVoice { + type: "openai"; + name: OAIVoice; +} + +// --- Azure voices ---------------------------------------------------------- + +@doc("Base for Azure voice configurations.") +@discriminator("type") +model AzureVoice { + type: string; +} + +@doc("Azure custom voice configuration (preferred).") +model AzureCustomVoice extends AzureVoice { + type: "azure-custom"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + @minLength(1) @doc("Endpoint ID cannot be empty.") endpoint_id: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure standard voice configuration.") +model AzureStandardVoice extends AzureVoice { + type: "azure-standard"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure platform voice configuration (variant of standard).") +model AzurePlatformVoice extends AzureVoice { + type: "azure-platform"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure personal voice configuration.") +model AzurePersonalVoice extends AzureVoice { + type: "azure-personal"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; + + @doc("Underlying neural model to use for personal voice.") + `model`: "DragonLatestNeural" | 
"PhoenixLatestNeural" | "PhoenixV2Neural"; +} + +// --- Phi4mm voices --------------------------------------------------------- + +@doc("Voice identifier for Phi4mm voices.") +union Phi4mmVoice { + string, + "cosyvoice" +} + +@doc(""" +Voice configuration for LLM (Large Language Model) voices. +""") +model LLMVoice { + type: "llm"; + name: Phi4mmVoice; +} + +// --- Top-level Voice union ------------------------------------------------- + +@doc("Union of all supported voice configurations.") +union Voice { + OAIVoice, + OpenAIVoice, + AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice + Phi4mmVoice, + LLMVoice +} + +union AudioFormat { + string, + pcm16: "pcm16", + g711_ulaw: "g711_ulaw", + g711_alaw: "g711_alaw", +} + +union AudioInputTranscriptionModel { + string, + whisper_1: "whisper-1", +} + +@doc("Configuration for input audio transcription.") +model AudioInputTranscriptionSettings { + @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") + `model`: string | "whisper-1" | "azure-fast-transcription" | "s2s-ingraph" | "azure-speech"; + + @doc("The language code to use for transcription, if specified.") + language?: string; + + @doc("Whether transcription is enabled.") + enabled: boolean; + + @doc("Whether a custom model is being used.") + custom_model: boolean; +} + +union Modality { + string, + text: "text", + audio: "audio", + animation: "animation", + avatar: "avatar", +} + +@discriminator("model") +@doc("Top-level union for end-of-utterance (EOU) semantic detection configuration.") +model EOUDetection { + `model`: "semantic_detection_v1" | "semantic_detection_v1_en" | "semantic_detection_v1_multilingual"; +} + +@doc("Azure semantic end-of-utterance detection (default).") +model AzureSemanticDetection extends EOUDetection { + `model`: "semantic_detection_v1"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (English-optimized).") +model AzureSemanticDetectionEn extends EOUDetection { + `model`: "semantic_detection_v1_en"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (multilingual).") +model AzureSemanticDetectionMultilingual extends EOUDetection { + `model`: "semantic_detection_v1_multilingual"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@discriminator("type") +@doc("Top-level union for turn detection configuration.") +model TurnDetection { + type: + | "none" + | "server_vad" + | "azure_semantic_vad" + | "azure_semantic_vad_en" + | "server_sd" + | "azure_semantic_vad_multilingual"; +} + +@doc("Disables turn detection.") +model NoTurnDetection extends TurnDetection { + type: "none"; +} + +@doc("Base model for VAD-based turn detection.") +model ServerVad extends TurnDetection { + type: "server_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD, 
default variant).") +model AzureSemanticVad extends TurnDetection { + type: "azure_semantic_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD, English-only).") +model AzureSemanticVadEn extends TurnDetection { + type: "azure_semantic_vad_en"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (legacy `server_sd` alias).") +model AzureSemanticVadServer extends TurnDetection { + type: "server_sd"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD).") +model AzureMultilingualSemanticVad extends TurnDetection { + type: "azure_semantic_vad_multilingual"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Configuration for input audio noise reduction.") +model AudioNoiseReduction { + @doc("The type of noise reduction model.") + type: "azure_deep_noise_suppression"; +} + +@doc("Configuration for client audio input. Used to specify the audio model and optional phrase list.") +model InputAudio { + @doc("The name of the model to use for input audio (currently only 'azure-standard' is supported).") + `model`: "azure-standard"; + + @doc("Optional list of phrases to bias the speech recognition engine.") + phrase_list?: string[]; +} + +@doc("Echo cancellation configuration for server-side audio processing.") +model AudioEchoCancellation { + @doc("The type of echo cancellation model to use.") + type: "server_echo_cancellation"; +} + +@doc("Output timestamp types supported in audio response content.") +union AudioTimestampType { + string, + @doc("Timestamps per word in the output audio.") + word: "word", +} + +@doc("Specifies the types of animation data to output.") +union AnimationOutputType { + blendshapes: "blendshapes", + viseme_id: "viseme_id", + emotion: "emotion", +} + +@doc("Configuration for animation outputs including blendshapes, visemes, and emotion metadata.") +model Animation { + @doc("The name of the animation model to use.") + model_name?: string = "default"; + + @doc("Set of output data types requested from the animation system.") + outputs?: AnimationOutputType[] = #[AnimationOutputType.blendshapes]; + + @doc("Interval for emotion detection in milliseconds. 
If not set, emotion detection is disabled.") + emotion_detection_interval_ms?: int32; +} + +@doc("Configuration for avatar streaming and behavior during the session.") +model AvatarConfig { + @doc("Optional list of ICE servers to use for WebRTC connection establishment.") + ice_servers?: IceServer[]; + + @doc("The character name or ID used for the avatar.") + character: string; + + @doc("Optional avatar style, such as emotional tone or speaking style.") + style?: string; + + @doc("Indicates whether the avatar is customized or not.") + customized: boolean; + + @doc("Optional video configuration including resolution, bitrate, and codec.") + video?: VideoParams; +} + +@doc("ICE server configuration for WebRTC connection negotiation.") +model IceServer { + @doc("List of ICE server URLs (e.g., TURN or STUN endpoints).") + urls: url[]; + + @doc("Optional username used for authentication with the ICE server.") + username?: string; + + @doc("Optional credential (e.g., password or token) used for authentication.") + credential?: string; +} + +model AgentConfig { + type: "agent"; + name: string; + description?: string; + agent_id: string; + thread_id: string; +} + +@doc("Video streaming parameters for avatar.") +model VideoParams { + @doc("Bitrate in bits per second (e.g., 2000000 for 2 Mbps).") + bitrate?: int32 = 2000000; + + @doc("Codec to use for encoding. Currently only 'h264' is supported.") + codec?: "h264" = "h264"; + + @doc("Optional cropping settings for the video stream.") + crop?: VideoCrop; + + @doc("Optional resolution settings for the video stream.") + resolution?: VideoResolution; +} + +@doc("Defines a video crop rectangle using top-left and bottom-right coordinates.") +model VideoCrop { + @doc("Top-left corner of the crop region. Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + top_left: int32[]; + + @doc("Bottom-right corner of the crop region. Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + bottom_right: int32[]; +} + +@doc("Resolution of the video feed in pixels.") +model VideoResolution { + @doc("Width of the video in pixels. Must be greater than 0.") + width: int32; + + @doc("Height of the video in pixels. 
Must be greater than 0.") + height: int32; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp new file mode 100644 index 000000000000..ff4c86acd705 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -0,0 +1,36 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +union ContentPartType { + string, + input_text: "input_text", + input_audio: "input_audio", + text: "text", + audio: "audio", +} + +@discriminator("type") +model ContentPart { + type: ContentPartType; +} + +model RequestTextContentPart extends ContentPart { + type: ContentPartType.input_text; + text?: string; +} + +model RequestAudioContentPart extends ContentPart { + type: ContentPartType.input_audio; + transcript?: string; +} + +model ResponseTextContentPart extends ContentPart { + type: ContentPartType.text; + text?: string; +} + +model ResponseAudioContentPart extends ContentPart { + type: ContentPartType.audio; + transcript?: string; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp new file mode 100644 index 000000000000..5b38866ee7bf --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -0,0 +1,65 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +@doc("Client event types used in VoiceLive protocol.") +union ClientEventType { + string, + session_update: "session.update", + input_audio_buffer_append: "input_audio_buffer.append", + input_audio_buffer_commit: "input_audio_buffer.commit", + input_audio_buffer_clear: "input_audio_buffer.clear", + input_audio_turn_start: "input_audio.turn.start", + input_audio_turn_append: "input_audio.turn.append", + input_audio_turn_end: "input_audio.turn.end", + input_audio_turn_cancel: "input_audio.turn.cancel", + input_audio_clear: "input_audio.clear", + conversation_item_create: "conversation.item.create", + conversation_item_retrieve: "conversation.item.retrieve", + conversation_item_truncate: "conversation.item.truncate", + conversation_item_delete: "conversation.item.delete", + response_create: "response.create", + response_cancel: "response.cancel", + session_avatar_connect: "session.avatar.connect", +} + +@doc("Server event types used in VoiceLive protocol.") +union ServerEventType { + string, + error: "error", + session_avatar_connecting: "session.avatar.connecting", + session_created: "session.created", + session_updated: "session.updated", + conversation_item_input_audio_transcription_completed: "conversation.item.input_audio_transcription.completed", + conversation_item_input_audio_transcription_delta: "conversation.item.input_audio_transcription.delta", + conversation_item_input_audio_transcription_failed: "conversation.item.input_audio_transcription.failed", + conversation_item_created: "conversation.item.created", + conversation_item_retrieved: "conversation.item.retrieved", + conversation_item_truncated: "conversation.item.truncated", + conversation_item_deleted: "conversation.item.deleted", + input_audio_buffer_committed: "input_audio_buffer.committed", + input_audio_buffer_cleared: "input_audio_buffer.cleared", + input_audio_buffer_speech_started: "input_audio_buffer.speech_started", + input_audio_buffer_speech_stopped: "input_audio_buffer.speech_stopped", + response_created: "response.created", + response_done: "response.done", + response_output_item_added: "response.output_item.added", + 
response_output_item_done: "response.output_item.done", + response_content_part_added: "response.content_part.added", + response_content_part_done: "response.content_part.done", + response_text_delta: "response.text.delta", + response_text_done: "response.text.done", + response_audio_transcript_delta: "response.audio_transcript.delta", + response_audio_transcript_done: "response.audio_transcript.done", + response_audio_delta: "response.audio.delta", + response_audio_done: "response.audio.done", + response_animation_blendshapes_delta: "response.animation_blendshapes.delta", + response_animation_blendshapes_done: "response.animation_blendshapes.done", + response_emotion_hypothesis: "response.emotion_hypothesis", + response_audio_timestamp_delta: "response.audio_timestamp.delta", + response_audio_timestamp_done: "response.audio_timestamp.done", + response_animation_viseme_delta: "response.animation_viseme.delta", + response_animation_viseme_done: "response.animation_viseme.done", + response_function_call_arguments_delta: "response.function_call_arguments.delta", + response_function_call_arguments_done: "response.function_call_arguments.done", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp new file mode 100644 index 000000000000..4568b4651bf2 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -0,0 +1,207 @@ +import "./content_parts.tsp"; + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +union ItemType { + string, + message: "message", + function_call: "function_call", + function_call_output: "function_call_output", +} + +// Base for user content parts +@discriminator("type") +model UserContentPart { + type: string; +} + +// Variants +model InputTextContentPart extends UserContentPart { + type: "input_text"; + text: string; +} + +model InputAudioContentPart extends UserContentPart { + type: "input_audio"; + audio: string; + transcript?: string; +} + +@doc("Output text content part.") +model OutputTextContentPart { + type: "text"; + text: string; +} + +// Status enum +enum ItemParamStatus { + completed: "completed", + incomplete: "incomplete", +} + +@doc("Base for any response item; discriminated by `type`.") +@discriminator("type") +model ConversationRequestItem { + type: ItemType; + id?: string; +} + +// ----- Message Items ----- +@discriminator("role") +model MessageItem extends ConversationRequestItem { + type: ItemType.message; + role: string; + status?: ItemParamStatus; +} + +model SystemMessageItem extends MessageItem { + role: "system"; + content: InputTextContentPart[]; +} + +model UserMessageItem extends MessageItem { + role: "user"; + content: UserContentPart[]; +} + +model AssistantMessageItem extends MessageItem { + role: "assistant"; + content: OutputTextContentPart[]; +} + +// ----- Function Call Items ----- +model FunctionCallItem extends ConversationRequestItem { + type: ItemType.function_call; + name: string; + call_id: string; + arguments: string; + status?: ItemParamStatus; +} + +model FunctionCallOutputItem extends ConversationRequestItem { + type: ItemType.function_call_output; + call_id: string; + output: string; + status?: ItemParamStatus; +} + +@discriminator("type") +model ResponseItem { + // must stay here, required, broad type + type: ItemType; + id?: string; + object?: "realtime.item"; +} + +model ResponseMessageItem extends ResponseItem { + type: ItemType.message; + role: MessageRole; + content: ContentPart[]; + status: 
ResponseItemStatus; +} + +model ResponseFunctionCallItem + extends ResponseItem { + type: ItemType.function_call; + name: string; + call_id: string; + arguments: string; + status: ResponseItemStatus; +} + +model ResponseFunctionCallOutputItem + extends ResponseItem { + type: ItemType.function_call_output; + call_id: string; + output: string; +} + +union ResponseItemStatus { + string, + in_progress: "in_progress", + completed: "completed", + incomplete: "incomplete", +} + +union MessageRole { + string, + system: "system", + user: "user", + assistant: "assistant", +} + +@doc("Terminal status of a response.") +enum ResponseStatus { + completed: "completed", + cancelled: "cancelled", + failed: "failed", + incomplete: "incomplete", + in_progress: "in_progress", +} + +@doc("Base for all non-success response details.") +@discriminator("type") // or just @discriminator("type") if imported unqualified +model ResponseStatusDetails { + // Required discriminator key on the base; keep it as a broad string. + type: string; +} + +@doc("Details for a cancelled response.") +model ResponseCancelledDetails extends ResponseStatusDetails { + // Narrow the discriminator to a literal in each child: + type: "cancelled"; + reason: "turn_detected" | "client_cancelled"; +} + +@doc("Details for an incomplete response.") +model ResponseIncompleteDetails extends ResponseStatusDetails { + type: "incomplete"; + reason: "max_output_tokens" | "content_filter"; +} + +@doc("Details for a failed response.") +model ResponseFailedDetails extends ResponseStatusDetails { + type: "failed"; + error: unknown; +} + +@doc("Details of input token usage.") +model InputTokenDetails { + @doc("Number of cached tokens used in the input.") + cached_tokens: int32; + + @doc("Number of text tokens used in the input.") + text_tokens: int32; + + @doc("Number of audio tokens used in the input.") + audio_tokens: int32; +} + +@doc("Details of output token usage.") +model OutputTokenDetails { + @doc("Number of text tokens generated in the output.") + text_tokens: int32; + + @doc("Number of audio tokens generated in the output.") + audio_tokens: int32; +} + +@doc("Overall usage statistics for a response.") +model Usage { + @doc("Total number of tokens (input + output).") + total_tokens: int32; + + @doc("Number of input tokens.") + input_tokens: int32; + + @doc("Number of output tokens.") + output_tokens: int32; + + @doc("Detailed breakdown of input tokens.") + input_token_details: InputTokenDetails; + + @doc("Detailed breakdown of output tokens.") + output_token_details: OutputTokenDetails; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/tools.tsp b/specification/ai/data-plane/VoiceLive/custom/tools.tsp new file mode 100644 index 000000000000..b346377da821 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/tools.tsp @@ -0,0 +1,73 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +/** + * The supported tool type discriminators for voicelive tools. + * Currently, only 'function' tools are supported. + */ +union ToolType { + string, + function: "function", +} + +/** + * The base representation of a voicelive tool definition. + */ +@discriminator("type") +model Tool { + type: ToolType; +} + +/** + * The definition of a function tool as used by the voicelive endpoint. 
+ */ +model FunctionTool extends Tool { + type: ToolType.function; + name: string; + description?: string; + parameters?: unknown; +} + +/** + * The combined set of available representations for a voicelive tool_choice parameter, encompassing both string + * literal options like 'auto' as well as structured references to defined tools. + */ +union ToolChoice { + ToolChoiceLiteral, + ToolChoiceObject, +} + +/** + * The available set of mode-level, string literal tool_choice options for the voicelive endpoint. + */ +union ToolChoiceLiteral { + string, + + /** Specifies that the model should freely determine which tool or tools, if any, to call. */ + auto: "auto", + + /** Specifies that the model should call no tools whatsoever. */ + none: "none", + + /** Specifies that the model should call at least one tool. */ + required: "required", +} + +/** + * A base representation for a voicelive tool_choice selecting a named tool. + */ +@discriminator("type") +model ToolChoiceObject { + type: ToolType; +} + +/** + * The representation of a voicelive tool_choice selecting a named function tool. + */ +model ToolChoiceFunctionObject extends ToolChoiceObject { + type: ToolType.function; + function: { + name: string; + }; +} diff --git a/specification/ai/data-plane/VoiceLive/main.tsp b/specification/ai/data-plane/VoiceLive/main.tsp new file mode 100644 index 000000000000..144c4aeaff10 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/main.tsp @@ -0,0 +1 @@ +import "./operations.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp new file mode 100644 index 000000000000..5387eb699557 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -0,0 +1,1242 @@ +/* + * This file was automatically generated from an OpenAPI .yaml file. + * Edits made directly to this file will be lost. + */ + +import "./client.tsp"; +import "./common"; +import "./custom.tsp"; + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive client event. */ +@discriminator("type") +model ClientEvent { + /** The type of event. */ + type: ClientEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to update the session’s default configuration. + The client may send this event at any time to update any field, + except for `voice`. However, note that once a session has been + initialized with a particular `model`, it can’t be changed to + another model using `session.update`. + + When the server receives a `session.update`, it will respond + with a `session.updated` event showing the full, effective configuration. + Only the fields that are present are updated. To clear a field like + `instructions`, pass an empty string. + """) +model ClientEventSessionUpdate extends ClientEvent { + @doc(""" + The event type, must be `session.update`. + """) + type: ClientEventType.session_update; + + // Tool customization: apply enriched request-specific model + session: RequestSession; +} + +@doc(""" + Sent when the client connects and provides its SDP (Session Description Protocol) + for avatar-related media negotiation. 
+""") +model ClientEventSessionAvatarConnect extends ClientEvent { + @doc("The event type, must be 'session.avatar.connect'.") + type: ClientEventType.session_avatar_connect; + + @doc("The client's SDP offer.") + client_sdp: string; +} + +@doc(""" + Indicates the start of a new audio input turn. +""") +model ClientEventInputAudioTurnStart extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.start'.") + type: ClientEventType.input_audio_turn_start; + + @doc("Unique identifier for the input audio turn.") + turn_id: string; +} + +@doc(""" + Appends audio data to an ongoing input turn. +""") +model ClientEventInputAudioTurnAppend extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.append'.") + type: ClientEventType.input_audio_turn_append; + + @doc("The ID of the turn this audio is part of.") + turn_id: string; + + @doc("Base64-encoded audio chunk.") + audio: string; +} + +@doc(""" + Marks the end of an audio input turn. +""") +model ClientEventInputAudioTurnEnd extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.end'.") + type: ClientEventType.input_audio_turn_end; + + @doc("The ID of the audio turn being ended.") + turn_id: string; +} + +@doc(""" + Cancels an in-progress input audio turn. +""") +model ClientEventInputAudioTurnCancel extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.cancel'.") + type: ClientEventType.input_audio_turn_cancel; + + @doc("The ID of the turn to cancel.") + turn_id: string; +} + +@doc(""" + Clears all input audio currently being streamed. +""") +model ClientEventInputAudioClear extends ClientEvent { + @doc("The event type, must be 'input_audio.clear'.") + type: ClientEventType.input_audio_clear; +} + +// Tool customization: establish custom, enriched discriminated type hierarchy +/** The item to add to the conversation. */ +model ConversationItemBase { + /** Customized to enriched Conversation{Request,Response}Item models */ +} + +/** The response resource. */ +model Response { + /** The unique ID of the response. */ + id?: string; + + @doc(""" + The object type, must be `realtime.response`. + """) + object?: "realtime.response"; + + @doc(""" + The final status of the response. + One of: `completed`, `cancelled`, `failed`, `incomplete`, or `in_progress`. + """) + status?: ResponseStatus; + + /** Additional details about the status. */ + status_details?: ResponseStatusDetails; + + // Tool customization: apply enriched response-specific type + /** The list of output items generated by the response. */ + output?: ResponseItem[]; + + /** + * Usage statistics for the Response, this will correspond to billing. A + * VoiceLive API session will maintain a conversation context and append new + * Items to the Conversation, thus output from previous turns (text and + * audio tokens) will become the input for later turns. + */ + usage?: Usage; + + @doc(""" + Which conversation the response is added to, determined by the `conversation` + field in the `response.create` event. If `auto`, the response will be added to + the default conversation and the value of `conversation_id` will be an id like + `conv_1234`. If `none`, the response will not be added to any conversation and + the value of `conversation_id` will be `null`. If responses are being triggered + by server VAD, the response will be added to the default conversation, thus + the `conversation_id` will be an id like `conv_1234`. + """) + conversation_id?: string; + + @doc(""" + supported voice identifiers and configurations. 
+ """) + voice?: Voice; + + @doc(""" + The set of modalities the model used to respond. If there are multiple modalities, + the model will pick one, for example if `modalities` is `["text", "audio"]`, the model + could be responding in either text or audio. + """) + modalities?: ("text" | "audio")[]; + + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + output_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw"; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + /** + * Maximum number of output tokens for a single assistant response, + * inclusive of tool calls, that was used in this response. + */ + max_output_tokens?: int32 | "inf"; +} + +// Tool customization (apply_discriminator): apply discriminated type base +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit. In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide + * when to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum + * of 15 MiB, for example streaming smaller chunks from the client may allow the + * VAD to be more responsive. Unlike made other client events, the server will + * not send a confirmation response to this event. + */ +model ClientEventInputAudioBufferAppend extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.append`. + """) + type: ClientEventType.input_audio_buffer_append; + + // Tool customization: use encoded type for audio data + @doc(""" + Base64-encoded audio. This must be in the format specified by the + `input_audio_format` field in the session configuration. + """) + audio: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to commit the user input audio buffer, which will create a + new user message item in the conversation. This event will produce an error + if the input audio buffer is empty. When in Server VAD mode, the client does + not need to send this event, the server will commit the audio buffer + automatically. + + Committing the input audio buffer will trigger input audio transcription + (if enabled in session configuration), but it will not create a response + from the model. The server will respond with an `input_audio_buffer.committed` + event. + """) +model ClientEventInputAudioBufferCommit extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.commit`. + """) + type: ClientEventType.input_audio_buffer_commit; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to clear the audio bytes in the buffer. The server will + respond with an `input_audio_buffer.cleared` event. + """) +model ClientEventInputAudioBufferClear extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.clear`. + """) + type: ClientEventType.input_audio_buffer_clear; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Add a new Item to the Conversation's context, including messages, function + calls, and function call responses. This event can be used both to populate a + "history" of the conversation and to add new items mid-stream, but has the + current limitation that it cannot populate assistant audio messages. 
+ + If successful, the server will respond with a `conversation.item.created` + event; otherwise, an `error` event will be sent. + """) +model ClientEventConversationItemCreate extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.create`. + """) + type: ClientEventType.conversation_item_create; + + @doc(""" + Optional client-generated ID used to identify this event. + """) + event_id?: string; + + @doc(""" + The ID of the preceding item after which the new item will be inserted. + If not set, the new item will be appended to the end of the conversation. + If set to `root`, the new item will be added to the beginning of the conversation. + If set to an existing ID, it allows an item to be inserted mid-conversation. If the + ID cannot be found, an error will be returned and the item will not be added. + """) + previous_item_id?: string; + + // Tool customization: apply enriched item definition hierarchy + item?: ConversationRequestItem; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to truncate a previous assistant message’s audio. The server + will produce audio faster than real time, so this event is useful when the user + interrupts to truncate audio that has already been sent to the client but not + yet played. This will synchronize the server's understanding of the audio with + the client's playback. + + Truncating audio will delete the server-side text transcript to ensure there + is no text in the context that hasn't been heard by the user. + + If successful, the server will respond with a `conversation.item.truncated` + event. + """) +model ClientEventConversationItemTruncate extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.truncate`. + """) + type: ClientEventType.conversation_item_truncate; + + /** + * The ID of the assistant message item to truncate. Only assistant message + * items can be truncated. + */ + item_id: string; + + /** The index of the content part to truncate. Set this to 0. */ + content_index: int32; + + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If + * the audio_end_ms is greater than the actual audio duration, the server + * will respond with an error. + */ + audio_end_ms: int32; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to remove any item from the conversation + history. The server will respond with a `conversation.item.deleted` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model ClientEventConversationItemDelete extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.delete`. + """) + type: ClientEventType.conversation_item_delete; + + /** The ID of the item to delete. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + This event instructs the server to create a Response, which means triggering + model inference. When in Server VAD mode, the server will create Responses + automatically. + + A Response will include at least one Item, and may have two, in which case + the second will be a function call. These Items will be appended to the + conversation history. + + The server will respond with a `response.created` event, events for Items + and content created, and finally a `response.done` event to indicate the + Response is complete.
+ + The `response.create` event includes inference configuration like + `instructions` and `temperature`. These fields will override the Session's + configuration for this Response only. + """) +model ClientEventResponseCreate extends ClientEvent { + @doc(""" + The event type, must be `response.create`. + """) + type: ClientEventType.response_create; + + response?: ResponseCreateParams; + + @doc(""" + Additional instructions (system prompt) appended to the session's default instructions. Affects this response only. + """) + additional_instructions?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to cancel an in-progress response. The server will respond + with a `response.cancelled` event or an error if there is no response to + cancel. + """) +model ClientEventResponseCancel extends ClientEvent { + @doc(""" + The event type, must be `response.cancel`. + """) + type: ClientEventType.response_cancel; + + /** + * A specific response ID to cancel - if not provided, will cancel an + * in-progress response in the default conversation. + */ + response_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open; we + * recommend that implementers monitor and log error messages by default. + */ +model ServerEventError extends ServerEvent { + @doc(""" + The event type, must be `error`. + """) + type: ServerEventType.error; + + /** Details of the error. */ + error: { + /** The type of error (e.g., "invalid_request_error", "server_error"). */ + type: string; + + /** Error code, if any. */ + code?: string | null; + + /** A human-readable error message. */ + message: string; + + /** Parameter related to the error, if any. */ + param?: string | null; + + /** The event_id of the client event that caused the error, if applicable. */ + event_id?: string | null; + }; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a Session is created. Emitted automatically when a new + * connection is established as the first server event. This event will contain + * the default Session configuration. + */ +model ServerEventSessionCreated extends ServerEvent { + @doc(""" + The event type, must be `session.created`. + """) + type: ServerEventType.session_created; + + // Tool customization: apply enriched response-specific model + session: ResponseSession; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a session is updated with a `session.update` event, unless + there is an error. + """) +model ServerEventSessionUpdated extends ServerEvent { + @doc(""" + The event type, must be `session.updated`. + """) + type: ServerEventType.session_updated; + + // Tool customization: apply enriched response-specific model + session: ResponseSession; +} + +@doc("Sent when the server is in the process of establishing an avatar media connection and provides its SDP answer.") +model ServerEventSessionAvatarConnecting extends ServerEvent { + @doc("The event type, must be 'session.avatar.connecting'.") + type: ServerEventType.session_avatar_connecting; + + @doc("The server's SDP answer for the avatar connection.") + server_sdp: string; +} + +// Tool customization: establish base for enriched request/response split models +/** VoiceLive session object configuration.
*/ +model SessionBase {} + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive server event. */ +@discriminator("type") +model ServerEvent { + /** The type of event. */ + type: ServerEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an input audio buffer is committed, either by the client or + automatically in server VAD mode. The `item_id` property is the ID of the user + message item that will be created, thus a `conversation.item.created` event + will also be sent to the client. + """) +model ServerEventInputAudioBufferCommitted extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.committed`. + """) + type: ServerEventType.input_audio_buffer_committed; + + /** The ID of the preceding item after which the new item will be inserted. */ + previous_item_id?: string; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when the input audio buffer is cleared by the client with a + `input_audio_buffer.clear` event. + """) +model ServerEventInputAudioBufferCleared extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.cleared`. + """) + type: ServerEventType.input_audio_buffer_cleared; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Sent by the server when in `server_vad` mode to indicate that speech has been + detected in the audio buffer. This can happen any time audio is added to the + buffer (unless speech is already detected). The client may want to use this + event to interrupt audio playback or provide visual feedback to the user. + + The client should expect to receive a `input_audio_buffer.speech_stopped` event + when speech stops. The `item_id` property is the ID of the user message item + that will be created when speech stops and will also be included in the + `input_audio_buffer.speech_stopped` event (unless the client manually commits + the audio buffer during VAD activation). + """) +model ServerEventInputAudioBufferSpeechStarted extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_started`. + """) + type: ServerEventType.input_audio_buffer_speech_started; + + @doc(""" + Milliseconds from the start of all audio written to the buffer during the + session when speech was first detected. This will correspond to the + beginning of audio sent to the model, and thus includes the + `prefix_padding_ms` configured in the Session. + """) + audio_start_ms: int32; + + /** The ID of the user message item that will be created when speech stops. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned in `server_vad` mode when the server detects the end of speech in + the audio buffer. The server will also send an `conversation.item.created` + event with the user message item that is created from the audio buffer. + """) +model ServerEventInputAudioBufferSpeechStopped extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_stopped`. + """) + type: ServerEventType.input_audio_buffer_speech_stopped; + + @doc(""" + Milliseconds since the session started when speech stopped. This will + correspond to the end of audio sent to the model, and thus includes the + `min_silence_duration_ms` configured in the Session. 
+ """) + audio_end_ms: int32; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is created. There are several scenarios that produce this event: + - The server is generating a Response, which if successful will produce + either one or two Items, which will be of type `message` + (role `assistant`) or type `function_call`. + - The input audio buffer has been committed, either by the client or the + server (in `server_vad` mode). The server will take the content of the + input audio buffer and add it to a new user message Item. + - The client has sent a `conversation.item.create` event to add a new Item + to the Conversation. + """) +model ServerEventConversationItemCreated extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.created`. + """) + type: ServerEventType.conversation_item_created; + + /** + * The ID of the preceding item in the Conversation context, allows the + * client to understand the order of the conversation. + */ + previous_item_id?: string; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + This event is the output of audio transcription for user audio written to the + user audio buffer. Transcription begins when the input audio buffer is + committed by the client or server (in `server_vad` mode). Transcription runs + asynchronously with Response creation, so this event may come before or after + the Response events. + + VoiceLive API models accept audio natively, and thus input transcription is a + separate process run on a separate ASR (Automatic Speech Recognition) model. + The transcript may diverge somewhat from the model's interpretation, and + should be treated as a rough guide. + """) +model ServerEventConversationItemInputAudioTranscriptionCompleted + extends ServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.completed`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_completed; + + /** The ID of the user message item containing the audio. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** The transcribed text. */ + transcript: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when input audio transcription is configured, and a transcription + request for a user message failed. These events are separate from other + `error` events so that the client can identify the related Item. + """) +model ServerEventConversationItemInputAudioTranscriptionFailed + extends ServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.failed`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_failed; + + /** The ID of the user message item. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** Details of the transcription error. */ + error: VoiceLiveErrorDetails; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an earlier assistant audio message item is truncated by the + client with a `conversation.item.truncate` event. 
This event is used to + synchronize the server's understanding of the audio with the client's playback. + + This action will truncate the audio and remove the server-side text transcript + to ensure there is no text in the context that hasn't been heard by the user. + """) +model ServerEventConversationItemTruncated extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.truncated`. + """) + type: ServerEventType.conversation_item_truncated; + + /** The ID of the assistant message item that was truncated. */ + item_id: string; + + /** The index of the content part that was truncated. */ + content_index: int32; + + /** The duration up to which the audio was truncated, in milliseconds. */ + audio_end_ms: int32; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an item in the conversation is deleted by the client with a + `conversation.item.delete` event. This event is used to synchronize the + server's understanding of the conversation history with the client's view. + """) +model ServerEventConversationItemDeleted extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.deleted`. + """) + type: ServerEventType.conversation_item_deleted; + + /** The ID of the item that was deleted. */ + item_id: string; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a new Response is created. The first event of response creation, + where the response is in an initial state of `in_progress`. + """) +model ServerEventResponseCreated extends ServerEvent { + @doc(""" + The event type, must be `response.created`. + """) + type: ServerEventType.response_created; + + response: Response; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a Response is done streaming. Always emitted, no matter the + final state. The Response object included in the `response.done` event will + include all output Items in the Response but will omit the raw audio data. + """) +model ServerEventResponseDone extends ServerEvent { + @doc(""" + The event type, must be `response.done`. + """) + type: ServerEventType.response_done; + + response: Response; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when a new Item is created during Response generation. */ +model ServerEventResponseOutputItemAdded extends ServerEvent { + @doc(""" + The event type, must be `response.output_item.added`. + """) + type: ServerEventType.response_output_item_added; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. */ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +model ServerEventResponseOutputItemDone extends ServerEvent { + @doc(""" + The event type, must be `response.output_item.done`. + """) + type: ServerEventType.response_output_item_done; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. 
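+   * An illustrative (non-normative) ordering of the streaming events defined
+   * in this file for a single spoken reply:
+   *
+   *   response.created
+   *   response.output_item.added
+   *   response.content_part.added
+   *   response.audio.delta / response.audio_transcript.delta   (repeated)
+   *   response.audio.done / response.audio_transcript.done
+   *   response.content_part.done
+   *   response.output_item.done
+   *   response.done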
*/ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +model ServerEventResponseContentPartAdded extends ServerEvent { + @doc(""" + The event type, must be `response.content_part.added`. + """) + type: ServerEventType.response_content_part_added; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item to which the content part was added. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that was added. */ + part: ContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseContentPartDone extends ServerEvent { + @doc(""" + The event type, must be `response.content_part.done`. + """) + type: ServerEventType.response_content_part_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that is done. */ + part: ContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of a "text" content part is updated. */ +model ServerEventResponseTextDelta extends ServerEvent { + @doc(""" + The event type, must be `response.text.delta`. + """) + type: ServerEventType.response_text_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The text delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the text value of a "text" content part is done streaming. Also + * emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseTextDone extends ServerEvent { + @doc(""" + The event type, must be `response.text.done`. + """) + type: ServerEventType.response_text_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final text content. */ + text: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated transcription of audio output is updated. */ +model ServerEventResponseAudioTranscriptDelta extends ServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.delta`. 
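+
+    Transcript deltas are plain text fragments. An illustrative (non-normative)
+    reduction: `"Hel"` + `"lo the"` + `"re."` concatenate to `"Hello there."`,
+    and the full string is repeated in the matching
+    `response.audio_transcript.done` event.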
+ """) + type: ServerEventType.response_audio_transcript_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The transcript delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated transcription of audio output is done + * streaming. Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +model ServerEventResponseAudioTranscriptDone extends ServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.done`. + """) + type: ServerEventType.response_audio_transcript_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final transcript of the audio. */ + transcript: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated audio is updated. */ +model ServerEventResponseAudioDelta extends ServerEvent { + @doc(""" + The event type, must be `response.audio.delta`. + """) + type: ServerEventType.response_audio_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: use encoded type for audio data + /** Base64-encoded audio data delta. */ + @encode("base64") + delta: bytes; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated audio is done. Also emitted when a Response + * is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseAudioDone extends ServerEvent { + @doc(""" + The event type, must be `response.audio.done`. + """) + type: ServerEventType.response_audio_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; +} + +@doc(""" +Represents a delta update of blendshape animation frames for a specific output of a response. +""") +model ServerEventResponseAnimationBlendshapeDelta extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + frames: float32[][] | string; + frame_index: int32; +} + +@doc(""" +Indicates the completion of blendshape animation processing for a specific output of a response. +""") +model ServerEventResponseAnimationBlendshapeDone extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_done; + response_id: string; + item_id: string; + output_index: int32; +} + +@doc(""" +Represents an emotion hypothesis detected from response audio with multiple candidates. 
+""") +model ServerEventResponseEmotionHypothesis extends ServerEvent { + type: ServerEventType.response_emotion_hypothesis; + emotion: string; + candidates: EmotionCandidate[]; + audio_offset_ms: int32; + audio_duration_ms: int32; + response_id?: string; + item_id: string; +} + +@doc(""" +Represents a word-level audio timestamp delta for a response. +""") +model ServerEventResponseAudioTimestampDelta extends ServerEvent { + type: ServerEventType.response_audio_timestamp_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + audio_duration_ms: int32; + text: string; + timestamp_type: "word"; +} + +@doc(""" +Indicates completion of audio timestamp delivery for a response. +""") +model ServerEventResponseAudioTimestampDone extends ServerEvent { + type: ServerEventType.response_audio_timestamp_done; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; +} + +@doc(""" +Represents a viseme ID delta update for animation based on audio. +""") +model ServerEventResponseAnimationVisemeDelta extends ServerEvent { + type: ServerEventType.response_animation_viseme_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + viseme_id: int32; +} + +@doc(""" +Indicates completion of viseme animation delivery for a response. +""") +model ServerEventResponseAnimationVisemeDone extends ServerEvent { + type: ServerEventType.response_animation_viseme_done; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; +} + +/** Create a new VoiceLive response with these parameters */ +model ResponseCreateParams { + @doc(""" + Whether to commit the response to the conversation. Defaults to true. + """) + commit?: boolean = true; + + @doc(""" + Whether to cancel any ongoing generation before starting this one. Defaults to true. + """) + cancel_previous?: boolean = true; + + @doc(""" + Input items to append to the conversation context before generating a response. + """) + append_input_items?: ConversationRequestItem[]; + + @doc(""" + Input items to be used as the context for this response. + An empty array clears previous context. + """) + input_items?: ConversationRequestItem[]; + + // Tool customization: Apply reusable modality representation + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: Modality[]; + + @doc(""" + The default system instructions (i.e. system message) prepended to model + calls. This field allows the client to guide the model on desired + responses. The model can be instructed on response content and format, + (e.g. "be extremely succinct", "act friendly", "here are examples of good + responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed + to be followed by the model, but they provide guidance to the model on the + desired behavior. + + Note that the server sets default instructions which will be used if this + field is not set and are visible in the `session.created` event at the + start of the session. + """) + instructions?: string; + + @doc(""" + supported voice identifiers and configurations. + """) + voice?: Voice; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. 
+ """) + output_audio_format?: AudioFormat = AudioFormat.pcm16; + + // Tool customization: use enriched tool definition + /** Tools (functions) available to the model. */ + tools?: Tool[]; + + @doc(""" + How the model chooses tools. Options are `auto`, `none`, `required`, or + specify a function, like `{"type": "function", "function": {"name": "my_function"}}`. + """) + tool_choice?: string; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + // Tool customization: Address (observed as of 2025-01-31) spec issue with 'max_response_output_tokens' + @doc(""" + Maximum number of output tokens for a single assistant response, + inclusive of tool calls. Provide an integer between 1 and 4096 to + limit output tokens, or `inf` for the maximum available tokens for a + given model. Defaults to `inf`. + """) + max_output_tokens?: int32 | "inf"; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. + The server will respond with a `conversation.item.retrieved` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model ClientEventConversationItemRetrieve extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.retrieve`. + """) + type: ClientEventType.conversation_item_retrieve; + + /** The ID of the item to retrieve. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of an input audio transcription content part is updated. */ +model ServerEventConversationItemInputAudioTranscriptionDelta + extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.input_audio_transcription.delta`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_delta; + + /** The ID of the item. */ + item_id: string; + + /** The index of the content part in the item's content array. */ + content_index?: int32; + + /** The text delta. */ + delta?: string; + + /** The log probabilities of the transcription. */ + logprobs?: LogProbProperties[] | null; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is retrieved with `conversation.item.retrieve`. + """) +model ServerEventConversationItemRetrieved extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.retrieved`. + """) + type: ServerEventType.conversation_item_retrieved; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; + event_id?: string; +} + +model EmotionCandidate { + emotion: string; + confidence: float32; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated function call arguments are updated. */ +model ServerEventResponseFunctionCallArgumentsDelta extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.delta`. + """) + type: ServerEventType.response_function_call_arguments_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. 
*/ + call_id: string; + + /** The arguments delta as a JSON string. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseFunctionCallArgumentsDone extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.done`. + """) + type: ServerEventType.response_function_call_arguments_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. */ + call_id: string; + + /** The final arguments as a JSON string. */ + arguments: string; + + /** The name of the function call. */ + name: string; +} diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp new file mode 100644 index 000000000000..7beb943459a7 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -0,0 +1,18 @@ +import "./common"; +import "./models.tsp"; +import "@azure-tools/typespec-azure-core"; + +using TypeSpec.Http; +using TypeSpec.OpenAPI; +using TypeSpec.Versioning; + +namespace VoiceLive; + +alias VoiceLiveBetaHeader = { + @header("VoiceLive-Beta") voiceLiveBeta: "voicelive=v1"; +}; + +enum Versions { + @useDependency(Azure.Core.Versions.v1_0_Preview_2) + v2025_05_01_preview: "2025-05-01-preview", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp new file mode 100644 index 000000000000..f54e1bcf5709 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -0,0 +1,93 @@ +import "@typespec/http"; +import "@typespec/versioning"; +import "@azure-tools/typespec-azure-core"; + +import "../models.tsp"; +import "../operations.tsp"; + +using TypeSpec.Http; +using TypeSpec.Versioning; +using Azure.Core; + +@service(#{ title: "VoiceLive"}) +@versioned(VoiceLive.Versions) +@useAuth( + ApiKeyAuth | AadOauth2Auth<[ + "https://cognitiveservices.azure.com/.default" + ]> +) +@server( + "{endpoint}/voice-agent/realtime", + "VoiceLive Endpoint", + { + @doc(""" + Azure AI VoiceLive endpoint. 
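+
+      An illustrative (non-normative) value is
+      `https://{your-resource-name}.cognitiveservices.azure.com`; combined with
+      the route above, clients open a WebSocket connection against
+      `{endpoint}/voice-agent/realtime`.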
+ """) + endpoint: url, + } +) + + +namespace VoiceLive; + +// Union of all client events that can be passed into `force_models` +alias ForceModelClientEvent = + ClientEventSessionUpdate | + ClientEventInputAudioBufferAppend | + ClientEventInputAudioBufferCommit | + ClientEventInputAudioBufferClear | + ClientEventInputAudioTurnStart | + ClientEventInputAudioTurnAppend | + ClientEventInputAudioTurnEnd | + ClientEventInputAudioTurnCancel | + ClientEventInputAudioClear | + ClientEventConversationItemCreate | + ClientEventConversationItemRetrieve | + ClientEventConversationItemTruncate | + ClientEventConversationItemDelete | + ClientEventResponseCreate | + ClientEventResponseCancel | + ClientEventSessionAvatarConnect; + +// Union of all server events that can be returned from `force_models` +alias ForceModelServerEvent = + ServerEventSessionAvatarConnecting | + ServerEventSessionCreated | + ServerEventSessionUpdated | + ServerEventError | + ServerEventResponseTextDelta | + ServerEventResponseAudioDelta | + ServerEventConversationItemCreated | + ServerEventConversationItemDeleted | + ServerEventConversationItemRetrieved | + ServerEventConversationItemTruncated | + ServerEventConversationItemInputAudioTranscriptionCompleted | + ServerEventConversationItemInputAudioTranscriptionDelta | + ServerEventConversationItemInputAudioTranscriptionFailed | + ServerEventInputAudioBufferCommitted | + ServerEventInputAudioBufferCleared | + ServerEventInputAudioBufferSpeechStarted | + ServerEventInputAudioBufferSpeechStopped | + ServerEventResponseCreated | + ServerEventResponseDone | + ServerEventResponseOutputItemAdded | + ServerEventResponseOutputItemDone | + ServerEventResponseContentPartAdded | + ServerEventResponseContentPartDone | + ServerEventResponseTextDone | + ServerEventResponseAudioTranscriptDelta | + ServerEventResponseAudioTranscriptDone | + ServerEventResponseAudioDone | + ServerEventResponseFunctionCallArgumentsDelta | + ServerEventResponseFunctionCallArgumentsDone | + ServerEventResponseAnimationBlendshapeDelta | + ServerEventResponseAnimationBlendshapeDone | + ServerEventResponseEmotionHypothesis | + ServerEventResponseAudioTimestampDelta | + ServerEventResponseAudioTimestampDone | + ServerEventResponseAnimationVisemeDelta | + ServerEventResponseAnimationVisemeDone; + + +// Operation definition +op force_models(event: ForceModelClientEvent): ForceModelServerEvent; \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml new file mode 100644 index 000000000000..a96a6cd87f9a --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -0,0 +1,55 @@ +parameters: + "service-dir": + default: "sdk/ai" + "dependencies": + default: "" +emit: + - "@azure-tools/typespec-autorest" +linter: + extends: + - "@azure-tools/typespec-azure-rulesets/data-plane" +options: + "@azure-tools/typespec-autorest": + azure-resource-provider-folder: "data-plane" + emit-lro-options: "none" + emitter-output-dir: "{project-root}/.." 
+ output-file: "{azure-resource-provider-folder}/{service-name}/{version-status}/{version}/widgets.json" + "@azure-tools/typespec-python": + package-dir: "azure-ai-voicelive" + namespace: "azure.ai.voicelive" + generate-test: false + generate-sample: false + flavor: azure + package-name: "azure-ai-voicelive" + "@azure-tools/typespec-csharp": + package-dir: "Azure.AI.VoiceLive" + clear-output-folder: true + model-namespace: false + namespace: "{package-dir}" + flavor: azure + emitterPackageJsonPath: eng/azure-typespec-http-client-csharp-emitter-package.json + "@azure-typespec/http-client-csharp": + namespace: Azure.AI.VoiceLive + model-namespace: false + "@azure-tools/typespec-ts": + package-dir: "azure-ai-voicelive" + package-details: + name: "@azure-rest/ai-voicelive" + flavor: azure + "@azure-tools/typespec-java": + package-dir: "azure-ai-voicelive" + namespace: com.azure.ai.voicelive + flavor: azure + "@azure-tools/typespec-go": + module: "github.com/Azure/azure-sdk-for-go/{service-dir}/{package-dir}" + service-dir: "sdk/ai" + package-dir: "voicelive" + module-version: "0.0.1" + generate-fakes: true + inject-spans: true + single-client: true + slice-elements-byval: true + flavor: azure + "@azure-tools/typespec-client-generator-cli": + additionalDirectories: + - "specification/ai/data-plane/VoiceLive/" \ No newline at end of file