diff --git a/specification/ai/data-plane/VoiceLive/client.tsp b/specification/ai/data-plane/VoiceLive/client.tsp new file mode 100644 index 000000000000..fdc1798f0a8c --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/client.tsp @@ -0,0 +1,137 @@ +import "@azure-tools/typespec-client-generator-core"; +import "./servers/websocket.tsp"; + +using Azure.ClientGenerator.Core; + +@@access(VoiceLive.force_models, Access.internal, "python"); +@@access(VoiceLive.force_models, Access.internal, "csharp"); + +@@access(VoiceLive.ClientEventSessionUpdate, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.public, "python"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCreate, Access.public, "python"); +@@access(VoiceLive.ClientEventResponseCancel, Access.public, "python"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.public, "python"); + + +@@access(VoiceLive.ServerEventSessionAvatarConnecting, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDelta, Access.public, "python"); +@@access(VoiceLive.ResponseItem, Access.public, "python"); +@@access(VoiceLive.Response, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemDeleted, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemRetrieved, Access.public, "python"); +@@access(VoiceLive.ServerEventConversationItemTruncated, Access.public, "python"); +@@access(VoiceLive.ServerEventError, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCleared, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferCommitted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStarted, Access.public, "python"); +@@access(VoiceLive.ServerEventInputAudioBufferSpeechStopped, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseAudioTranscriptDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseContentPartDone, 
Access.public, "python"); +@@access(VoiceLive.ServerEventResponseCreated, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemAdded, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseOutputItemDone, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDelta, Access.public, "python"); +@@access(VoiceLive.ServerEventResponseTextDone, Access.public, "python"); +@@access(VoiceLive.ServerEventSessionUpdated, Access.public, "python"); + +@@access(VoiceLive.ServerEvent, Access.public, "csharp"); +@@access(VoiceLive.ServerEventSessionCreated, Access.public, "csharp"); +@@access(VoiceLive.ClientEvent, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemDelete, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemRetrieve, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventConversationItemTruncate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioBufferCommit, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioClear, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnAppend, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnCancel, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnEnd, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventInputAudioTurnStart, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCancel, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventResponseCreate, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionAvatarConnect, Access.internal, "csharp"); +@@access(VoiceLive.ClientEventSessionUpdate, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.bottom_right, Access.internal, "csharp"); +@@access(VoiceLive.VideoCrop.top_left, Access.internal, "csharp"); +@@access(VoiceLive.Response.voice, Access.internal, "csharp"); + +@@clientName(VoiceLive.Modality, "InputModality", "csharp"); +@@clientName(VoiceLive.Animation, "AnimationOptions", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolInvocation", "csharp"); +@@clientName(VoiceLive.AzureCustomVoice.custom_lexicon_url, "CustomLexiconUri", "csharp"); +@@clientName(VoiceLive.IceServer.urls, "Uris", "csharp"); +@@clientName(VoiceLive.VideoCrop.bottom_right, "BottomRightInternal", "csharp"); +@@clientName(VoiceLive.VideoCrop.top_left, "TopLeftInternal", "csharp"); +@@clientName(VoiceLive.Response, "VoiceLiveResponse", "csharp"); +@@clientName(VoiceLive.Response.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.ResponseSession.voice, "VoiceInternal", "csharp"); +@@clientName(VoiceLive.Response.modalities, "ModalitiesInternal", "csharp"); +@@clientName(VoiceLive.AgentConfig, "RespondingAgentConfig", "csharp"); +@@clientName(VoiceLive.FunctionTool, "VoiceLiveFunctionDefinition", "csharp"); +@@clientName(VoiceLive.Tool, "VoiceLiveToolDefinition", "csharp"); +@@clientName(VoiceLive.ContentPart, "VoiceLiveContentPart", "csharp"); +@@clientName(VoiceLive.MessageRole, "ResponseMessageRole", "csharp"); +@@clientName(VoiceLive.ResponseStatus, "VoiceLiveResponseStatus", "csharp"); +@@clientName(VoiceLive.ServerEvent, "SessionUpdate", "csharp"); 
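+// ServerEvent-derived types are surfaced to C# callers under a "SessionUpdate" prefix; the renames below apply that convention (ServerEventXxx -> SessionUpdateXxx), mirroring the base rename above.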
+@@clientName(VoiceLive.ServerEventSessionCreated, "SessionUpdateSessionCreated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionUpdated, "SessionUpdateSessionUpdated", "csharp"); +@@clientName(VoiceLive.ServerEventSessionAvatarConnecting, "SessionUpdateSessionAvatarConnecting", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStarted, "SessionUpdateInputAudioBufferSpeechStarted", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferSpeechStopped, "SessionUpdateInputAudioBufferSpeechStopped", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCleared, "SessionUpdateInputAudioBufferCleared", "csharp"); +@@clientName(VoiceLive.ServerEventInputAudioBufferCommitted, "SessionUpdateInputAudioBufferCommitted", "csharp"); +@@clientName(VoiceLive.ServerEventResponseCreated, "SessionUpdateResponseCreated", "csharp"); +@@clientName(VoiceLive.ServerEventResponseDone, "SessionUpdateResponseDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDelta, "SessionUpdateResponseAudioDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioDone, "SessionUpdateResponseAudioDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDelta, "SessionUpdateResponseAudioTranscriptDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTranscriptDone, "SessionUpdateResponseAudioTranscriptDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDelta, "SessionUpdateResponseAudioTimestampDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAudioTimestampDone, "SessionUpdateResponseAudioTimestampDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDelta, "SessionUpdateResponseTextDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseTextDone, "SessionUpdateResponseTextDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartAdded, "SessionUpdateResponseContentPartAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseContentPartDone, "SessionUpdateResponseContentPartDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemAdded, "SessionUpdateResponseOutputItemAdded", "csharp"); +@@clientName(VoiceLive.ServerEventResponseOutputItemDone, "SessionUpdateResponseOutputItemDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDelta, "SessionUpdateResponseFunctionCallArgumentsDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseFunctionCallArgumentsDone, "SessionUpdateResponseFunctionCallArgumentsDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDelta, "SessionUpdateResponseAnimationBlendshapeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationBlendshapeDone, "SessionUpdateResponseAnimationBlendshapeDone", "csharp"); +@@clientName(VoiceLive.ServerEventResponseEmotionHypothesis, "SessionUpdateResponseEmotionHypothesis", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDelta, "SessionUpdateResponseAnimationVisemeDelta", "csharp"); +@@clientName(VoiceLive.ServerEventResponseAnimationVisemeDone, "SessionUpdateResponseAnimationVisemeDone", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemCreated, "SessionUpdateConversationItemCreated", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemDeleted, "SessionUpdateConversationItemDeleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemRetrieved, "SessionUpdateConversationItemRetrieved", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemTruncated, 
"SessionUpdateConversationItemTruncated", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionCompleted, "SessionUpdateConversationItemInputAudioTranscriptionCompleted", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionDelta, "SessionUpdateConversationItemInputAudioTranscriptionDelta", "csharp"); +@@clientName(VoiceLive.ServerEventConversationItemInputAudioTranscriptionFailed, "SessionUpdateConversationItemInputAudioTranscriptionFailed", "csharp"); +@@clientName(VoiceLive.ServerEventError, "SessionUpdateError", "csharp"); +@@clientName(VoiceLive.Usage, "ResponseTokenStatistics", "csharp"); +@@clientName(VoiceLive.Phi4mmVoice, "LlmVoiceName", "csharp"); +@@clientName(VoiceLive.EOUDetection, "EouDetection", "csharp"); +@@clientName(VoiceLive.LLMVoice, "LlmVoice", "csharp"); diff --git a/specification/ai/data-plane/VoiceLive/common/main.tsp b/specification/ai/data-plane/VoiceLive/common/main.tsp new file mode 100644 index 000000000000..5ad1d3a2bec6 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/main.tsp @@ -0,0 +1 @@ +import "./models.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/common/models.tsp b/specification/ai/data-plane/VoiceLive/common/models.tsp new file mode 100644 index 000000000000..3b1209a3301f --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/common/models.tsp @@ -0,0 +1,44 @@ +// Cleaned TypeSpec file aligned with Python model definitions +// Removed models not defined or needed based on your Python code baseline +import "@typespec/http"; +import "@typespec/openapi"; +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +@doc("Error object returned in case of API failure.") +model VoiceLiveErrorDetails { + @doc("Error code, or null if unspecified.") + code?: string; + + @doc("Human-readable error message.") + message: string; + + @doc("Parameter name related to the error, if applicable.") + param?: string; + + @doc("Type or category of the error.") + type?: string; + + @doc("Event id of the error.") + event_id?: string; +} + +@error +@doc("Standard error response envelope.") +model ErrorResponse { + @doc("Error object returned in case of API failure.") + error: VoiceLiveErrorDetails; +} + +@doc("A single log probability entry for a token.") +model LogProbProperties { + @doc("The token that was used to generate the log probability.") + token: string; + + @doc("The log probability of the token.") + logprob: float32; + + @doc("The bytes that were used to generate the log probability.") + bytes: int32[]; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom.tsp b/specification/ai/data-plane/VoiceLive/custom.tsp new file mode 100644 index 000000000000..bbd4782ce234 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom.tsp @@ -0,0 +1,484 @@ +import "./custom/events.tsp"; +import "./custom/items.tsp"; +import "./custom/tools.tsp"; +import "@typespec/http"; +import "@typespec/openapi"; + + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +model RequestSession { + ...SessionBase; + `model`?: string; + modalities?: Modality[]; + animation?: Animation; + voice?: Voice; + instructions?: string; + input_audio?: InputAudio; + input_audio_sampling_rate?: int32 = 24000; + input_audio_format?: AudioFormat = AudioFormat.pcm16; + output_audio_format?: AudioFormat = AudioFormat.pcm16; + turn_detection?: TurnDetection | null; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: 
AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf"; +} + +model ResponseSession { + ...SessionBase; + id?: string; + `model`?: string; + modalities?: Modality[]; + instructions?: string; + animation?: Animation; + voice?: Voice; + input_audio?: InputAudio; + input_audio_format?: AudioFormat; + output_audio_format?: AudioFormat; + input_audio_sampling_rate?: int32; + turn_detection?: TurnDetection; + input_audio_noise_reduction?: AudioNoiseReduction; + input_audio_echo_cancellation?: AudioEchoCancellation; + avatar?: AvatarConfig; + input_audio_transcription?: AudioInputTranscriptionSettings | null; + output_audio_timestamp_types?: AudioTimestampType[]; + tools?: Tool[]; + tool_choice?: ToolChoice; + temperature?: float32; + max_response_output_tokens?: int32 | "inf" | null; + agent?: AgentConfig; +} + +@doc("Supported OpenAI voice names (string enum).") +union OAIVoice { + string, + alloy: "alloy", + ash: "ash", + ballad: "ballad", + coral: "coral", + echo: "echo", + sage: "sage", + shimmer: "shimmer", + verse: "verse", +} + +@doc(""" +OpenAI voice configuration with explicit type field. + +This provides a unified interface for OpenAI voices, complementing the +existing string-based OAIVoice for backward compatibility. +""") +model OpenAIVoice { + type: "openai"; + name: OAIVoice; +} + +// --- Azure voices ---------------------------------------------------------- + +@doc("Base for Azure voice configurations.") +@discriminator("type") +model AzureVoice { + type: string; +} + +@doc("Azure custom voice configuration (preferred).") +model AzureCustomVoice extends AzureVoice { + type: "azure-custom"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + @minLength(1) @doc("Endpoint ID cannot be empty.") endpoint_id: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure standard voice configuration.") +model AzureStandardVoice extends AzureVoice { + type: "azure-standard"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure platform voice configuration (variant of standard).") +model AzurePlatformVoice extends AzureVoice { + type: "azure-platform"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") temperature?: float32; + + custom_lexicon_url?: string; + prefer_locales?: string[]; + locale?: string; + style?: string; + pitch?: string; + rate?: string; + volume?: string; +} + +@doc("Azure personal voice configuration.") +model AzurePersonalVoice extends AzureVoice { + type: "azure-personal"; + + @minLength(1) @doc("Voice name cannot be empty.") name: string; + + @minValue(0) @maxValue(1) + @doc("Temperature must be between 0.0 and 1.0.") + temperature?: float32; + + @doc("Underlying neural model to use for personal voice.") + `model`: "DragonLatestNeural" | 
"PhoenixLatestNeural" | "PhoenixV2Neural"; +} + +// --- Phi4mm voices --------------------------------------------------------- + +@doc("Voice identifier for Phi4mm voices.") +union Phi4mmVoice { + string, + "cosyvoice" +} + +@doc(""" +Voice configuration for LLM (Large Language Model) voices. +""") +model LLMVoice { + type: "llm"; + name: Phi4mmVoice; +} + +// --- Top-level Voice union ------------------------------------------------- + +@doc("Union of all supported voice configurations.") +union Voice { + OAIVoice, + OpenAIVoice, + AzureVoice, // includes AzureCustomVoice, CustomVoice, AzurePersonalVoice + Phi4mmVoice, + LLMVoice +} + +union AudioFormat { + string, + pcm16: "pcm16", + g711_ulaw: "g711_ulaw", + g711_alaw: "g711_alaw", +} + +union AudioInputTranscriptionModel { + string, + whisper_1: "whisper-1", +} + +@doc("Configuration for input audio transcription.") +model AudioInputTranscriptionSettings { + @doc("The model used for transcription. E.g., 'whisper-1', 'azure-fast-transcription', 's2s-ingraph'.") + `model`: string | "whisper-1" | "azure-fast-transcription" | "s2s-ingraph" | "azure-speech"; + + @doc("The language code to use for transcription, if specified.") + language?: string; + + @doc("Whether transcription is enabled.") + enabled: boolean; + + @doc("Whether a custom model is being used.") + custom_model: boolean; +} + +union Modality { + string, + text: "text", + audio: "audio", + animation: "animation", + avatar: "avatar", +} + +@discriminator("model") +@doc("Top-level union for end-of-utterance (EOU) semantic detection configuration.") +model EOUDetection { + `model`: "semantic_detection_v1" | "semantic_detection_v1_en" | "semantic_detection_v1_multilingual"; +} + +@doc("Azure semantic end-of-utterance detection (default).") +model AzureSemanticDetection extends EOUDetection { + `model`: "semantic_detection_v1"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (English-optimized).") +model AzureSemanticDetectionEn extends EOUDetection { + `model`: "semantic_detection_v1_en"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@doc("Azure semantic end-of-utterance detection (multilingual).") +model AzureSemanticDetectionMultilingual extends EOUDetection { + `model`: "semantic_detection_v1_multilingual"; + threshold?: float32; + timeout?: float32; + secondary_threshold?: float32; + secondary_timeout?: float32; + disable_rules?: boolean; + // developer options + sr_boost?: float32; + extra_imend_check?: boolean; +} + +@discriminator("type") +@doc("Top-level union for turn detection configuration.") +model TurnDetection { + type: + | "none" + | "server_vad" + | "azure_semantic_vad" + | "azure_semantic_vad_en" + | "server_sd" + | "azure_semantic_vad_multilingual"; +} + +@doc("Disables turn detection.") +model NoTurnDetection extends TurnDetection { + type: "none"; +} + +@doc("Base model for VAD-based turn detection.") +model ServerVad extends TurnDetection { + type: "server_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD, 
default variant).") +model AzureSemanticVad extends TurnDetection { + type: "azure_semantic_vad"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD, English-only).") +model AzureSemanticVadEn extends TurnDetection { + type: "azure_semantic_vad_en"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (legacy `server_sd` alias).") +model AzureSemanticVadServer extends TurnDetection { + type: "server_sd"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Server Speech Detection (Azure semantic VAD).") +model AzureMultilingualSemanticVad extends TurnDetection { + type: "azure_semantic_vad_multilingual"; + threshold?: float32; + prefix_padding_ms?: int32; + silence_duration_ms?: int32; + end_of_utterance_detection?: EOUDetection; + neg_threshold?: float32; + speech_duration_ms?: int32; + window_size?: int32; + distinct_ci_phones?: int32; + require_vowel?: boolean; + remove_filler_words?: boolean = false; + languages?: string[]; + auto_truncate?: boolean = false; +} + +@doc("Configuration for input audio noise reduction.") +model AudioNoiseReduction { + @doc("The type of noise reduction model.") + type: "azure_deep_noise_suppression"; +} + +@doc("Configuration for client audio input. Used to specify the audio model and optional phrase list.") +model InputAudio { + @doc("The name of the model to use for input audio (currently only 'azure-standard' is supported).") + `model`: "azure-standard"; + + @doc("Optional list of phrases to bias the speech recognition engine.") + phrase_list?: string[]; +} + +@doc("Echo cancellation configuration for server-side audio processing.") +model AudioEchoCancellation { + @doc("The type of echo cancellation model to use.") + type: "server_echo_cancellation"; +} + +@doc("Output timestamp types supported in audio response content.") +union AudioTimestampType { + string, + @doc("Timestamps per word in the output audio.") + word: "word", +} + +@doc("Specifies the types of animation data to output.") +union AnimationOutputType { + blendshapes: "blendshapes", + viseme_id: "viseme_id", + emotion: "emotion", +} + +@doc("Configuration for animation outputs including blendshapes, visemes, and emotion metadata.") +model Animation { + @doc("The name of the animation model to use.") + model_name?: string = "default"; + + @doc("Set of output data types requested from the animation system.") + outputs?: AnimationOutputType[] = #[AnimationOutputType.blendshapes]; + + @doc("Interval for emotion detection in milliseconds. 
If not set, emotion detection is disabled.") + emotion_detection_interval_ms?: int32; +} + +@doc("Configuration for avatar streaming and behavior during the session.") +model AvatarConfig { + @doc("Optional list of ICE servers to use for WebRTC connection establishment.") + ice_servers?: IceServer[]; + + @doc("The character name or ID used for the avatar.") + character: string; + + @doc("Optional avatar style, such as emotional tone or speaking style.") + style?: string; + + @doc("Indicates whether the avatar is customized or not.") + customized: boolean; + + @doc("Optional video configuration including resolution, bitrate, and codec.") + video?: VideoParams; +} + +@doc("ICE server configuration for WebRTC connection negotiation.") +model IceServer { + @doc("List of ICE server URLs (e.g., TURN or STUN endpoints).") + urls: url[]; + + @doc("Optional username used for authentication with the ICE server.") + username?: string; + + @doc("Optional credential (e.g., password or token) used for authentication.") + credential?: string; +} + +model AgentConfig { + type: "agent"; + name: string; + description?: string; + agent_id: string; + thread_id: string; +} + +@doc("Video streaming parameters for avatar.") +model VideoParams { + @doc("Bitrate in bits per second (e.g., 2000000 for 2 Mbps).") + bitrate?: int32 = 2000000; + + @doc("Codec to use for encoding. Currently only 'h264' is supported.") + codec?: "h264" = "h264"; + + @doc("Optional cropping settings for the video stream.") + crop?: VideoCrop; + + @doc("Optional resolution settings for the video stream.") + resolution?: VideoResolution; +} + +@doc("Defines a video crop rectangle using top-left and bottom-right coordinates.") +model VideoCrop { + @doc("Top-left corner of the crop region. Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + top_left: int32[]; + + @doc("Bottom-right corner of the crop region. Array of [x, y], must be non-negative integers.") + @minItems(2) + @maxItems(2) + bottom_right: int32[]; +} + +@doc("Resolution of the video feed in pixels.") +model VideoResolution { + @doc("Width of the video in pixels. Must be greater than 0.") + width: int32; + + @doc("Height of the video in pixels. 
Must be greater than 0.") + height: int32; +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp new file mode 100644 index 000000000000..ff4c86acd705 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/content_parts.tsp @@ -0,0 +1,36 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +union ContentPartType { + string, + input_text: "input_text", + input_audio: "input_audio", + text: "text", + audio: "audio", +} + +@discriminator("type") +model ContentPart { + type: ContentPartType; +} + +model RequestTextContentPart extends ContentPart { + type: ContentPartType.input_text; + text?: string; +} + +model RequestAudioContentPart extends ContentPart { + type: ContentPartType.input_audio; + transcript?: string; +} + +model ResponseTextContentPart extends ContentPart { + type: ContentPartType.text; + text?: string; +} + +model ResponseAudioContentPart extends ContentPart { + type: ContentPartType.audio; + transcript?: string; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/events.tsp b/specification/ai/data-plane/VoiceLive/custom/events.tsp new file mode 100644 index 000000000000..5b38866ee7bf --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/events.tsp @@ -0,0 +1,65 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +@doc("Client event types used in VoiceLive protocol.") +union ClientEventType { + string, + session_update: "session.update", + input_audio_buffer_append: "input_audio_buffer.append", + input_audio_buffer_commit: "input_audio_buffer.commit", + input_audio_buffer_clear: "input_audio_buffer.clear", + input_audio_turn_start: "input_audio.turn.start", + input_audio_turn_append: "input_audio.turn.append", + input_audio_turn_end: "input_audio.turn.end", + input_audio_turn_cancel: "input_audio.turn.cancel", + input_audio_clear: "input_audio.clear", + conversation_item_create: "conversation.item.create", + conversation_item_retrieve: "conversation.item.retrieve", + conversation_item_truncate: "conversation.item.truncate", + conversation_item_delete: "conversation.item.delete", + response_create: "response.create", + response_cancel: "response.cancel", + session_avatar_connect: "session.avatar.connect", +} + +@doc("Server event types used in VoiceLive protocol.") +union ServerEventType { + string, + error: "error", + session_avatar_connecting: "session.avatar.connecting", + session_created: "session.created", + session_updated: "session.updated", + conversation_item_input_audio_transcription_completed: "conversation.item.input_audio_transcription.completed", + conversation_item_input_audio_transcription_delta: "conversation.item.input_audio_transcription.delta", + conversation_item_input_audio_transcription_failed: "conversation.item.input_audio_transcription.failed", + conversation_item_created: "conversation.item.created", + conversation_item_retrieved: "conversation.item.retrieved", + conversation_item_truncated: "conversation.item.truncated", + conversation_item_deleted: "conversation.item.deleted", + input_audio_buffer_committed: "input_audio_buffer.committed", + input_audio_buffer_cleared: "input_audio_buffer.cleared", + input_audio_buffer_speech_started: "input_audio_buffer.speech_started", + input_audio_buffer_speech_stopped: "input_audio_buffer.speech_stopped", + response_created: "response.created", + response_done: "response.done", + response_output_item_added: "response.output_item.added", + 
response_output_item_done: "response.output_item.done", + response_content_part_added: "response.content_part.added", + response_content_part_done: "response.content_part.done", + response_text_delta: "response.text.delta", + response_text_done: "response.text.done", + response_audio_transcript_delta: "response.audio_transcript.delta", + response_audio_transcript_done: "response.audio_transcript.done", + response_audio_delta: "response.audio.delta", + response_audio_done: "response.audio.done", + response_animation_blendshapes_delta: "response.animation_blendshapes.delta", + response_animation_blendshapes_done: "response.animation_blendshapes.done", + response_emotion_hypothesis: "response.emotion_hypothesis", + response_audio_timestamp_delta: "response.audio_timestamp.delta", + response_audio_timestamp_done: "response.audio_timestamp.done", + response_animation_viseme_delta: "response.animation_viseme.delta", + response_animation_viseme_done: "response.animation_viseme.done", + response_function_call_arguments_delta: "response.function_call_arguments.delta", + response_function_call_arguments_done: "response.function_call_arguments.done", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/custom/items.tsp b/specification/ai/data-plane/VoiceLive/custom/items.tsp new file mode 100644 index 000000000000..4568b4651bf2 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/items.tsp @@ -0,0 +1,207 @@ +import "./content_parts.tsp"; + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +union ItemType { + string, + message: "message", + function_call: "function_call", + function_call_output: "function_call_output", +} + +// Base for user content parts +@discriminator("type") +model UserContentPart { + type: string; +} + +// Variants +model InputTextContentPart extends UserContentPart { + type: "input_text"; + text: string; +} + +model InputAudioContentPart extends UserContentPart { + type: "input_audio"; + audio: string; + transcript?: string; +} + +@doc("Output text content part.") +model OutputTextContentPart { + type: "text"; + text: string; +} + +// Status enum +enum ItemParamStatus { + completed: "completed", + incomplete: "incomplete", +} + +@doc("Base for any response item; discriminated by `type`.") +@discriminator("type") +model ConversationRequestItem { + type: ItemType; + id?: string; +} + +// ----- Message Items ----- +@discriminator("role") +model MessageItem extends ConversationRequestItem { + type: ItemType.message; + role: string; + status?: ItemParamStatus; +} + +model SystemMessageItem extends MessageItem { + role: "system"; + content: InputTextContentPart[]; +} + +model UserMessageItem extends MessageItem { + role: "user"; + content: UserContentPart[]; +} + +model AssistantMessageItem extends MessageItem { + role: "assistant"; + content: OutputTextContentPart[]; +} + +// ----- Function Call Items ----- +model FunctionCallItem extends ConversationRequestItem { + type: ItemType.function_call; + name: string; + call_id: string; + arguments: string; + status?: ItemParamStatus; +} + +model FunctionCallOutputItem extends ConversationRequestItem { + type: ItemType.function_call_output; + call_id: string; + output: string; + status?: ItemParamStatus; +} + +@discriminator("type") +model ResponseItem { + // must stay here, required, broad type + type: ItemType; + id?: string; + object?: "realtime.item"; +} + +model ResponseMessageItem extends ResponseItem { + type: ItemType.message; + role: MessageRole; + content: ContentPart[]; + status: 
ResponseItemStatus; +} + +model ResponseFunctionCallItem + extends ResponseItem { + type: ItemType.function_call; + name: string; + call_id: string; + arguments: string; + status: ResponseItemStatus; +} + +model ResponseFunctionCallOutputItem + extends ResponseItem { + type: ItemType.function_call_output; + call_id: string; + output: string; +} + +union ResponseItemStatus { + string, + in_progress: "in_progress", + completed: "completed", + incomplete: "incomplete", +} + +union MessageRole { + string, + system: "system", + user: "user", + assistant: "assistant", +} + +@doc("Terminal status of a response.") +enum ResponseStatus { + completed: "completed", + cancelled: "cancelled", + failed: "failed", + incomplete: "incomplete", + in_progress: "in_progress", +} + +@doc("Base for all non-success response details.") +@discriminator("type") // or just @discriminator("type") if imported unqualified +model ResponseStatusDetails { + // Required discriminator key on the base; keep it as a broad string. + type: string; +} + +@doc("Details for a cancelled response.") +model ResponseCancelledDetails extends ResponseStatusDetails { + // Narrow the discriminator to a literal in each child: + type: "cancelled"; + reason: "turn_detected" | "client_cancelled"; +} + +@doc("Details for an incomplete response.") +model ResponseIncompleteDetails extends ResponseStatusDetails { + type: "incomplete"; + reason: "max_output_tokens" | "content_filter"; +} + +@doc("Details for a failed response.") +model ResponseFailedDetails extends ResponseStatusDetails { + type: "failed"; + error: unknown; +} + +@doc("Details of input token usage.") +model InputTokenDetails { + @doc("Number of cached tokens used in the input.") + cached_tokens: int32; + + @doc("Number of text tokens used in the input.") + text_tokens: int32; + + @doc("Number of audio tokens used in the input.") + audio_tokens: int32; +} + +@doc("Details of output token usage.") +model OutputTokenDetails { + @doc("Number of text tokens generated in the output.") + text_tokens: int32; + + @doc("Number of audio tokens generated in the output.") + audio_tokens: int32; +} + +@doc("Overall usage statistics for a response.") +model Usage { + @doc("Total number of tokens (input + output).") + total_tokens: int32; + + @doc("Number of input tokens.") + input_tokens: int32; + + @doc("Number of output tokens.") + output_tokens: int32; + + @doc("Detailed breakdown of input tokens.") + input_token_details: InputTokenDetails; + + @doc("Detailed breakdown of output tokens.") + output_token_details: OutputTokenDetails; +} diff --git a/specification/ai/data-plane/VoiceLive/custom/tools.tsp b/specification/ai/data-plane/VoiceLive/custom/tools.tsp new file mode 100644 index 000000000000..b346377da821 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/custom/tools.tsp @@ -0,0 +1,73 @@ +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +/** + * The supported tool type discriminators for voicelive tools. + * Currently, only 'function' tools are supported. + */ +union ToolType { + string, + function: "function", +} + +/** + * The base representation of a voicelive tool definition. + */ +@discriminator("type") +model Tool { + type: ToolType; +} + +/** + * The definition of a function tool as used by the voicelive endpoint. 
+ */ +model FunctionTool extends Tool { + type: ToolType.function; + name: string; + description?: string; + parameters?: unknown; +} + +/** + * The combined set of available representations for a voicelive tool_choice parameter, encompassing both string + * literal options like 'auto' as well as structured references to defined tools. + */ +union ToolChoice { + ToolChoiceLiteral, + ToolChoiceObject, +} + +/** + * The available set of mode-level, string literal tool_choice options for the voicelive endpoint. + */ +union ToolChoiceLiteral { + string, + + /** Specifies that the model should freely determine which tool or tools, if any, to call. */ + auto: "auto", + + /** Specifies that the model should call no tools whatsoever. */ + none: "none", + + /** Specifies that the model should call at least one tool. */ + required: "required", +} + +/** + * A base representation for a voicelive tool_choice selecting a named tool. + */ +@discriminator("type") +model ToolChoiceObject { + type: ToolType; +} + +/** + * The representation of a voicelive tool_choice selecting a named function tool. + */ +model ToolChoiceFunctionObject extends ToolChoiceObject { + type: ToolType.function; + function: { + name: string; + }; +} diff --git a/specification/ai/data-plane/VoiceLive/main.tsp b/specification/ai/data-plane/VoiceLive/main.tsp new file mode 100644 index 000000000000..144c4aeaff10 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/main.tsp @@ -0,0 +1 @@ +import "./operations.tsp"; diff --git a/specification/ai/data-plane/VoiceLive/models.tsp b/specification/ai/data-plane/VoiceLive/models.tsp new file mode 100644 index 000000000000..5387eb699557 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/models.tsp @@ -0,0 +1,1242 @@ +/* + * This file was automatically generated from an OpenAPI .yaml file. + * Edits made directly to this file will be lost. + */ + +import "./client.tsp"; +import "./common"; +import "./custom.tsp"; + +using TypeSpec.OpenAPI; + +namespace VoiceLive; + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive client event. */ +@discriminator("type") +model ClientEvent { + /** The type of event. */ + type: ClientEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to update the session’s default configuration. + The client may send this event at any time to update any field, + except for `voice`. However, note that once a session has been + initialized with a particular `model`, it can’t be changed to + another model using `session.update`. + + When the server receives a `session.update`, it will respond + with a `session.updated` event showing the full, effective configuration. + Only the fields that are present are updated. To clear a field like + `instructions`, pass an empty string. + """) +model ClientEventSessionUpdate extends ClientEvent { + @doc(""" + The event type, must be `session.update`. + """) + type: ClientEventType.session_update; + + // Tool customization: apply enriched request-specific model + session: RequestSession; +} + +@doc(""" + Sent when the client connects and provides its SDP (Session Description Protocol) + for avatar-related media negotiation. 
+""") +model ClientEventSessionAvatarConnect extends ClientEvent { + @doc("The event type, must be 'session.avatar.connect'.") + type: ClientEventType.session_avatar_connect; + + @doc("The client's SDP offer.") + client_sdp: string; +} + +@doc(""" + Indicates the start of a new audio input turn. +""") +model ClientEventInputAudioTurnStart extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.start'.") + type: ClientEventType.input_audio_turn_start; + + @doc("Unique identifier for the input audio turn.") + turn_id: string; +} + +@doc(""" + Appends audio data to an ongoing input turn. +""") +model ClientEventInputAudioTurnAppend extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.append'.") + type: ClientEventType.input_audio_turn_append; + + @doc("The ID of the turn this audio is part of.") + turn_id: string; + + @doc("Base64-encoded audio chunk.") + audio: string; +} + +@doc(""" + Marks the end of an audio input turn. +""") +model ClientEventInputAudioTurnEnd extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.end'.") + type: ClientEventType.input_audio_turn_end; + + @doc("The ID of the audio turn being ended.") + turn_id: string; +} + +@doc(""" + Cancels an in-progress input audio turn. +""") +model ClientEventInputAudioTurnCancel extends ClientEvent { + @doc("The event type, must be 'input_audio.turn.cancel'.") + type: ClientEventType.input_audio_turn_cancel; + + @doc("The ID of the turn to cancel.") + turn_id: string; +} + +@doc(""" + Clears all input audio currently being streamed. +""") +model ClientEventInputAudioClear extends ClientEvent { + @doc("The event type, must be 'input_audio.clear'.") + type: ClientEventType.input_audio_clear; +} + +// Tool customization: establish custom, enriched discriminated type hierarchy +/** The item to add to the conversation. */ +model ConversationItemBase { + /** Customized to enriched Conversation{Request,Response}Item models */ +} + +/** The response resource. */ +model Response { + /** The unique ID of the response. */ + id?: string; + + @doc(""" + The object type, must be `realtime.response`. + """) + object?: "realtime.response"; + + @doc(""" + The final status of the response. + One of: `completed`, `cancelled`, `failed`, `incomplete`, or `in_progress`. + """) + status?: ResponseStatus; + + /** Additional details about the status. */ + status_details?: ResponseStatusDetails; + + // Tool customization: apply enriched response-specific type + /** The list of output items generated by the response. */ + output?: ResponseItem[]; + + /** + * Usage statistics for the Response, this will correspond to billing. A + * VoiceLive API session will maintain a conversation context and append new + * Items to the Conversation, thus output from previous turns (text and + * audio tokens) will become the input for later turns. + */ + usage?: Usage; + + @doc(""" + Which conversation the response is added to, determined by the `conversation` + field in the `response.create` event. If `auto`, the response will be added to + the default conversation and the value of `conversation_id` will be an id like + `conv_1234`. If `none`, the response will not be added to any conversation and + the value of `conversation_id` will be `null`. If responses are being triggered + by server VAD, the response will be added to the default conversation, thus + the `conversation_id` will be an id like `conv_1234`. + """) + conversation_id?: string; + + @doc(""" + supported voice identifiers and configurations. 
+ """) + voice?: Voice; + + @doc(""" + The set of modalities the model used to respond. If there are multiple modalities, + the model will pick one, for example if `modalities` is `["text", "audio"]`, the model + could be responding in either text or audio. + """) + modalities?: ("text" | "audio")[]; + + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + """) + output_audio_format?: "pcm16" | "g711_ulaw" | "g711_alaw"; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + /** + * Maximum number of output tokens for a single assistant response, + * inclusive of tool calls, that was used in this response. + */ + max_output_tokens?: int32 | "inf"; +} + +// Tool customization (apply_discriminator): apply discriminated type base +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit. In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide + * when to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum + * of 15 MiB, for example streaming smaller chunks from the client may allow the + * VAD to be more responsive. Unlike made other client events, the server will + * not send a confirmation response to this event. + */ +model ClientEventInputAudioBufferAppend extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.append`. + """) + type: ClientEventType.input_audio_buffer_append; + + // Tool customization: use encoded type for audio data + @doc(""" + Base64-encoded audio. This must be in the format specified by the + `input_audio_format` field in the session configuration. + """) + audio: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to commit the user input audio buffer, which will create a + new user message item in the conversation. This event will produce an error + if the input audio buffer is empty. When in Server VAD mode, the client does + not need to send this event, the server will commit the audio buffer + automatically. + + Committing the input audio buffer will trigger input audio transcription + (if enabled in session configuration), but it will not create a response + from the model. The server will respond with an `input_audio_buffer.committed` + event. + """) +model ClientEventInputAudioBufferCommit extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.commit`. + """) + type: ClientEventType.input_audio_buffer_commit; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to clear the audio bytes in the buffer. The server will + respond with an `input_audio_buffer.cleared` event. + """) +model ClientEventInputAudioBufferClear extends ClientEvent { + @doc(""" + The event type, must be `input_audio_buffer.clear`. + """) + type: ClientEventType.input_audio_buffer_clear; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Add a new Item to the Conversation's context, including messages, function + calls, and function call responses. This event can be used both to populate a + "history" of the conversation and to add new items mid-stream, but has the + current limitation that it cannot populate assistant audio messages. 
+ + If successful, the server will respond with a `conversation.item.created` + event; otherwise, an `error` event will be sent. + """) +model ClientEventConversationItemCreate extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.create`. + """) + type: ClientEventType.conversation_item_create; + + @doc(""" + Optional client-generated ID used to identify this event. + """) + event_id?: string; + + @doc(""" + The ID of the preceding item after which the new item will be inserted. + If not set, the new item will be appended to the end of the conversation. + If set to `root`, the new item will be added to the beginning of the conversation. + If set to an existing ID, it allows an item to be inserted mid-conversation. If the + ID cannot be found, an error will be returned and the item will not be added. + """) + previous_item_id?: string; + + // Tool customization: apply enriched item definition hierarchy + item?: ConversationRequestItem; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to truncate a previous assistant message’s audio. The server + will produce audio faster than real time, so this event is useful when the user + interrupts to truncate audio that has already been sent to the client but not + yet played. This will synchronize the server's understanding of the audio with + the client's playback. + + Truncating audio will delete the server-side text transcript to ensure there + is no text in the context that hasn't been heard by the user. + + If successful, the server will respond with a `conversation.item.truncated` + event. + """) +model ClientEventConversationItemTruncate extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.truncate`. + """) + type: ClientEventType.conversation_item_truncate; + + /** + * The ID of the assistant message item to truncate. Only assistant message + * items can be truncated. + */ + item_id: string; + + /** The index of the content part to truncate. Set this to 0. */ + content_index: int32; + + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If + * the audio_end_ms is greater than the actual audio duration, the server + * will respond with an error. + */ + audio_end_ms: int32; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to remove any item from the conversation + history. The server will respond with a `conversation.item.deleted` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model ClientEventConversationItemDelete extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.delete`. + """) + type: ClientEventType.conversation_item_delete; + + /** The ID of the item to delete. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + This event instructs the server to create a Response, which means triggering + model inference. When in Server VAD mode, the server will create Responses + automatically. + + A Response will include at least one Item, and may have two, in which case + the second will be a function call. These Items will be appended to the + conversation history. + + The server will respond with a `response.created` event, events for Items + and content created, and finally a `response.done` event to indicate the + Response is complete.
+ + The `response.create` event includes inference configuration like + `instructions` and `temperature`. These fields will override the Session's + configuration for this Response only. + """) +model ClientEventResponseCreate extends ClientEvent { + @doc(""" + The event type, must be `response.create`. + """) + type: ClientEventType.response_create; + + response?: ResponseCreateParams; + + @doc(""" + Additional instructions (system prompt) appended to the session's default instructions. Affects this response only. + """) + additional_instructions?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event to cancel an in-progress response. The server will respond + with a `response.cancelled` event or an error if there is no response to + cancel. + """) +model ClientEventResponseCancel extends ClientEvent { + @doc(""" + The event type, must be `response.cancel`. + """) + type: ClientEventType.response_cancel; + + /** + * A specific response ID to cancel - if not provided, will cancel an + * in-progress response in the default conversation. + */ + response_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open; we + * recommend that implementers monitor and log error messages by default. + */ +model ServerEventError extends ServerEvent { + @doc(""" + The event type, must be `error`. + """) + type: ServerEventType.error; + + /** Details of the error. */ + error: { + /** The type of error (e.g., "invalid_request_error", "server_error"). */ + type: string; + + /** Error code, if any. */ + code?: string | null; + + /** A human-readable error message. */ + message: string; + + /** Parameter related to the error, if any. */ + param?: string | null; + + /** The event_id of the client event that caused the error, if applicable. */ + event_id?: string | null; + }; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a Session is created. Emitted automatically when a new + * connection is established as the first server event. This event will contain + * the default Session configuration. + */ +model ServerEventSessionCreated extends ServerEvent { + @doc(""" + The event type, must be `session.created`. + """) + type: ServerEventType.session_created; + + // Tool customization: apply enriched response-specific model + session: ResponseSession; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a session is updated with a `session.update` event, unless + there is an error. + """) +model ServerEventSessionUpdated extends ServerEvent { + @doc(""" + The event type, must be `session.updated`. + """) + type: ServerEventType.session_updated; + + // Tool customization: apply enriched response-specific model + session: ResponseSession; +} + +@doc("Sent when the server is in the process of establishing an avatar media connection and provides its SDP answer.") +model ServerEventSessionAvatarConnecting extends ServerEvent { + @doc("The event type, must be 'session.avatar.connecting'.") + type: ServerEventType.session_avatar_connecting; + + @doc("The server's SDP answer for the avatar connection.") + server_sdp: string; +} + +// Tool customization: establish base for enriched request/response split models +/** VoiceLive session object configuration.
*/ +model SessionBase {} + +// Tool customization: Adjust union to be a discriminated type base +/** A voicelive server event. */ +@discriminator("type") +model ServerEvent { + /** The type of event. */ + type: ServerEventType; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an input audio buffer is committed, either by the client or + automatically in server VAD mode. The `item_id` property is the ID of the user + message item that will be created, thus a `conversation.item.created` event + will also be sent to the client. + """) +model ServerEventInputAudioBufferCommitted extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.committed`. + """) + type: ServerEventType.input_audio_buffer_committed; + + /** The ID of the preceding item after which the new item will be inserted. */ + previous_item_id?: string; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when the input audio buffer is cleared by the client with a + `input_audio_buffer.clear` event. + """) +model ServerEventInputAudioBufferCleared extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.cleared`. + """) + type: ServerEventType.input_audio_buffer_cleared; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Sent by the server when in `server_vad` mode to indicate that speech has been + detected in the audio buffer. This can happen any time audio is added to the + buffer (unless speech is already detected). The client may want to use this + event to interrupt audio playback or provide visual feedback to the user. + + The client should expect to receive a `input_audio_buffer.speech_stopped` event + when speech stops. The `item_id` property is the ID of the user message item + that will be created when speech stops and will also be included in the + `input_audio_buffer.speech_stopped` event (unless the client manually commits + the audio buffer during VAD activation). + """) +model ServerEventInputAudioBufferSpeechStarted extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_started`. + """) + type: ServerEventType.input_audio_buffer_speech_started; + + @doc(""" + Milliseconds from the start of all audio written to the buffer during the + session when speech was first detected. This will correspond to the + beginning of audio sent to the model, and thus includes the + `prefix_padding_ms` configured in the Session. + """) + audio_start_ms: int32; + + /** The ID of the user message item that will be created when speech stops. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned in `server_vad` mode when the server detects the end of speech in + the audio buffer. The server will also send an `conversation.item.created` + event with the user message item that is created from the audio buffer. + """) +model ServerEventInputAudioBufferSpeechStopped extends ServerEvent { + @doc(""" + The event type, must be `input_audio_buffer.speech_stopped`. + """) + type: ServerEventType.input_audio_buffer_speech_stopped; + + @doc(""" + Milliseconds since the session started when speech stopped. This will + correspond to the end of audio sent to the model, and thus includes the + `min_silence_duration_ms` configured in the Session. 
+ """) + audio_end_ms: int32; + + /** The ID of the user message item that will be created. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is created. There are several scenarios that produce this event: + - The server is generating a Response, which if successful will produce + either one or two Items, which will be of type `message` + (role `assistant`) or type `function_call`. + - The input audio buffer has been committed, either by the client or the + server (in `server_vad` mode). The server will take the content of the + input audio buffer and add it to a new user message Item. + - The client has sent a `conversation.item.create` event to add a new Item + to the Conversation. + """) +model ServerEventConversationItemCreated extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.created`. + """) + type: ServerEventType.conversation_item_created; + + /** + * The ID of the preceding item in the Conversation context, allows the + * client to understand the order of the conversation. + */ + previous_item_id?: string; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + This event is the output of audio transcription for user audio written to the + user audio buffer. Transcription begins when the input audio buffer is + committed by the client or server (in `server_vad` mode). Transcription runs + asynchronously with Response creation, so this event may come before or after + the Response events. + + VoiceLive API models accept audio natively, and thus input transcription is a + separate process run on a separate ASR (Automatic Speech Recognition) model. + The transcript may diverge somewhat from the model's interpretation, and + should be treated as a rough guide. + """) +model ServerEventConversationItemInputAudioTranscriptionCompleted + extends ServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.completed`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_completed; + + /** The ID of the user message item containing the audio. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** The transcribed text. */ + transcript: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when input audio transcription is configured, and a transcription + request for a user message failed. These events are separate from other + `error` events so that the client can identify the related Item. + """) +model ServerEventConversationItemInputAudioTranscriptionFailed + extends ServerEvent { + @doc(""" + The event type, must be + `conversation.item.input_audio_transcription.failed`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_failed; + + /** The ID of the user message item. */ + item_id: string; + + /** The index of the content part containing the audio. */ + content_index: int32; + + /** Details of the transcription error. */ + error: VoiceLiveErrorDetails; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an earlier assistant audio message item is truncated by the + client with a `conversation.item.truncate` event. 
This event is used to + synchronize the server's understanding of the audio with the client's playback. + + This action will truncate the audio and remove the server-side text transcript + to ensure there is no text in the context that hasn't been heard by the user. + """) +model ServerEventConversationItemTruncated extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.truncated`. + """) + type: ServerEventType.conversation_item_truncated; + + /** The ID of the assistant message item that was truncated. */ + item_id: string; + + /** The index of the content part that was truncated. */ + content_index: int32; + + /** The duration up to which the audio was truncated, in milliseconds. */ + audio_end_ms: int32; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when an item in the conversation is deleted by the client with a + `conversation.item.delete` event. This event is used to synchronize the + server's understanding of the conversation history with the client's view. + """) +model ServerEventConversationItemDeleted extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.deleted`. + """) + type: ServerEventType.conversation_item_deleted; + + /** The ID of the item that was deleted. */ + item_id: string; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a new Response is created. The first event of response creation, + where the response is in an initial state of `in_progress`. + """) +model ServerEventResponseCreated extends ServerEvent { + @doc(""" + The event type, must be `response.created`. + """) + type: ServerEventType.response_created; + + response: Response; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a Response is done streaming. Always emitted, no matter the + final state. The Response object included in the `response.done` event will + include all output Items in the Response but will omit the raw audio data. + """) +model ServerEventResponseDone extends ServerEvent { + @doc(""" + The event type, must be `response.done`. + """) + type: ServerEventType.response_done; + + response: Response; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when a new Item is created during Response generation. */ +model ServerEventResponseOutputItemAdded extends ServerEvent { + @doc(""" + The event type, must be `response.output_item.added`. + """) + type: ServerEventType.response_output_item_added; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. */ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +model ServerEventResponseOutputItemDone extends ServerEvent { + @doc(""" + The event type, must be `response.output_item.done`. + """) + type: ServerEventType.response_output_item_done; + + /** The ID of the Response to which the item belongs. */ + response_id: string; + + /** The index of the output item in the Response. 
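+   * An illustrative (non-normative) ordering of the streaming events defined
+   * in this file for a single spoken reply:
+   *
+   *   response.created
+   *   response.output_item.added
+   *   response.content_part.added
+   *   response.audio.delta / response.audio_transcript.delta   (repeated)
+   *   response.audio.done / response.audio_transcript.done
+   *   response.content_part.done
+   *   response.output_item.done
+   *   response.done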
*/ + output_index: int32; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +model ServerEventResponseContentPartAdded extends ServerEvent { + @doc(""" + The event type, must be `response.content_part.added`. + """) + type: ServerEventType.response_content_part_added; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item to which the content part was added. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that was added. */ + part: ContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseContentPartDone extends ServerEvent { + @doc(""" + The event type, must be `response.content_part.done`. + """) + type: ServerEventType.response_content_part_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: apply detailed content part type + /** The content part that is done. */ + part: ContentPart; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of a "text" content part is updated. */ +model ServerEventResponseTextDelta extends ServerEvent { + @doc(""" + The event type, must be `response.text.delta`. + """) + type: ServerEventType.response_text_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The text delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the text value of a "text" content part is done streaming. Also + * emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseTextDone extends ServerEvent { + @doc(""" + The event type, must be `response.text.done`. + """) + type: ServerEventType.response_text_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final text content. */ + text: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated transcription of audio output is updated. */ +model ServerEventResponseAudioTranscriptDelta extends ServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.delta`. 
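+
+    Transcript deltas are plain text fragments. An illustrative (non-normative)
+    reduction: `"Hel"` + `"lo the"` + `"re."` concatenate to `"Hello there."`,
+    and the full string is repeated in the matching
+    `response.audio_transcript.done` event.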
+ """) + type: ServerEventType.response_audio_transcript_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The transcript delta. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated transcription of audio output is done + * streaming. Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +model ServerEventResponseAudioTranscriptDone extends ServerEvent { + @doc(""" + The event type, must be `response.audio_transcript.done`. + """) + type: ServerEventType.response_audio_transcript_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + /** The final transcript of the audio. */ + transcript: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated audio is updated. */ +model ServerEventResponseAudioDelta extends ServerEvent { + @doc(""" + The event type, must be `response.audio.delta`. + """) + type: ServerEventType.response_audio_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; + + // Tool customization: use encoded type for audio data + /** Base64-encoded audio data delta. */ + @encode("base64") + delta: bytes; + + event_id?: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated audio is done. Also emitted when a Response + * is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseAudioDone extends ServerEvent { + @doc(""" + The event type, must be `response.audio.done`. + """) + type: ServerEventType.response_audio_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The index of the content part in the item's content array. */ + content_index: int32; +} + +@doc(""" +Represents a delta update of blendshape animation frames for a specific output of a response. +""") +model ServerEventResponseAnimationBlendshapeDelta extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + frames: float32[][] | string; + frame_index: int32; +} + +@doc(""" +Indicates the completion of blendshape animation processing for a specific output of a response. +""") +model ServerEventResponseAnimationBlendshapeDone extends ServerEvent { + type: ServerEventType.response_animation_blendshapes_done; + response_id: string; + item_id: string; + output_index: int32; +} + +@doc(""" +Represents an emotion hypothesis detected from response audio with multiple candidates. 
+""") +model ServerEventResponseEmotionHypothesis extends ServerEvent { + type: ServerEventType.response_emotion_hypothesis; + emotion: string; + candidates: EmotionCandidate[]; + audio_offset_ms: int32; + audio_duration_ms: int32; + response_id?: string; + item_id: string; +} + +@doc(""" +Represents a word-level audio timestamp delta for a response. +""") +model ServerEventResponseAudioTimestampDelta extends ServerEvent { + type: ServerEventType.response_audio_timestamp_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + audio_duration_ms: int32; + text: string; + timestamp_type: "word"; +} + +@doc(""" +Indicates completion of audio timestamp delivery for a response. +""") +model ServerEventResponseAudioTimestampDone extends ServerEvent { + type: ServerEventType.response_audio_timestamp_done; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; +} + +@doc(""" +Represents a viseme ID delta update for animation based on audio. +""") +model ServerEventResponseAnimationVisemeDelta extends ServerEvent { + type: ServerEventType.response_animation_viseme_delta; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; + audio_offset_ms: int32; + viseme_id: int32; +} + +@doc(""" +Indicates completion of viseme animation delivery for a response. +""") +model ServerEventResponseAnimationVisemeDone extends ServerEvent { + type: ServerEventType.response_animation_viseme_done; + response_id: string; + item_id: string; + output_index: int32; + content_index: int32; +} + +/** Create a new VoiceLive response with these parameters */ +model ResponseCreateParams { + @doc(""" + Whether to commit the response to the conversation. Defaults to true. + """) + commit?: boolean = true; + + @doc(""" + Whether to cancel any ongoing generation before starting this one. Defaults to true. + """) + cancel_previous?: boolean = true; + + @doc(""" + Input items to append to the conversation context before generating a response. + """) + append_input_items?: ConversationRequestItem[]; + + @doc(""" + Input items to be used as the context for this response. + An empty array clears previous context. + """) + input_items?: ConversationRequestItem[]; + + // Tool customization: Apply reusable modality representation + /** + * The set of modalities the model can respond with. To disable audio, + * set this to ["text"]. + */ + modalities?: Modality[]; + + @doc(""" + The default system instructions (i.e. system message) prepended to model + calls. This field allows the client to guide the model on desired + responses. The model can be instructed on response content and format, + (e.g. "be extremely succinct", "act friendly", "here are examples of good + responses") and on audio behavior (e.g. "talk quickly", "inject emotion + into your voice", "laugh frequently"). The instructions are not guaranteed + to be followed by the model, but they provide guidance to the model on the + desired behavior. + + Note that the server sets default instructions which will be used if this + field is not set and are visible in the `session.created` event at the + start of the session. + """) + instructions?: string; + + @doc(""" + supported voice identifiers and configurations. + """) + voice?: Voice; + + // Tool customization: use extracted and reusable audio format definition + @doc(""" + The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. 
+ """) + output_audio_format?: AudioFormat = AudioFormat.pcm16; + + // Tool customization: use enriched tool definition + /** Tools (functions) available to the model. */ + tools?: Tool[]; + + @doc(""" + How the model chooses tools. Options are `auto`, `none`, `required`, or + specify a function, like `{"type": "function", "function": {"name": "my_function"}}`. + """) + tool_choice?: string; + + /** Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. */ + temperature?: float32; + + // Tool customization: Address (observed as of 2025-01-31) spec issue with 'max_response_output_tokens' + @doc(""" + Maximum number of output tokens for a single assistant response, + inclusive of tool calls. Provide an integer between 1 and 4096 to + limit output tokens, or `inf` for the maximum available tokens for a + given model. Defaults to `inf`. + """) + max_output_tokens?: int32 | "inf"; +} + +// Tool customization (apply_discriminator): apply discriminated type base +@doc(""" + Send this event when you want to retrieve the server's representation of a specific item in the conversation history. This is useful, for example, to inspect user audio after noise cancellation and VAD. + The server will respond with a `conversation.item.retrieved` event, + unless the item does not exist in the conversation history, in which case the + server will respond with an error. + """) +model ClientEventConversationItemRetrieve extends ClientEvent { + @doc(""" + The event type, must be `conversation.item.retrieve`. + """) + type: ClientEventType.conversation_item_retrieve; + + /** The ID of the item to retrieve. */ + item_id: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the text value of an input audio transcription content part is updated. */ +model ServerEventConversationItemInputAudioTranscriptionDelta + extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.input_audio_transcription.delta`. + """) + type: ServerEventType.conversation_item_input_audio_transcription_delta; + + /** The ID of the item. */ + item_id: string; + + /** The index of the content part in the item's content array. */ + content_index?: int32; + + /** The text delta. */ + delta?: string; + + /** The log probabilities of the transcription. */ + logprobs?: LogProbProperties[] | null; +} + +// Tool customization (apply_discriminator): apply discriminated type +@doc(""" + Returned when a conversation item is retrieved with `conversation.item.retrieve`. + """) +model ServerEventConversationItemRetrieved extends ServerEvent { + @doc(""" + The event type, must be `conversation.item.retrieved`. + """) + type: ServerEventType.conversation_item_retrieved; + + // Tool customization: apply enriched item definition hierarchy + item?: ResponseItem; + event_id?: string; +} + +model EmotionCandidate { + emotion: string; + confidence: float32; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** Returned when the model-generated function call arguments are updated. */ +model ServerEventResponseFunctionCallArgumentsDelta extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.delta`. + """) + type: ServerEventType.response_function_call_arguments_delta; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. 
*/ + call_id: string; + + /** The arguments delta as a JSON string. */ + delta: string; +} + +// Tool customization (apply_discriminator): apply discriminated type +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +model ServerEventResponseFunctionCallArgumentsDone extends ServerEvent { + @doc(""" + The event type, must be `response.function_call_arguments.done`. + """) + type: ServerEventType.response_function_call_arguments_done; + + /** The ID of the response. */ + response_id: string; + + /** The ID of the function call item. */ + item_id: string; + + /** The index of the output item in the response. */ + output_index: int32; + + /** The ID of the function call. */ + call_id: string; + + /** The final arguments as a JSON string. */ + arguments: string; + + /** The name of the function call. */ + name: string; +} diff --git a/specification/ai/data-plane/VoiceLive/operations.tsp b/specification/ai/data-plane/VoiceLive/operations.tsp new file mode 100644 index 000000000000..7beb943459a7 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/operations.tsp @@ -0,0 +1,18 @@ +import "./common"; +import "./models.tsp"; +import "@azure-tools/typespec-azure-core"; + +using TypeSpec.Http; +using TypeSpec.OpenAPI; +using TypeSpec.Versioning; + +namespace VoiceLive; + +alias VoiceLiveBetaHeader = { + @header("VoiceLive-Beta") voiceLiveBeta: "voicelive=v1"; +}; + +enum Versions { + @useDependency(Azure.Core.Versions.v1_0_Preview_2) + v2025_05_01_preview: "2025-05-01-preview", +} \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/servers/websocket.tsp b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp new file mode 100644 index 000000000000..f54e1bcf5709 --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/servers/websocket.tsp @@ -0,0 +1,93 @@ +import "@typespec/http"; +import "@typespec/versioning"; +import "@azure-tools/typespec-azure-core"; + +import "../models.tsp"; +import "../operations.tsp"; + +using TypeSpec.Http; +using TypeSpec.Versioning; +using Azure.Core; + +@service(#{ title: "VoiceLive"}) +@versioned(VoiceLive.Versions) +@useAuth( + ApiKeyAuth | AadOauth2Auth<[ + "https://cognitiveservices.azure.com/.default" + ]> +) +@server( + "{endpoint}/voice-agent/realtime", + "VoiceLive Endpoint", + { + @doc(""" + Azure AI VoiceLive endpoint. 
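+
+      An illustrative (non-normative) value is
+      `https://{your-resource-name}.cognitiveservices.azure.com`; combined with
+      the route above, clients open a WebSocket connection against
+      `{endpoint}/voice-agent/realtime`.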
+ """) + endpoint: url, + } +) + + +namespace VoiceLive; + +// Union of all client events that can be passed into `force_models` +alias ForceModelClientEvent = + ClientEventSessionUpdate | + ClientEventInputAudioBufferAppend | + ClientEventInputAudioBufferCommit | + ClientEventInputAudioBufferClear | + ClientEventInputAudioTurnStart | + ClientEventInputAudioTurnAppend | + ClientEventInputAudioTurnEnd | + ClientEventInputAudioTurnCancel | + ClientEventInputAudioClear | + ClientEventConversationItemCreate | + ClientEventConversationItemRetrieve | + ClientEventConversationItemTruncate | + ClientEventConversationItemDelete | + ClientEventResponseCreate | + ClientEventResponseCancel | + ClientEventSessionAvatarConnect; + +// Union of all server events that can be returned from `force_models` +alias ForceModelServerEvent = + ServerEventSessionAvatarConnecting | + ServerEventSessionCreated | + ServerEventSessionUpdated | + ServerEventError | + ServerEventResponseTextDelta | + ServerEventResponseAudioDelta | + ServerEventConversationItemCreated | + ServerEventConversationItemDeleted | + ServerEventConversationItemRetrieved | + ServerEventConversationItemTruncated | + ServerEventConversationItemInputAudioTranscriptionCompleted | + ServerEventConversationItemInputAudioTranscriptionDelta | + ServerEventConversationItemInputAudioTranscriptionFailed | + ServerEventInputAudioBufferCommitted | + ServerEventInputAudioBufferCleared | + ServerEventInputAudioBufferSpeechStarted | + ServerEventInputAudioBufferSpeechStopped | + ServerEventResponseCreated | + ServerEventResponseDone | + ServerEventResponseOutputItemAdded | + ServerEventResponseOutputItemDone | + ServerEventResponseContentPartAdded | + ServerEventResponseContentPartDone | + ServerEventResponseTextDone | + ServerEventResponseAudioTranscriptDelta | + ServerEventResponseAudioTranscriptDone | + ServerEventResponseAudioDone | + ServerEventResponseFunctionCallArgumentsDelta | + ServerEventResponseFunctionCallArgumentsDone | + ServerEventResponseAnimationBlendshapeDelta | + ServerEventResponseAnimationBlendshapeDone | + ServerEventResponseEmotionHypothesis | + ServerEventResponseAudioTimestampDelta | + ServerEventResponseAudioTimestampDone | + ServerEventResponseAnimationVisemeDelta | + ServerEventResponseAnimationVisemeDone; + + +// Operation definition +op force_models(event: ForceModelClientEvent): ForceModelServerEvent; \ No newline at end of file diff --git a/specification/ai/data-plane/VoiceLive/tspconfig.yaml b/specification/ai/data-plane/VoiceLive/tspconfig.yaml new file mode 100644 index 000000000000..a96a6cd87f9a --- /dev/null +++ b/specification/ai/data-plane/VoiceLive/tspconfig.yaml @@ -0,0 +1,55 @@ +parameters: + "service-dir": + default: "sdk/ai" + "dependencies": + default: "" +emit: + - "@azure-tools/typespec-autorest" +linter: + extends: + - "@azure-tools/typespec-azure-rulesets/data-plane" +options: + "@azure-tools/typespec-autorest": + azure-resource-provider-folder: "data-plane" + emit-lro-options: "none" + emitter-output-dir: "{project-root}/.." 
+ output-file: "{azure-resource-provider-folder}/{service-name}/{version-status}/{version}/widgets.json" + "@azure-tools/typespec-python": + package-dir: "azure-ai-voicelive" + namespace: "azure.ai.voicelive" + generate-test: false + generate-sample: false + flavor: azure + package-name: "azure-ai-voicelive" + "@azure-tools/typespec-csharp": + package-dir: "Azure.AI.VoiceLive" + clear-output-folder: true + model-namespace: false + namespace: "{package-dir}" + flavor: azure + emitterPackageJsonPath: eng/azure-typespec-http-client-csharp-emitter-package.json + "@azure-typespec/http-client-csharp": + namespace: Azure.AI.VoiceLive + model-namespace: false + "@azure-tools/typespec-ts": + package-dir: "azure-ai-voicelive" + package-details: + name: "@azure-rest/ai-voicelive" + flavor: azure + "@azure-tools/typespec-java": + package-dir: "azure-ai-voicelive" + namespace: com.azure.ai.voicelive + flavor: azure + "@azure-tools/typespec-go": + module: "github.com/Azure/azure-sdk-for-go/{service-dir}/{package-dir}" + service-dir: "sdk/ai" + package-dir: "voicelive" + module-version: "0.0.1" + generate-fakes: true + inject-spans: true + single-client: true + slice-elements-byval: true + flavor: azure + "@azure-tools/typespec-client-generator-cli": + additionalDirectories: + - "specification/ai/data-plane/VoiceLive/" \ No newline at end of file