6 changes: 6 additions & 0 deletions .changeset/fast-rocks-sing.md
@@ -0,0 +1,6 @@
---
'firebase': minor
'@firebase/ai': minor
---

Add support for audio transcriptions in the Live API.
13 changes: 13 additions & 0 deletions common/api-review/ai.api.md
Expand Up @@ -92,6 +92,10 @@ export interface AudioConversationController {
stop: () => Promise<void>;
}

// @public
export interface AudioTranscriptionConfig {
}

// @public
export abstract class Backend {
protected constructor(type: BackendType);
Expand Down Expand Up @@ -922,7 +926,9 @@ export interface LanguageModelPromptOptions {
// @beta
export interface LiveGenerationConfig {
frequencyPenalty?: number;
inputAudioTranscription?: AudioTranscriptionConfig;
maxOutputTokens?: number;
outputAudioTranscription?: AudioTranscriptionConfig;
presencePenalty?: number;
responseModalities?: ResponseModality[];
speechConfig?: SpeechConfig;
Expand Down Expand Up @@ -975,8 +981,10 @@ export type LiveResponseType = (typeof LiveResponseType)[keyof typeof LiveRespon

// @beta
export interface LiveServerContent {
inputTranscription?: Transcription;
interrupted?: boolean;
modelTurn?: Content;
outputTranscription?: Transcription;
turnComplete?: boolean;
// (undocumented)
type: 'serverContent';
Expand Down Expand Up @@ -1342,6 +1350,11 @@ export interface ToolConfig {
functionCallingConfig?: FunctionCallingConfig;
}

// @beta
export interface Transcription {
text?: string;
}

// @public
export type TypedSchema = IntegerSchema | NumberSchema | StringSchema | BooleanSchema | ObjectSchema | ArraySchema | AnyOfSchema;

Expand Down
4 changes: 4 additions & 0 deletions docs-devsite/_toc.yaml
Expand Up @@ -18,6 +18,8 @@ toc:
path: /docs/reference/js/ai.arrayschema.md
- title: AudioConversationController
path: /docs/reference/js/ai.audioconversationcontroller.md
- title: AudioTranscriptionConfig
path: /docs/reference/js/ai.audiotranscriptionconfig.md
- title: Backend
path: /docs/reference/js/ai.backend.md
- title: BaseParams
Expand Down Expand Up @@ -202,6 +204,8 @@ toc:
path: /docs/reference/js/ai.thinkingconfig.md
- title: ToolConfig
path: /docs/reference/js/ai.toolconfig.md
- title: Transcription
path: /docs/reference/js/ai.transcription.md
- title: URLContext
path: /docs/reference/js/ai.urlcontext.md
- title: URLContextMetadata
Expand Down
19 changes: 19 additions & 0 deletions docs-devsite/ai.audiotranscriptionconfig.md
@@ -0,0 +1,19 @@
Project: /docs/reference/js/_project.yaml
Book: /docs/reference/_book.yaml
page_type: reference

{% comment %}
DO NOT EDIT THIS FILE!
This is generated by the JS SDK team, and any local changes will be
overwritten. Changes should be made in the source code at
https://github.com/firebase/firebase-js-sdk
{% endcomment %}

# AudioTranscriptionConfig interface
The audio transcription configuration.

<b>Signature:</b>

```typescript
export interface AudioTranscriptionConfig
```
32 changes: 32 additions & 0 deletions docs-devsite/ai.livegenerationconfig.md
Expand Up @@ -26,7 +26,9 @@ export interface LiveGenerationConfig
| Property | Type | Description |
| --- | --- | --- |
| [frequencyPenalty](./ai.livegenerationconfig.md#livegenerationconfigfrequencypenalty) | number | <b><i>(Public Preview)</i></b> Frequency penalties. |
| [inputAudioTranscription](./ai.livegenerationconfig.md#livegenerationconfiginputaudiotranscription) | [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | <b><i>(Public Preview)</i></b> Enables transcription of audio input.<!-- -->When enabled, the model will respond with transcriptions of your audio input in the <code>inputTranscription</code> property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if you ask the model "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?". |
| [maxOutputTokens](./ai.livegenerationconfig.md#livegenerationconfigmaxoutputtokens) | number | <b><i>(Public Preview)</i></b> Specifies the maximum number of tokens that can be generated in the response. The number of tokens per word varies depending on the output language. This is unbounded by default. |
| [outputAudioTranscription](./ai.livegenerationconfig.md#livegenerationconfigoutputaudiotranscription) | [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | <b><i>(Public Preview)</i></b> Enables transcription of audio output.<!-- -->When enabled, the model will respond with transcriptions of its audio output in the <code>outputTranscription</code> property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if the model says "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?". |
| [presencePenalty](./ai.livegenerationconfig.md#livegenerationconfigpresencepenalty) | number | <b><i>(Public Preview)</i></b> Positive penalties. |
| [responseModalities](./ai.livegenerationconfig.md#livegenerationconfigresponsemodalities) | [ResponseModality](./ai.md#responsemodality)<!-- -->\[\] | <b><i>(Public Preview)</i></b> The modalities of the response. |
| [speechConfig](./ai.livegenerationconfig.md#livegenerationconfigspeechconfig) | [SpeechConfig](./ai.speechconfig.md#speechconfig_interface) | <b><i>(Public Preview)</i></b> Configuration for speech synthesis. |
Expand All @@ -47,6 +49,21 @@ Frequency penalties.
frequencyPenalty?: number;
```

## LiveGenerationConfig.inputAudioTranscription

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Enables transcription of audio input.

When enabled, the model will respond with transcriptions of your audio input in the `inputTranscription` property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if you ask the model "How are you today?", the model may transcribe that input across three messages, broken up as "How a", "re yo", "u today?".

<b>Signature:</b>

```typescript
inputAudioTranscription?: AudioTranscriptionConfig;
```

## LiveGenerationConfig.maxOutputTokens

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
Expand All @@ -60,6 +77,21 @@ Specifies the maximum number of tokens that can be generated in the response. Th
maxOutputTokens?: number;
```

## LiveGenerationConfig.outputAudioTranscription

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Enables transcription of audio output.

When enabled, the model will respond with transcriptions of its audio output in the `outputTranscription` property in [LiveServerContent](./ai.liveservercontent.md#liveservercontent_interface) messages. Note that the transcriptions are broken up across messages, so you may only receive small amounts of text per message. For example, if the model says "How are you today?", the model may transcribe that output across three messages, broken up as "How a", "re yo", "u today?".

<b>Signature:</b>

```typescript
outputAudioTranscription?: AudioTranscriptionConfig;
```
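
A minimal usage sketch (not part of the generated reference; the model name is an assumption). Since `AudioTranscriptionConfig` currently has no fields, passing empty objects is enough to enable transcription:

```typescript
import { initializeApp } from 'firebase/app';
import { getAI, getLiveGenerativeModel, ResponseModality } from 'firebase/ai';

const app = initializeApp({ /* your Firebase config */ });
const ai = getAI(app);

// Enable transcription of both the user's audio input and the model's audio output.
const model = getLiveGenerativeModel(ai, {
  model: 'gemini-2.0-flash-live-preview-04-09', // assumed live-capable model name
  generationConfig: {
    responseModalities: [ResponseModality.AUDIO],
    inputAudioTranscription: {},
    outputAudioTranscription: {}
  }
});
```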

## LiveGenerationConfig.presencePenalty

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
Expand Down
28 changes: 28 additions & 0 deletions docs-devsite/ai.liveservercontent.md
Expand Up @@ -25,11 +25,26 @@ export interface LiveServerContent

| Property | Type | Description |
| --- | --- | --- |
| [inputTranscription](./ai.liveservercontent.md#liveservercontentinputtranscription) | [Transcription](./ai.transcription.md#transcription_interface) | <b><i>(Public Preview)</i></b> Transcription of the audio that was input to the model. |
| [interrupted](./ai.liveservercontent.md#liveservercontentinterrupted) | boolean | <b><i>(Public Preview)</i></b> Indicates whether the model was interrupted by the client. An interruption occurs when the client sends a message before the model finishes its turn. This is <code>undefined</code> if the model was not interrupted. |
| [modelTurn](./ai.liveservercontent.md#liveservercontentmodelturn) | [Content](./ai.content.md#content_interface) | <b><i>(Public Preview)</i></b> The content that the model has generated as part of the current conversation with the user. |
| [outputTranscription](./ai.liveservercontent.md#liveservercontentoutputtranscription) | [Transcription](./ai.transcription.md#transcription_interface) | <b><i>(Public Preview)</i></b> Transcription of the audio output from the model. |
| [turnComplete](./ai.liveservercontent.md#liveservercontentturncomplete) | boolean | <b><i>(Public Preview)</i></b> Indicates whether the turn is complete. This is <code>undefined</code> if the turn is not complete. |
| [type](./ai.liveservercontent.md#liveservercontenttype) | 'serverContent' | <b><i>(Public Preview)</i></b> |

## LiveServerContent.inputTranscription

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Transcription of the audio that was input to the model.

<b>Signature:</b>

```typescript
inputTranscription?: Transcription;
```

## LiveServerContent.interrupted

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
Expand All @@ -56,6 +71,19 @@ The content that the model has generated as part of the current conversation wit
modelTurn?: Content;
```

## LiveServerContent.outputTranscription

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Transcription of the audio output from the model.

<b>Signature:</b>

```typescript
outputTranscription?: Transcription;
```
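
A hedged sketch of consuming these fields, assuming `model` is a `LiveGenerativeModel` configured with `inputAudioTranscription` and `outputAudioTranscription` enabled (see [LiveGenerationConfig](./ai.livegenerationconfig.md#livegenerationconfig_interface)). Transcription text arrives in small fragments, so it is accumulated across messages:

```typescript
const session = await model.connect();

let inputText = '';
let outputText = '';

for await (const message of session.receive()) {
  if (message.type === 'serverContent') {
    // Append each fragment; a single utterance may span many messages.
    inputText += message.inputTranscription?.text ?? '';
    outputText += message.outputTranscription?.text ?? '';

    if (message.turnComplete) {
      console.log('User said:', inputText);
      console.log('Model said:', outputText);
      inputText = '';
      outputText = '';
    }
  }
}
```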

## LiveServerContent.turnComplete

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
Expand Down
2 changes: 2 additions & 0 deletions docs-devsite/ai.md
Expand Up @@ -56,6 +56,7 @@ The Firebase AI Web SDK.
| [AI](./ai.ai.md#ai_interface) | An instance of the Firebase AI SDK.<!-- -->Do not create this instance directly. Instead, use [getAI()](./ai.md#getai_a94a413)<!-- -->. |
| [AIOptions](./ai.aioptions.md#aioptions_interface) | Options for initializing the AI service using [getAI()](./ai.md#getai_a94a413)<!-- -->. This allows specifying which backend to use (Vertex AI Gemini API or Gemini Developer API) and configuring its specific options (like location for Vertex AI). |
| [AudioConversationController](./ai.audioconversationcontroller.md#audioconversationcontroller_interface) | <b><i>(Public Preview)</i></b> A controller for managing an active audio conversation. |
| [AudioTranscriptionConfig](./ai.audiotranscriptionconfig.md#audiotranscriptionconfig_interface) | The audio transcription configuration. |
| [BaseParams](./ai.baseparams.md#baseparams_interface) | Base parameters for a number of methods. |
| [ChromeAdapter](./ai.chromeadapter.md#chromeadapter_interface) | <b><i>(Public Preview)</i></b> Defines an inference "backend" that uses Chrome's on-device model, and encapsulates logic for detecting when on-device inference is possible.<!-- -->These methods should not be called directly by the user. |
| [Citation](./ai.citation.md#citation_interface) | A single citation. |
Expand Down Expand Up @@ -134,6 +135,7 @@ The Firebase AI Web SDK.
| [TextPart](./ai.textpart.md#textpart_interface) | Content part interface if the part represents a text string. |
| [ThinkingConfig](./ai.thinkingconfig.md#thinkingconfig_interface) | Configuration for "thinking" behavior of compatible Gemini models.<!-- -->Certain models utilize a thinking process before generating a response. This allows them to reason through complex problems and plan a more coherent and accurate answer. |
| [ToolConfig](./ai.toolconfig.md#toolconfig_interface) | Tool config. This config is shared for all tools provided in the request. |
| [Transcription](./ai.transcription.md#transcription_interface) | <b><i>(Public Preview)</i></b> Transcription of audio. This can be returned from a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) if transcription is enabled with the <code>inputAudioTranscription</code> or <code>outputAudioTranscription</code> properties on the [LiveGenerationConfig](./ai.livegenerationconfig.md#livegenerationconfig_interface)<!-- -->. |
| [URLContext](./ai.urlcontext.md#urlcontext_interface) | <b><i>(Public Preview)</i></b> Specifies the URL Context configuration. |
| [URLContextMetadata](./ai.urlcontextmetadata.md#urlcontextmetadata_interface) | <b><i>(Public Preview)</i></b> Metadata related to [URLContextTool](./ai.urlcontexttool.md#urlcontexttool_interface)<!-- -->. |
| [URLContextTool](./ai.urlcontexttool.md#urlcontexttool_interface) | <b><i>(Public Preview)</i></b> A tool that allows you to provide additional context to the models in the form of public web URLs. By including URLs in your request, the Gemini model will access the content from those pages to inform and enhance its response. |
Expand Down
41 changes: 41 additions & 0 deletions docs-devsite/ai.transcription.md
@@ -0,0 +1,41 @@
Project: /docs/reference/js/_project.yaml
Book: /docs/reference/_book.yaml
page_type: reference

{% comment %}
DO NOT EDIT THIS FILE!
This is generated by the JS SDK team, and any local changes will be
overwritten. Changes should be made in the source code at
https://github.com/firebase/firebase-js-sdk
{% endcomment %}

# Transcription interface
> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

Transcription of audio. This can be returned from a [LiveGenerativeModel](./ai.livegenerativemodel.md#livegenerativemodel_class) if transcription is enabled with the `inputAudioTranscription` or `outputAudioTranscription` properties on the [LiveGenerationConfig](./ai.livegenerationconfig.md#livegenerationconfig_interface)<!-- -->.

<b>Signature:</b>

```typescript
export interface Transcription
```

## Properties

| Property | Type | Description |
| --- | --- | --- |
| [text](./ai.transcription.md#transcriptiontext) | string | <b><i>(Public Preview)</i></b> The text transcription of the audio. |

## Transcription.text

> This API is provided as a preview for developers and may change based on feedback that we receive. Do not use this API in a production environment.
>

The text transcription of the audio.

<b>Signature:</b>

```typescript
text?: string;
```
50 changes: 50 additions & 0 deletions packages/ai/integration/live.test.ts
Expand Up @@ -270,6 +270,56 @@ describe('Live', function () {
});
});

describe('Transcripts', async () => {
it('should receive transcript of audio input', async () => {
const model = getLiveGenerativeModel(testConfig.ai, {
model: testConfig.model,
generationConfig: {
responseModalities: [ResponseModality.AUDIO],
inputAudioTranscription: {},
outputAudioTranscription: {}
}
});
const session = await model.connect();
const stream = session.receive();

await session.sendAudioRealtime({
data: HELLO_AUDIO_PCM_BASE64,
mimeType: 'audio/pcm'
});

let aggregatedInputTranscription = '';
let aggregatedOutputTranscription = '';
let result = await stream.next();
while (!result.done) {
const chunk = result.value as
| LiveServerContent
| LiveServerToolCall
| LiveServerToolCallCancellation;
if (chunk.type === 'serverContent') {
if (chunk.turnComplete) {
break;
}

if (chunk.inputTranscription) {
aggregatedInputTranscription += chunk.inputTranscription?.text;
}
if (chunk.outputTranscription) {
aggregatedOutputTranscription +=
chunk.outputTranscription?.text;
}
}

result = await stream.next();
}

expect(aggregatedInputTranscription).to.not.be.empty;
expect(aggregatedOutputTranscription).to.not.be.empty;

await session.close();
});
});

/**
* These tests are currently very unreliable. Their behavior seems to change frequently.
* Skipping them for now.
Expand Down
31 changes: 31 additions & 0 deletions packages/ai/src/models/live-generative-model.test.ts
Expand Up @@ -168,4 +168,35 @@ describe('LiveGenerativeModel', () => {
mockHandler.simulateServerMessage({ setupComplete: true });
await connectPromise;
});
it('connect() should deconstruct generationConfig to send transcription configs in top level setup', async () => {
const model = new LiveGenerativeModel(
fakeAI,
{
model: 'gemini-pro',
generationConfig: {
temperature: 0.8,
inputAudioTranscription: {},
outputAudioTranscription: {}
},
systemInstruction: { role: 'system', parts: [{ text: 'Be a pirate' }] }
},
mockHandler
);
const connectPromise = model.connect();

// Wait for setup message
await clock.runAllAsync();

const sentData = JSON.parse(mockHandler.send.getCall(0).args[0]);
// inputAudioTranscription and outputAudioTranscription should be at the top-level setup message,
// rather than in the generationConfig.
expect(sentData.setup.generationConfig).to.deep.equal({ temperature: 0.8 });
expect(sentData.setup.inputAudioTranscription).to.deep.equal({});
expect(sentData.setup.outputAudioTranscription).to.deep.equal({});
expect(sentData.setup.systemInstruction.parts[0].text).to.equal(
'Be a pirate'
);
mockHandler.simulateServerMessage({ setupComplete: true });
await connectPromise;
});
});
14 changes: 12 additions & 2 deletions packages/ai/src/models/live-generative-model.ts
Expand Up @@ -86,13 +86,23 @@ export class LiveGenerativeModel extends AIModel {
fullModelPath = `projects/${this._apiSettings.project}/locations/${this._apiSettings.location}/${this.model}`;
}

// inputAudioTranscription and outputAudioTranscription are on the generation config in the public API,
// but the backend expects them to be in the `setup` message.
const {
inputAudioTranscription,
outputAudioTranscription,
...generationConfig
} = this.generationConfig;

const setupMessage: _LiveClientSetup = {
setup: {
model: fullModelPath,
generationConfig: this.generationConfig,
generationConfig,
tools: this.tools,
toolConfig: this.toolConfig,
systemInstruction: this.systemInstruction
systemInstruction: this.systemInstruction,
inputAudioTranscription,
outputAudioTranscription
}
};

Expand Down
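
For illustration only (field values are assumptions, not taken from the diff): with a config like `{ temperature: 0.8, inputAudioTranscription: {}, outputAudioTranscription: {} }`, the destructuring above yields a setup message where the transcription configs sit alongside `generationConfig` rather than inside it, roughly:

```typescript
// Rough shape of the resulting setup payload (illustrative values only).
const setupMessage = {
  setup: {
    model: 'projects/<project>/locations/<location>/<model>',
    generationConfig: { temperature: 0.8 },
    inputAudioTranscription: {},
    outputAudioTranscription: {},
    systemInstruction: { role: 'system', parts: [{ text: 'Be a pirate' }] }
  }
};
```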