diff --git a/genai/live/live_audiogen_with_txt.py b/genai/live/live_audiogen_with_txt.py
new file mode 100644
index 0000000000..cf7f24a6fc
--- /dev/null
+++ b/genai/live/live_audiogen_with_txt.py
@@ -0,0 +1,89 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+
+
+async def generate_content() -> None:
+    # [START googlegenaisdk_live_audiogen_with_txt]
+    import numpy as np
+    import scipy.io.wavfile as wavfile
+    from google import genai
+    from google.genai.types import (Content, LiveConnectConfig, Modality, Part,
+                                    PrebuiltVoiceConfig, SpeechConfig,
+                                    VoiceConfig)
+
+    client = genai.Client()
+    model = "gemini-2.0-flash-live-preview-04-09"
+    # For more voice options, see https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash#live-api-native-audio
+    voice_name = "Aoede"
+
+    config = LiveConnectConfig(
+        response_modalities=[Modality.AUDIO],
+        speech_config=SpeechConfig(
+            voice_config=VoiceConfig(
+                prebuilt_voice_config=PrebuiltVoiceConfig(
+                    voice_name=voice_name,
+                )
+            ),
+        ),
+    )
+
+    async with client.aio.live.connect(
+        model=model,
+        config=config,
+    ) as session:
+        text_input = "Hello? Gemini are you there?"
+        print("> ", text_input, "\n")
+
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
+
+        audio_data_chunks = []
+        async for message in session.receive():
+            if (
+                message.server_content.model_turn
+                and message.server_content.model_turn.parts
+            ):
+                for part in message.server_content.model_turn.parts:
+                    if part.inline_data:
+                        audio_data_chunks.append(
+                            np.frombuffer(part.inline_data.data, dtype=np.int16)
+                        )
+
+        if audio_data_chunks:
+            print("Received audio answer. Saving to local file...")
+            full_audio_array = np.concatenate(audio_data_chunks)
+
+            output_filename = "gemini_response.wav"
+            sample_rate = 24000
+
+            wavfile.write(output_filename, sample_rate, full_audio_array)
+            print(f"Audio saved to {output_filename}")
+
+    # Example output:
+    # > Hello? Gemini are you there?
+    # Received audio answer. Saving to local file...
+    # Audio saved to gemini_response.wav
+    # [END googlegenaisdk_live_audiogen_with_txt]
+    return None
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_code_exec_with_txt.py b/genai/live/live_code_exec_with_txt.py
new file mode 100644
index 0000000000..70db7402ee
--- /dev/null
+++ b/genai/live/live_code_exec_with_txt.py
@@ -0,0 +1,62 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_code_exec_with_txt]
+    from google import genai
+    from google.genai.types import (Content, LiveConnectConfig, Modality, Part,
+                                    Tool, ToolCodeExecution)
+
+    client = genai.Client()
+    model_id = "gemini-2.0-flash-live-preview-04-09"
+    config = LiveConnectConfig(
+        response_modalities=[Modality.TEXT],
+        tools=[Tool(code_execution=ToolCodeExecution())],
+    )
+    async with client.aio.live.connect(model=model_id, config=config) as session:
+        text_input = "Compute the largest prime palindrome under 10"
+        print("> ", text_input, "\n")
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
+
+        response = []
+
+        async for chunk in session.receive():
+            if chunk.server_content:
+                if chunk.text is not None:
+                    response.append(chunk.text)
+
+                model_turn = chunk.server_content.model_turn
+                if model_turn:
+                    for part in model_turn.parts:
+                        if part.executable_code is not None:
+                            print(part.executable_code.code)
+
+                        if part.code_execution_result is not None:
+                            print(part.code_execution_result.output)
+
+        print("".join(response))
+    # Example output:
+    # > Compute the largest prime palindrome under 10
+    # Final Answer: The final answer is $\boxed{7}$
+    # [END googlegenaisdk_live_code_exec_with_txt]
+    return response
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_func_call_with_txt.py b/genai/live/live_func_call_with_txt.py
new file mode 100644
index 0000000000..7761a49b7b
--- /dev/null
+++ b/genai/live/live_func_call_with_txt.py
@@ -0,0 +1,74 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+
+from google.genai.types import FunctionResponse
+
+
+async def generate_content() -> list[FunctionResponse]:
+    # [START googlegenaisdk_live_func_call_with_txt]
+    from google import genai
+    from google.genai.types import (Content, FunctionDeclaration,
+                                    FunctionResponse, LiveConnectConfig,
+                                    Modality, Part, Tool)
+
+    client = genai.Client()
+    model_id = "gemini-2.0-flash-live-preview-04-09"
+
+    # Simple function definitions
+    turn_on_the_lights = FunctionDeclaration(name="turn_on_the_lights")
+    turn_off_the_lights = FunctionDeclaration(name="turn_off_the_lights")
+
+    config = LiveConnectConfig(
+        response_modalities=[Modality.TEXT],
+        tools=[Tool(function_declarations=[turn_on_the_lights, turn_off_the_lights])],
+    )
+    async with client.aio.live.connect(model=model_id, config=config) as session:
+        text_input = "Turn on the lights please"
+        print("> ", text_input, "\n")
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
+
+        function_responses = []
+
+        async for chunk in session.receive():
+            if chunk.server_content:
+                if chunk.text is not None:
+                    print(chunk.text)
+
+            elif chunk.tool_call:
+
+                for fc in chunk.tool_call.function_calls:
+                    function_response = FunctionResponse(
+                        name=fc.name,
+                        response={
+                            "result": "ok"
+                        },  # simple, hard-coded function response
+                    )
+                    function_responses.append(function_response)
+                    print(function_response.response["result"])
+
+                await session.send_tool_response(function_responses=function_responses)
+
+    # Example output:
+    # > Turn on the lights please
+    # ok
+    # [END googlegenaisdk_live_func_call_with_txt]
+    return function_responses
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_ground_googsearch_with_txt.py b/genai/live/live_ground_googsearch_with_txt.py
new file mode 100644
index 0000000000..cfca4a87e1
--- /dev/null
+++ b/genai/live/live_ground_googsearch_with_txt.py
@@ -0,0 +1,64 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_ground_googsearch_with_txt]
+    from google import genai
+    from google.genai.types import (Content, GoogleSearch, LiveConnectConfig,
+                                    Modality, Part, Tool)
+
+    client = genai.Client()
+    model_id = "gemini-2.0-flash-live-preview-04-09"
+    config = LiveConnectConfig(
+        response_modalities=[Modality.TEXT],
+        tools=[Tool(google_search=GoogleSearch())],
+    )
+    async with client.aio.live.connect(model=model_id, config=config) as session:
+        text_input = "When did the last Brazil vs. Argentina soccer match happen?"
+        print("> ", text_input, "\n")
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
+
+        response = []
+
+        async for chunk in session.receive():
+            if chunk.server_content:
+                if chunk.text is not None:
+                    response.append(chunk.text)
+
+                # The model might generate and execute Python code to use Search
+                model_turn = chunk.server_content.model_turn
+                if model_turn:
+                    for part in model_turn.parts:
+                        if part.executable_code is not None:
+                            print(part.executable_code.code)
+
+                        if part.code_execution_result is not None:
+                            print(part.code_execution_result.output)
+
+        print("".join(response))
+    # Example output:
+    # > When did the last Brazil vs. Argentina soccer match happen?
+    # The last Brazil vs. Argentina soccer match was on March 25, 2025, a 2026 World Cup qualifier, where Argentina defeated Brazil 4-1.
+    # [END googlegenaisdk_live_ground_googsearch_with_txt]
+    return response
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_structured_ouput_with_txt.py b/genai/live/live_structured_ouput_with_txt.py
new file mode 100644
index 0000000000..f0b2466ff5
--- /dev/null
+++ b/genai/live/live_structured_ouput_with_txt.py
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+from pydantic import BaseModel
+
+
+class CalendarEvent(BaseModel):
+    name: str
+    date: str
+    participants: list[str]
+
+
+def generate_content() -> CalendarEvent:
+    # [START googlegenaisdk_live_structured_ouput_with_txt]
+    import os
+
+    import google.auth.transport.requests
+    import openai
+    from google.auth import default
+    from openai.types.chat import (ChatCompletionSystemMessageParam,
+                                   ChatCompletionUserMessageParam)
+
+    project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+    location = "us-central1"
+
+    # Programmatically get an access token
+    credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+    credentials.refresh(google.auth.transport.requests.Request())
+    # Note: the credential lives for 1 hour by default (https://cloud.google.com/docs/authentication/token-types#at-lifetime); after expiration, it must be refreshed.
+
+    ##############################
+    # Choose one of the following:
+    ##############################
+
+    # If you are calling a Gemini model, set the ENDPOINT_ID variable to use openapi.
+    ENDPOINT_ID = "openapi"
+
+    # If you are calling a self-deployed model from Model Garden, set the
+    # ENDPOINT_ID variable and set the client's base URL to use your endpoint.
+    # ENDPOINT_ID = "YOUR_ENDPOINT_ID"
+
+    # OpenAI Client
+    client = openai.OpenAI(
+        base_url=f"https://{location}-aiplatform.googleapis.com/v1beta1/projects/{project_id}/locations/{location}/endpoints/{ENDPOINT_ID}",
+        api_key=credentials.token,
+    )
+
+    completion = client.beta.chat.completions.parse(
+        model="google/gemini-2.0-flash-001",
+        messages=[
+            ChatCompletionSystemMessageParam(
+                role="system", content="Extract the event information."
+            ),
+            ChatCompletionUserMessageParam(
+                role="user",
+                content="Alice and Bob are going to a science fair on Friday.",
+            ),
+        ],
+        response_format=CalendarEvent,
+    )
+
+    response = completion.choices[0].message.parsed
+    print(response)
+
+    # System message: Extract the event information.
+    # User message: Alice and Bob are going to a science fair on Friday.
+    # Output message: name='science fair' date='Friday' participants=['Alice', 'Bob']
+    # [END googlegenaisdk_live_structured_ouput_with_txt]
+    return response
+
+
+if __name__ == "__main__":
+    generate_content()
diff --git a/genai/live/live_transcribe_with_audio.py b/genai/live/live_transcribe_with_audio.py
new file mode 100644
index 0000000000..b702672bc7
--- /dev/null
+++ b/genai/live/live_transcribe_with_audio.py
@@ -0,0 +1,67 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_transcribe_with_audio]
+    from google import genai
+    from google.genai.types import (AudioTranscriptionConfig, Content,
+                                    LiveConnectConfig, Modality, Part)
+
+    client = genai.Client()
+    model = "gemini-live-2.5-flash-preview-native-audio"
+    config = LiveConnectConfig(
+        response_modalities=[Modality.AUDIO],
+        input_audio_transcription=AudioTranscriptionConfig(),
+        output_audio_transcription=AudioTranscriptionConfig(),
+    )
+
+    async with client.aio.live.connect(model=model, config=config) as session:
+        input_txt = "Hello? Gemini are you there?"
+        print(f"> {input_txt}")
+
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=input_txt)])
+        )
+
+        response = []
+
+        async for message in session.receive():
+            if message.server_content.model_turn:
+                print("Model turn:", message.server_content.model_turn)
+            if message.server_content.input_transcription:
+                print(
+                    "Input transcript:", message.server_content.input_transcription.text
+                )
+            if message.server_content.output_transcription:
+                if message.server_content.output_transcription.text:
+                    response.append(message.server_content.output_transcription.text)
+
+        print("".join(response))
+
+    # Example output:
+    # > Hello? Gemini are you there?
+    # Yes, I'm here. What would you like to talk about?
+    # [END googlegenaisdk_live_transcribe_with_audio]
+    return response
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_txtgen_with_audio.py b/genai/live/live_txtgen_with_audio.py
new file mode 100644
index 0000000000..175ec89f67
--- /dev/null
+++ b/genai/live/live_txtgen_with_audio.py
@@ -0,0 +1,78 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+from pathlib import Path
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_txtgen_with_audio]
+    import requests
+    import soundfile as sf
+    from google import genai
+    from google.genai.types import Blob, LiveConnectConfig, Modality
+
+    client = genai.Client()
+    model = "gemini-2.0-flash-live-preview-04-09"
+    config = LiveConnectConfig(response_modalities=[Modality.TEXT])
+
+    def get_audio(url: str) -> bytes:
+        input_path = Path("temp_input.wav")
+        output_path = Path("temp_output.pcm")
+
+        input_path.write_bytes(requests.get(url).content)
+
+        y, sr = sf.read(input_path)
+        sf.write(output_path, y, sr, format="RAW", subtype="PCM_16")
+
+        audio = output_path.read_bytes()
+
+        input_path.unlink(missing_ok=True)
+        output_path.unlink(missing_ok=True)
+        return audio
+
+    async with client.aio.live.connect(model=model, config=config) as session:
+        audio_url = "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
+        audio_bytes = get_audio(audio_url)
+
+        # If you've pre-converted to sample.pcm using ffmpeg, use this instead:
+        # from pathlib import Path
+        # audio_bytes = Path("sample.pcm").read_bytes()
+
+        print("> Answer to this audio url", audio_url, "\n")
+
+        await session.send_realtime_input(
+            media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
+        )
+
+        response = []
+
+        async for message in session.receive():
+            if message.text is not None:
+                response.append(message.text)
+
+        print("".join(response))
+    # Example output:
+    # > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+    # Yes, I can hear you. How can I help you today?
+    # [END googlegenaisdk_live_txtgen_with_audio]
+    return response
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
diff --git a/genai/live/live_websocket_audiogen_with_txt.py b/genai/live/live_websocket_audiogen_with_txt.py
index f7b6f07e5f..b63e60aaac 100644
--- a/genai/live/live_websocket_audiogen_with_txt.py
+++ b/genai/live/live_websocket_audiogen_with_txt.py
@@ -39,10 +39,10 @@ async def generate_content() -> str:
     # [START googlegenaisdk_live_audiogen_websocket_with_txt]
     import base64
     import json
-    import numpy as np
 
-    from websockets.asyncio.client import connect
+    import numpy as np
     from scipy.io import wavfile
+    from websockets.asyncio.client import connect
 
     # Configuration Constants
     PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
diff --git a/genai/live/live_websocket_audiotranscript_with_txt.py b/genai/live/live_websocket_audiotranscript_with_txt.py
index 5192b81ef1..6b769639eb 100644
--- a/genai/live/live_websocket_audiotranscript_with_txt.py
+++ b/genai/live/live_websocket_audiotranscript_with_txt.py
@@ -39,10 +39,10 @@ async def generate_content() -> str:
     # [START googlegenaisdk_live_websocket_audiotranscript_with_txt]
     import base64
     import json
-    import numpy as np
 
-    from websockets.asyncio.client import connect
+    import numpy as np
     from scipy.io import wavfile
+    from websockets.asyncio.client import connect
 
     # Configuration Constants
     PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
diff --git a/genai/live/live_websocket_textgen_with_audio.py b/genai/live/live_websocket_textgen_with_audio.py
index de6fd9d55c..00923d3931 100644
--- a/genai/live/live_websocket_textgen_with_audio.py
+++ b/genai/live/live_websocket_textgen_with_audio.py
@@ -40,8 +40,8 @@ async def generate_content() -> str:
     import base64
     import json
 
-    from websockets.asyncio.client import connect
     from scipy.io import wavfile
+    from websockets.asyncio.client import connect
 
     def read_wavefile(filepath: str) -> tuple[str, str]:
         # Read the .wav file using scipy.io.wavfile.read
diff --git a/genai/live/live_websocket_textgen_with_txt.py b/genai/live/live_websocket_textgen_with_txt.py
index b36487cc9a..56b6947205 100644
--- a/genai/live/live_websocket_textgen_with_txt.py
+++ b/genai/live/live_websocket_textgen_with_txt.py
@@ -38,6 +38,7 @@ async def generate_content() -> str:
     """
     # [START googlegenaisdk_live_websocket_with_txt]
     import json
+
     from websockets.asyncio.client import connect
 
     # Configuration Constants
diff --git a/genai/live/live_with_txt.py b/genai/live/live_with_txt.py
index a3c7518843..8b8b090812 100644
--- a/genai/live/live_with_txt.py
+++ b/genai/live/live_with_txt.py
@@ -18,13 +18,8 @@
 async def generate_content() -> list[str]:
     # [START googlegenaisdk_live_with_txt]
     from google import genai
-    from google.genai.types import (
-        Content,
-        LiveConnectConfig,
-        HttpOptions,
-        Modality,
-        Part,
-    )
+    from google.genai.types import (Content, HttpOptions, LiveConnectConfig,
+                                    Modality, Part)
 
     client = genai.Client(http_options=HttpOptions(api_version="v1beta1"))
     model_id = "gemini-2.0-flash-live-preview-04-09"
@@ -35,7 +30,9 @@ async def generate_content() -> list[str]:
     ) as session:
         text_input = "Hello? Gemini, are you there?"
         print("> ", text_input, "\n")
-        await session.send_client_content(turns=Content(role="user", parts=[Part(text=text_input)]))
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)])
+        )
 
         response = []
 
diff --git a/genai/live/requirements-test.txt b/genai/live/requirements-test.txt
index 4fb57f7f08..1b59fd9d24 100644
--- a/genai/live/requirements-test.txt
+++ b/genai/live/requirements-test.txt
@@ -1,4 +1,4 @@
 backoff==2.2.1
-google-api-core==2.19.0
-pytest==8.2.0
-pytest-asyncio==0.25.3
+google-api-core==2.25.1
+pytest==8.4.1
+pytest-asyncio==1.1.0
diff --git a/genai/live/requirements.txt b/genai/live/requirements.txt
index be9472583c..dd1891ee07 100644
--- a/genai/live/requirements.txt
+++ b/genai/live/requirements.txt
@@ -1,3 +1,7 @@
-google-genai==1.27.0
-scipy==1.15.3
-websockets==15.0.1
\ No newline at end of file
+google-genai==1.28.0
+scipy==1.16.1
+websockets==15.0.1
+numpy==1.26.4
+soundfile==0.12.1
+openai==1.99.1
+setuptools==80.9.0
\ No newline at end of file
diff --git a/genai/live/test_live_examples.py b/genai/live/test_live_examples.py
index ce38253986..f4d25e137e 100644
--- a/genai/live/test_live_examples.py
+++ b/genai/live/test_live_examples.py
@@ -20,6 +20,13 @@
 
 import pytest
 
+import live_audiogen_with_txt
+import live_code_exec_with_txt
+import live_func_call_with_txt
+import live_ground_googsearch_with_txt
+import live_structured_ouput_with_txt
+import live_transcribe_with_audio
+import live_txtgen_with_audio
 import live_websocket_audiogen_with_txt
 import live_websocket_audiotranscript_with_txt
 import live_websocket_textgen_with_audio
@@ -55,3 +62,39 @@ async def test_live_websocket_audiotranscript_with_txt() -> None:
 @pytest.mark.asyncio
 async def test_live_websocket_audiotranscript_with_txt() -> None:
     assert await live_websocket_audiotranscript_with_txt.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_audiogen_with_txt() -> None:
+    result = await live_audiogen_with_txt.generate_content()
+    assert result is None
+
+
+@pytest.mark.asyncio
+async def test_live_code_exec_with_txt() -> None:
+    assert await live_code_exec_with_txt.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_func_call_with_txt() -> None:
+    assert await live_func_call_with_txt.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_ground_googsearch_with_txt() -> None:
+    assert await live_ground_googsearch_with_txt.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_transcribe_with_audio() -> None:
+    assert await live_transcribe_with_audio.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_txtgen_with_audio() -> None:
+    assert await live_txtgen_with_audio.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_structured_ouput_with_txt() -> None:
+    assert live_structured_ouput_with_txt.generate_content()