Skip to content

Commit d8d408e

Browse files
committed
feat: first version of voice input/output for robot agent
1 parent 0eb3dd2 commit d8d408e

File tree

1 file changed

+217
-0
lines changed

1 file changed

+217
-0
lines changed
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
import os
2+
from dotenv import load_dotenv
3+
4+
from std_msgs.msg import String
5+
6+
from rclpy.node import Node
7+
8+
import time
9+
10+
import speech_recognition as sr
11+
from piper import PiperVoice, SynthesisConfig
12+
import pyaudio
13+
14+
15+
load_dotenv() # This loads the variables from .env file
16+
17+
18+
class VoiceInOut(Node):
    """ROS 2 node providing voice input/output for the robot agent.

    Listens on the microphone in the background, transcribes speech with
    Whisper (via ``speech_recognition``) and publishes the text on the user
    query topic; subscribes to the LLM response topic and speaks responses
    out loud with a Piper TTS voice through a PyAudio output stream.
    """

    # Default ROS names; all are overridable via node parameters.
    user_query_topic = "/user_query"
    llm_response_topic = "/llm_response"
    input_audio_service = "/input_audio_service"  # service to request enabling of audio input for commands
    output_audio_service = "/output_audio_service"  # service to request enabling of audio output for responses

    # PyAudio output-stream settings: 16-bit mono at 22.05 kHz
    # (the sample rate produced by the Piper "medium" voices used below).
    format = pyaudio.paInt16
    rate = 22050
    channels = 1

    voice_gender = "female"  # or "male"; selects the Piper model in __init__

    can_listen = True  # whether the node should listen for audio user input
    can_talk = True  # whether the node should speak out loud the LLM responses

    def __init__(self):
        super().__init__("VoiceInOut_Node")

        self.llm_response_sub = None
        self.user_query_pub = None

        # Callable returned by listen_in_background(); invoked to stop listening.
        self.stop_listening = None

        # Initialize speech-to-text recognizer and microphone handle.
        self.stt_recognizer = sr.Recognizer()
        self.stt_mic = None

        self.start_listening()

        # Text-to-speech: map voice gender to the Piper model file.
        self.model_names = {
            "female": "en_US-amy-medium.onnx",
            "male": "en_US-kusal-medium.onnx",
        }

        current_file_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(
            current_file_dir,
            "models",
            self.voice_gender,
            self.model_names[self.voice_gender],
        )

        self.tts_voice = PiperVoice.load(model_path)

        # Neutral synthesis settings: all scales at 1.0 (no volume change,
        # normal speed, default noise), with audio normalization enabled.
        self.syn_config = SynthesisConfig(
            volume=1.0,
            length_scale=1.0,
            noise_scale=1.0,
            noise_w_scale=1.0,
            normalize_audio=True,
        )

        # Open the PyAudio output stream used by speak().
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=self.format, channels=self.channels, rate=self.rate, output=True
        )

        # Announce initialization of speech.
        # NOTE(review): this runs before _init_parameters(), so the
        # announcement uses the class-level defaults for can_talk and
        # voice_gender rather than any parameter overrides — confirm this
        # ordering is intended.
        self.speak("Voice input output node initialized.")
        self.speak("Listening for your commands.")

        self._init_parameters()
        self._init_publishers()
        self._init_subscriptions()

    ########################################## Initialization Methods ############################################################################

    def _init_parameters(self) -> None:
        """Declare and read ROS parameters (topic/service names and flags)."""
        self.declare_parameter("user_query_topic", self.user_query_topic)
        self.declare_parameter("llm_response_topic", self.llm_response_topic)

        self.declare_parameter("input_audio_service", self.input_audio_service)
        self.declare_parameter("output_audio_service", self.output_audio_service)

        self.declare_parameter("voice_gender", self.voice_gender)
        self.declare_parameter("can_listen", self.can_listen)
        self.declare_parameter("can_talk", self.can_talk)

        self.user_query_topic = (
            self.get_parameter("user_query_topic").get_parameter_value().string_value
        )

        self.llm_response_topic = (
            self.get_parameter("llm_response_topic").get_parameter_value().string_value
        )

        self.input_audio_service = (
            self.get_parameter("input_audio_service").get_parameter_value().string_value
        )
        self.output_audio_service = (
            self.get_parameter("output_audio_service")
            .get_parameter_value()
            .string_value
        )

        self.voice_gender = (
            self.get_parameter("voice_gender").get_parameter_value().string_value
        )

        self.can_listen = (
            self.get_parameter("can_listen").get_parameter_value().bool_value
        )
        self.can_talk = self.get_parameter("can_talk").get_parameter_value().bool_value

    def _init_publishers(self) -> None:
        """Method to initialize publishers"""
        self.user_query_pub = self.create_publisher(String, self.user_query_topic, 10)

    def _init_subscriptions(self) -> None:
        """Method to initialize subscriptions"""
        self.llm_response_sub = self.create_subscription(
            String, self.llm_response_topic, self.llm_response_callback, 10
        )

    def start_listening(self) -> None:
        """Start background listening for voice input."""
        self.get_logger().debug(
            f"Started listening for user query at {self.get_clock().now()}"
        )
        self.stt_mic = sr.Microphone()
        # Calibrate the recognizer's energy threshold to the ambient noise
        # before handing the microphone to the background listener.
        with self.stt_mic as source:
            self.stt_recognizer.adjust_for_ambient_noise(source)
        self.stop_listening = self.stt_recognizer.listen_in_background(
            self.stt_mic, self.publish_audio_as_text
        )

    def pause_listening(self) -> None:
        """Pause listening; used while text is being spoken out loud so the
        node does not transcribe its own speech."""
        if self.stop_listening is not None:
            self.stop_listening(wait_for_stop=False)
            self.get_logger().debug(
                f"Stopped listening for user query at {self.get_clock().now()}."
            )

    def publish_audio_as_text(self, recognizer, audio_input):
        """Background-listener callback: transcribe captured audio with
        Whisper and publish the non-empty result on the user query topic."""
        if not self.can_listen:
            return
        try:
            user_query = recognizer.recognize_whisper(
                audio_input, language="english"
            )
            if user_query.strip() == "":
                self.get_logger().debug("No speech detected.")
            else:
                user_query_msg = String()
                user_query_msg.data = user_query
                self.user_query_pub.publish(user_query_msg)
                self.get_logger().debug(
                    f"Published user query: {user_query_msg.data}"
                )

        except sr.UnknownValueError:
            # BUG FIX: get_logger is a method and must be called —
            # the original `self.get_logger.error(...)` raised AttributeError.
            self.get_logger().error("Whisper could not understand audio")
        except sr.RequestError as e:
            self.get_logger().error(f"Could not request results from Whisper; {e}")

    def speak(self, text: str) -> None:
        """Method to speak out loud the given text"""
        if not self.can_talk:
            self.get_logger().debug("can_talk is disabled; not speaking out loud.")
            return
        try:
            # Mute the microphone while audio is playing.
            self.pause_listening()
            self.get_logger().debug(f"Now speaking at {self.get_clock().now()}.")

            # Stream synthesized audio chunks straight to the output device.
            for chunk in self.tts_voice.synthesize(
                text, syn_config=self.syn_config
            ):
                self.stream.write(chunk.audio_int16_bytes)

            # Small grace period so the tail of the audio is not clipped.
            time.sleep(0.2)
            self.get_logger().debug(
                f"LLM response should have been spoken out loud at {self.get_clock().now()}."
            )
        except Exception as e:
            self.get_logger().error(f"Error during TTS: {e}")
        finally:
            # BUG FIX: resume listening even when synthesis fails; the
            # original only restarted on success, so one TTS error left the
            # node permanently deaf.
            self.start_listening()

    ########################################## Subscriber callback ############################################################################
    def llm_response_callback(self, msg):
        """Callback method to receive the LLM response and save/stream it as audio"""
        self.get_logger().debug(f"Received LLM response: {msg.data}")
        self.speak(str(msg.data))

    def destroy_node(self):
        """Release audio resources, then destroy the underlying ROS node."""
        try:
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()
        except Exception as e:
            # Best-effort cleanup: log but never block node destruction.
            self.get_logger().warn(f"Error closing audio resources: {e}")

        # Call base class destructor
        super().destroy_node()

0 commit comments

Comments
 (0)