|
| 1 | +import os |
| 2 | +from dotenv import load_dotenv |
| 3 | + |
| 4 | +from std_msgs.msg import String |
| 5 | + |
| 6 | +from rclpy.node import Node |
| 7 | + |
| 8 | +import time |
| 9 | + |
| 10 | +import speech_recognition as sr |
| 11 | +from piper import PiperVoice, SynthesisConfig |
| 12 | +import pyaudio |
| 13 | + |
| 14 | + |
load_dotenv()  # Load environment variables from a local .env file at import time.
| 16 | + |
| 17 | + |
class VoiceInOut(Node):
    """ROS 2 node bridging spoken user input and spoken LLM output.

    Continuously transcribes microphone audio with Whisper (via the
    ``speech_recognition`` background listener) and publishes the text on
    ``user_query_topic``; subscribes to ``llm_response_topic`` and speaks
    each response aloud through a local Piper TTS voice. Listening is
    paused while speaking so the node does not transcribe itself.
    """

    # Default ROS names; each is overridable via a parameter of the same name.
    user_query_topic = "/user_query"
    llm_response_topic = "/llm_response"
    input_audio_service = "/input_audio_service"  # service to request enabling of audio input for commands
    output_audio_service = "/output_audio_service"  # service to request enabling of audio output for responses

    # Playback stream settings; the 22050 Hz rate must match the Piper
    # "medium" models' output sample rate.
    format = pyaudio.paInt16
    rate = 22050
    channels = 1

    voice_gender = "female"  # "female" or "male"; selects which Piper model to load

    can_listen = True  # if False, microphone audio is captured but never transcribed/published
    can_talk = True  # if False, LLM responses are received but not spoken out loud

    def __init__(self):
        super().__init__("VoiceInOut_Node")

        self.llm_response_sub = None
        self.user_query_pub = None

        # Handle returned by listen_in_background(); calling it stops the
        # background listener thread.
        self.stop_listening = None

        # Speech-to-text recognizer; the microphone is (re)created in
        # start_listening().
        self.stt_recognizer = sr.Recognizer()
        self.stt_mic = None

        # Read parameters FIRST so that voice_gender / can_listen / can_talk
        # overrides take effect before the model is loaded and before any
        # listening or speaking starts. (Previously parameters were read at
        # the end of __init__, making the voice_gender parameter a no-op.)
        self._init_parameters()

        self.start_listening()

        # Text-to-speech: pick the Piper model matching the configured gender.
        self.model_names = {
            "female": "en_US-amy-medium.onnx",
            "male": "en_US-kusal-medium.onnx",
        }

        current_file_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(
            current_file_dir,
            "models",
            self.voice_gender,
            self.model_names[self.voice_gender],
        )

        self.tts_voice = PiperVoice.load(model_path)

        self.syn_config = SynthesisConfig(
            volume=1.0,  # full volume
            length_scale=1.0,  # normal speed (>1.0 slower, <1.0 faster)
            noise_scale=1.0,  # amount of audio variation
            noise_w_scale=1.0,  # amount of speaking (duration) variation
            normalize_audio=True,  # normalize output audio levels
        )

        # Open the playback stream used by speak().
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=self.format, channels=self.channels, rate=self.rate, output=True
        )

        # Announce initialization of speech
        self.speak("Voice input output node initialized.")
        self.speak("Listening for your commands.")

        self._init_publishers()
        self._init_subscriptions()

    ########################################## Initialization Methods ############################################################################

    def _init_parameters(self) -> None:
        """Declare ROS parameters and overwrite the class defaults with their values."""
        self.declare_parameter("user_query_topic", self.user_query_topic)
        self.declare_parameter("llm_response_topic", self.llm_response_topic)

        self.declare_parameter("input_audio_service", self.input_audio_service)
        self.declare_parameter("output_audio_service", self.output_audio_service)

        self.declare_parameter("voice_gender", self.voice_gender)
        self.declare_parameter("can_listen", self.can_listen)
        self.declare_parameter("can_talk", self.can_talk)

        self.user_query_topic = (
            self.get_parameter("user_query_topic").get_parameter_value().string_value
        )

        self.llm_response_topic = (
            self.get_parameter("llm_response_topic").get_parameter_value().string_value
        )

        self.input_audio_service = (
            self.get_parameter("input_audio_service").get_parameter_value().string_value
        )
        self.output_audio_service = (
            self.get_parameter("output_audio_service")
            .get_parameter_value()
            .string_value
        )

        self.voice_gender = (
            self.get_parameter("voice_gender").get_parameter_value().string_value
        )

        self.can_listen = (
            self.get_parameter("can_listen").get_parameter_value().bool_value
        )
        self.can_talk = self.get_parameter("can_talk").get_parameter_value().bool_value

    def _init_publishers(self) -> None:
        """Create the publisher for transcribed user queries."""
        self.user_query_pub = self.create_publisher(String, self.user_query_topic, 10)

    def _init_subscriptions(self) -> None:
        """Create the subscription that receives LLM responses to speak."""
        self.llm_response_sub = self.create_subscription(
            String, self.llm_response_topic, self.llm_response_callback, 10
        )

    def start_listening(self) -> None:
        """Start background microphone capture, calibrating for ambient noise first."""
        self.get_logger().debug(
            f"Started listening for user query at {self.get_clock().now()}"
        )
        self.stt_mic = sr.Microphone()
        with self.stt_mic as source:
            self.stt_recognizer.adjust_for_ambient_noise(source)
        # listen_in_background() returns a callable that stops the listener.
        self.stop_listening = self.stt_recognizer.listen_in_background(
            self.stt_mic, self.publish_audio_as_text
        )

    def pause_listening(self) -> None:
        """Pause microphone capture; used while text is being spoken out loud."""
        if self.stop_listening is not None:
            self.stop_listening(wait_for_stop=False)
            self.get_logger().debug(
                f"Stopped listening for user query at {self.get_clock().now()}."
            )

    def publish_audio_as_text(self, recognizer, audio_input):
        """Background-listener callback: transcribe *audio_input* and publish it.

        Runs on the speech_recognition listener thread. Does nothing when
        ``can_listen`` is False or when Whisper returns an empty transcript.
        """
        if not self.can_listen:
            return
        try:
            user_query = recognizer.recognize_whisper(
                audio_input, language="english"
            )
            if user_query.strip() == "":
                self.get_logger().debug("No speech detected.")
            else:
                user_query_msg = String()
                user_query_msg.data = user_query
                self.user_query_pub.publish(user_query_msg)
                self.get_logger().debug(
                    f"Published user query: {user_query_msg.data}"
                )

        except sr.UnknownValueError:
            # BUGFIX: get_logger is a method and must be called; the previous
            # `self.get_logger.error(...)` raised AttributeError here.
            self.get_logger().error("Whisper could not understand audio")
        except sr.RequestError as e:
            self.get_logger().error(f"Could not request results from Whisper; {e}")

    def speak(self, text: str) -> None:
        """Synthesize *text* with Piper and play it, pausing listening meanwhile.

        Listening is resumed in a ``finally`` block so a TTS/playback failure
        can no longer leave the microphone permanently paused.
        """
        if not self.can_talk:
            self.get_logger().debug("can_talk is disabled; not speaking out loud.")
            return
        try:
            self.pause_listening()
            self.get_logger().debug(f"Now speaking at {self.get_clock().now()}.")

            # Piper yields audio chunks; stream each straight to the output device.
            for chunk in self.tts_voice.synthesize(
                text, syn_config=self.syn_config
            ):
                self.stream.write(chunk.audio_int16_bytes)

            # Short tail so playback fully drains before the mic reopens.
            time.sleep(0.2)
            self.get_logger().debug(
                f"LLM response should have been spoken out loud at {self.get_clock().now()}."
            )
        except Exception as e:
            self.get_logger().error(f"Error during TTS: {e}")
        finally:
            # Always resume listening, even if synthesis or playback failed.
            self.start_listening()

    ########################################## Subscriber callback ############################################################################
    def llm_response_callback(self, msg):
        """Callback method to receive the LLM response and speak it out loud."""
        self.get_logger().debug(f"Received LLM response: {msg.data}")
        self.speak(str(msg.data))

    def destroy_node(self):
        """Stop the background listener, release audio resources, then destroy the node."""
        try:
            # Stop the background listener thread before tearing down audio.
            self.pause_listening()
            self.stream.stop_stream()
            self.stream.close()
            self.p.terminate()
        except Exception as e:
            self.get_logger().warn(f"Error closing audio resources: {e}")

        # Call base class destructor
        super().destroy_node()
0 commit comments