diff --git a/README.md b/README.md
index 3c4a5057..72c9b1c0 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,9 @@ chat.ask "What's the best way to learn Ruby?"
 # Analyze images
 chat.ask "What's in this image?", with: { image: "ruby_conf.jpg" }
 
+# Transcribe audio files
+RubyLLM.transcribe "interview.wav"
+
 # Analyze audio recordings
 chat.ask "Describe this meeting", with: { audio: "meeting.wav" }
 
diff --git a/docs/guides/audio-transcription.md b/docs/guides/audio-transcription.md
new file mode 100644
index 00000000..1dd6fead
--- /dev/null
+++ b/docs/guides/audio-transcription.md
@@ -0,0 +1,97 @@
+---
+layout: default
+title: Audio Transcription
+parent: Guides
+nav_order: 8
+permalink: /guides/audio-transcription
+---
+
+# Audio Transcription
+
+RubyLLM makes it easy to transcribe audio content using AI models. This guide shows how to convert speech to text.
+
+## Basic Transcription
+
+The simplest way to convert speech to text is the global `transcribe` method, which takes the path to a local audio file:
+
+```ruby
+# Transcribe an audio file
+text = RubyLLM.transcribe("meeting.wav")
+
+# Print the transcribed text
+puts text
+```
+
+This method automatically uses the default transcription model (`whisper-1`) to convert the audio file to text.
+
+## Specifying a Language
+
+If you know the language spoken in the audio, you can provide a hint to improve transcription accuracy:
+
+```ruby
+# Transcribe Spanish audio (ISO 639-1 language code)
+spanish_text = RubyLLM.transcribe("entrevista.mp3", language: "es")
+```
+
+## Choosing Models
+
+You can specify which model to use for transcription:
+
+```ruby
+# Use a specific model
+text = RubyLLM.transcribe(
+  "interview.mp3",
+  model: "whisper-1"
+)
+```
+
+You can also configure the default transcription model globally:
+
+```ruby
+RubyLLM.configure do |config|
+  config.default_transcription_model = "whisper-1"
+end
+```
+
+## Working with Large Files
+
+Longer audio files can run into request timeouts. You can set a longer global timeout in your application configuration:
+
+```ruby
+RubyLLM.configure do |config|
+  # Set a longer timeout for large files (in seconds)
+  config.request_timeout = 300
+end
+```
+
+RubyLLM doesn't currently support per-request timeout configuration. For very large files, increase the global timeout or break the audio into smaller segments.
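+
+## Supported Audio Formats
+
+RubyLLM infers the MIME type of the upload from the file extension. The detection table in `RubyLLM::Content` recognizes the audio extensions `mp3`, `mpga`, `mpeg`, `m4a`, `mp4`, `wav`, `ogg`, and `webm`; anything else falls back to `application/<extension>`:
+
+```ruby
+# MIME type detection used when building the audio upload
+RubyLLM::Content.mime_type_for("meeting.wav") # => "audio/wav"
+RubyLLM::Content.mime_type_for("podcast.mp3") # => "audio/mpeg"
+```
+
+## Handling Errors
+
+Transcription requests can fail like any other API call (unreadable files, rate limits, network issues). Here is a minimal sketch of defensive usage, assuming the `RubyLLM::Error` classes described in the [Error Handling]({% link guides/error-handling.md %}) guide also apply to transcription calls:
+
+```ruby
+begin
+  text = RubyLLM.transcribe("meeting.wav")
+  puts text
+rescue RubyLLM::Error => e # assumes the gem's base error class
+  warn "Transcription failed: #{e.message}"
+end
+```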
+
+## Next Steps
+
+Now that you understand audio transcription, you might want to explore:
+
+- [Error Handling]({% link guides/error-handling.md %}) for robust applications
+- [Tools]({% link guides/tools.md %}) to extend AI capabilities
\ No newline at end of file
diff --git a/docs/guides/getting-started.md b/docs/guides/getting-started.md
index da411077..5b45d93f 100644
--- a/docs/guides/getting-started.md
+++ b/docs/guides/getting-started.md
@@ -167,4 +167,6 @@ Now that you've got the basics down, you're ready to explore more advanced featu
 
 - [Chatting with AI]({% link guides/chat.md %}) - Learn more about chat capabilities
 - [Using Tools]({% link guides/tools.md %}) - Let AI use your Ruby code
-- [Rails Integration]({% link guides/rails.md %}) - Persist chats in your Rails apps
\ No newline at end of file
+- [Rails Integration]({% link guides/rails.md %}) - Persist chats in your Rails apps
+- [Available Models]({% link guides/available-models.md %}) - Complete list of supported models and their capabilities
+- [Audio Transcription]({% link guides/audio-transcription.md %}) - Convert speech to text
\ No newline at end of file
diff --git a/lib/ruby_llm.rb b/lib/ruby_llm.rb
index 70aa4a8c..d08a9c40 100644
--- a/lib/ruby_llm.rb
+++ b/lib/ruby_llm.rb
@@ -4,6 +4,7 @@
 require 'event_stream_parser'
 require 'faraday'
 require 'faraday/retry'
+require 'faraday/multipart'
 require 'json'
 require 'logger'
 require 'securerandom'
@@ -42,6 +43,14 @@ def paint(...)
     Image.paint(...)
   end
 
+  def transcribe(audio_file, model: nil, language: nil)
+    model_id = model || RubyLLM.config.default_transcription_model
+    Models.find(model_id) # Validates that the model exists
+
+    provider = Provider.for(model_id)
+    provider.transcribe(audio_file, model: model_id, language: language)
+  end
+
   def models
     Models.instance
   end
diff --git a/lib/ruby_llm/configuration.rb b/lib/ruby_llm/configuration.rb
index c6741450..f308dc32 100644
--- a/lib/ruby_llm/configuration.rb
+++ b/lib/ruby_llm/configuration.rb
@@ -23,6 +23,7 @@ class Configuration
                   :default_model,
                   :default_embedding_model,
                   :default_image_model,
+                  :default_transcription_model,
                   # Connection configuration
                   :request_timeout,
                   :max_retries,
@@ -42,6 +43,7 @@ def initialize
       @default_model = 'gpt-4o-mini'
       @default_embedding_model = 'text-embedding-3-small'
       @default_image_model = 'dall-e-3'
+      @default_transcription_model = 'whisper-1'
     end
   end
 end
diff --git a/lib/ruby_llm/content.rb b/lib/ruby_llm/content.rb
index ac326ffd..7d566056 100644
--- a/lib/ruby_llm/content.rb
+++ b/lib/ruby_llm/content.rb
@@ -34,6 +34,35 @@ def format
       to_a
     end
 
+    # Determine the MIME type based on the file extension
+    def self.mime_type_for(path) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
+      ext = File.extname(path).delete('.').downcase
+
+      case ext
+      when 'jpeg', 'jpg'
+        'image/jpeg'
+      when 'png'
+        'image/png'
+      when 'gif'
+        'image/gif'
+      when 'webp'
+        'image/webp'
+      when 'mpga', 'mp3', 'mpeg'
+        'audio/mpeg'
+      when 'm4a', 'mp4'
+        'audio/mp4'
+      when 'wav'
+        'audio/wav'
+      when 'ogg'
+        'audio/ogg'
+      when 'webm'
+        'audio/webm'
+      else
+        # Default to the extension as the subtype
+        "application/#{ext}"
+      end
+    end
+
     private
 
     def attach_image(source) # rubocop:disable Metrics/MethodLength
@@ -97,8 +126,7 @@ def encode_file(source)
     end
 
     def mime_type_for(path)
-      ext = File.extname(path).delete('.')
-      "image/#{ext}"
+      self.class.mime_type_for(path)
     end
   end
 end
diff --git a/lib/ruby_llm/model_info.rb b/lib/ruby_llm/model_info.rb
index 31b2e8b1..8a36af40 100644
--- a/lib/ruby_llm/model_info.rb
+++ b/lib/ruby_llm/model_info.rb
@@ -15,7 +15,8 @@ module RubyLLM
   class ModelInfo
     attr_reader :id, :created_at, :display_name, :provider, :metadata,
                 :context_window, :max_tokens, :supports_vision, :supports_functions,
-                :supports_json_mode, :input_price_per_million, :output_price_per_million, :type, :family
+                :supports_json_mode, :input_price_per_million, :output_price_per_million,
+                :type, :family
 
     def initialize(data) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
       @id = data[:id]
diff --git a/lib/ruby_llm/provider.rb b/lib/ruby_llm/provider.rb
index 5540e907..b1b7e7be 100644
--- a/lib/ruby_llm/provider.rb
+++ b/lib/ruby_llm/provider.rb
@@ -51,6 +51,13 @@ def paint(prompt, model:, size:)
       parse_image_response(response)
     end
 
+    def transcribe(audio_file, model:, language: nil)
+      payload = render_transcription_payload(audio_file, model:, language:)
+
+      response = post_multipart(transcription_url, payload)
+      parse_transcription_response(response)
+    end
+
     def configured?
       missing_configs.empty?
     end
diff --git a/lib/ruby_llm/providers/openai.rb b/lib/ruby_llm/providers/openai.rb
index 7dd602f2..f2ebd754 100644
--- a/lib/ruby_llm/providers/openai.rb
+++ b/lib/ruby_llm/providers/openai.rb
@@ -14,6 +14,7 @@ module OpenAI
       extend OpenAI::Tools
       extend OpenAI::Images
       extend OpenAI::Media
+      extend OpenAI::Transcription
 
       def self.extended(base)
         base.extend(Provider)
@@ -24,6 +25,7 @@ def self.extended(base)
         base.extend(OpenAI::Tools)
         base.extend(OpenAI::Images)
         base.extend(OpenAI::Media)
+        base.extend(OpenAI::Transcription)
       end
 
       module_function
diff --git a/lib/ruby_llm/providers/openai/transcription.rb b/lib/ruby_llm/providers/openai/transcription.rb
new file mode 100644
index 00000000..0b8debf6
--- /dev/null
+++ b/lib/ruby_llm/providers/openai/transcription.rb
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+module RubyLLM
+  module Providers
+    module OpenAI
+      # Handles audio transcription functionality for the OpenAI API.
+      # Entry point is Provider#transcribe, which calls these helpers.
+      module Transcription
+        module_function
+
+        def transcription_url
+          "#{api_base}/audio/transcriptions"
+        end
+
+        def api_base
+          'https://api.openai.com/v1'
+        end
+
+        def headers
+          {
+            'Authorization' => "Bearer #{RubyLLM.config.openai_api_key}"
+          }
+        end
+
+        # Transcription uploads need multipart/form-data rather than JSON,
+        # so build a dedicated Faraday connection with multipart support
+        def post_multipart(url, payload)
+          connection = Faraday.new(url: api_base) do |f|
+            f.request :multipart
+            f.request :url_encoded
+            f.adapter Faraday.default_adapter
+          end
+
+          response = connection.post(url) do |req|
+            req.headers.merge!(headers)
+            req.body = payload
+          end
+
+          JSON.parse(response.body)
+        end
+
+        def render_transcription_payload(audio_file, model:, language: nil)
+          file_part = Faraday::Multipart::FilePart.new(audio_file, Content.mime_type_for(audio_file))
+
+          payload = {
+            model: model,
+            file: file_part
+          }
+
+          # Add the language hint only if provided
+          payload[:language] = language if language
+
+          payload
+        end
+
+        def parse_transcription_response(response)
+          response['text']
+        end
+      end
+    end
+  end
+end
diff --git a/spec/ruby_llm/ruby_llm_spec.rb b/spec/ruby_llm/ruby_llm_spec.rb
new file mode 100644
index 00000000..7444e7f6
--- /dev/null
+++ b/spec/ruby_llm/ruby_llm_spec.rb
@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe RubyLLM do
+  include_context 'with configured RubyLLM'
+
+  let(:audio_path) { File.expand_path('../fixtures/ruby.wav', __dir__) }
+  let(:default_model) { 'whisper-1' }
+
+  before do
+    allow(described_class.config).to receive(:default_transcription_model).and_return(default_model)
+    allow(described_class::Models).to receive(:find).with(default_model)
+    allow(described_class::Provider).to receive(:for).with(default_model).and_return(described_class::Providers::OpenAI)
+    allow(described_class::Providers::OpenAI).to receive(:transcribe)
+  end
+
+  describe '.transcribe' do
+    it 'uses the default model from config when no model is specified' do # rubocop:disable RSpec/MultipleExpectations
+      described_class.transcribe(audio_path)
+
+      expect(described_class::Provider).to have_received(:for).with(default_model)
+      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
+        audio_path, model: default_model, language: nil
+      )
+    end
+
+    it 'validates and uses a custom model when specified' do # rubocop:disable RSpec/MultipleExpectations, RSpec/ExampleLength
+      custom_model = 'whisper-large'
+      allow(described_class::Models).to receive(:find).with(custom_model)
+      allow(described_class::Provider).to receive(:for).with(custom_model)
+        .and_return(described_class::Providers::OpenAI)
+
+      described_class.transcribe(audio_path, model: custom_model)
+
+      expect(described_class::Models).to have_received(:find).with(custom_model)
+      expect(described_class::Provider).to have_received(:for).with(custom_model)
+      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
+        audio_path, model: custom_model, language: nil
+      )
+    end
+
+    it 'passes language parameter to the provider' do
+      language = 'en'
+
+      described_class.transcribe(audio_path, language: language)
+
+      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
+        audio_path, model: default_model, language: language
+      )
+    end
+  end
+end