3 changes: 3 additions & 0 deletions README.md
@@ -49,6 +49,9 @@ chat.ask "What's the best way to learn Ruby?"
# Analyze images
chat.ask "What's in this image?", with: { image: "ruby_conf.jpg" }

# Transcribe audio files
RubyLLM.transcribe "interview.wav"

Owner comment on lines +52 to +54: This is in a very awkward spot.

# Analyze audio recordings
chat.ask "Describe this meeting", with: { audio: "meeting.wav" }

74 changes: 74 additions & 0 deletions docs/guides/audio-transcription.md
@@ -0,0 +1,74 @@
---
layout: default
title: Audio Transcription
parent: Guides
nav_order: 8
permalink: /guides/audio-transcription
---

# Audio Transcription

RubyLLM makes it easy to transcribe audio content using AI models. This guide shows how to convert speech to text with transcription models.

## Basic Transcription

The simplest way to convert speech to text is the global `transcribe` method, which takes a local audio file path:

```ruby
# Transcribe an audio file
text = RubyLLM.transcribe("meeting.wav")

# Print the transcribed text
puts text
```

This method automatically uses the default transcription model (`whisper-1`) to convert the audio file to text.

## Specifying a Language

If you know the language in the audio, you can provide a hint to improve transcription accuracy:

```ruby
# Transcribe Spanish audio (language hint as an ISO-639-1 code)
spanish_text = RubyLLM.transcribe("entrevista.mp3", language: "es")
```

## Choosing Models

You can specify which model to use for transcription:

```ruby
# Use a specific model
text = RubyLLM.transcribe(
  "interview.mp3",
  model: "whisper-1"
)
```

You can configure the default transcription model globally:

```ruby
RubyLLM.configure do |config|
  config.default_transcription_model = "whisper-1"
end
```

## Working with Large Files

For longer audio files, be aware of potential timeout issues. You can set a global timeout in your application configuration:

```ruby
RubyLLM.configure do |config|
  # Set a longer timeout for large files (in seconds)
  config.request_timeout = 300
end
```

Currently, RubyLLM doesn't support per-request timeout configuration. For handling very large files, you may need to increase the global timeout or consider breaking up the audio into smaller segments.
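As a sketch of the segmentation approach, the `chunk_file` helper below is illustrative only and not part of RubyLLM. Note that cutting compressed audio at arbitrary byte offsets does not produce valid playable segments, so a real pipeline would split on silence or at frame boundaries with a tool such as ffmpeg:

```ruby
require "tempfile"

# Split a file into roughly equal-sized byte chunks and return their paths.
# Illustrative only: real audio should be split at frame boundaries or on
# silence (e.g. with ffmpeg), not at arbitrary byte offsets.
def chunk_file(path, chunk_bytes)
  chunks = []
  File.open(path, "rb") do |f|
    while (data = f.read(chunk_bytes))
      part = Tempfile.new(["segment", File.extname(path)])
      part.binmode
      part.write(data)
      part.close
      chunks << part.path
    end
  end
  chunks
end

# Each segment could then be transcribed separately and joined:
#   transcripts = chunk_file("long_interview.wav", 20 * 1024 * 1024)
#                   .map { |segment| RubyLLM.transcribe(segment) }
#   full_text  = transcripts.join(" ")
```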

## Next Steps

Now that you understand audio transcription, you might want to explore:

- [Error Handling]({% link guides/error-handling.md %}) for robust applications
- [Tools]({% link guides/tools.md %}) to extend AI capabilities
4 changes: 3 additions & 1 deletion docs/guides/getting-started.md
@@ -167,4 +167,6 @@ Now that you've got the basics down, you're ready to explore more advanced features

- [Chatting with AI]({% link guides/chat.md %}) - Learn more about chat capabilities
- [Using Tools]({% link guides/tools.md %}) - Let AI use your Ruby code
- [Rails Integration]({% link guides/rails.md %}) - Persist chats in your Rails apps
- [Available Models]({% link guides/available-models.md %}) - Complete list of supported models and their capabilities
- [Audio Transcription]({% link guides/audio-transcription.md %}) - Convert speech to text
9 changes: 9 additions & 0 deletions lib/ruby_llm.rb
@@ -4,6 +4,7 @@
require 'event_stream_parser'
require 'faraday'
require 'faraday/retry'
require 'faraday/multipart'
require 'json'
require 'logger'
require 'securerandom'
@@ -42,6 +43,14 @@ def paint(...)
Image.paint(...)
end

def transcribe(audio_file, model: nil, language: nil)
model_id = model || RubyLLM.config.default_transcription_model
Models.find(model_id) # Validates model exists

provider = Provider.for(model_id)
provider.transcribe(audio_file, model: model_id, language: language)
end

def models
Models.instance
end
2 changes: 2 additions & 0 deletions lib/ruby_llm/configuration.rb
@@ -23,6 +23,7 @@ class Configuration
:default_model,
:default_embedding_model,
:default_image_model,
:default_transcription_model,
# Connection configuration
:request_timeout,
:max_retries,
@@ -42,6 +43,7 @@ def initialize
@default_model = 'gpt-4o-mini'
@default_embedding_model = 'text-embedding-3-small'
@default_image_model = 'dall-e-3'
@default_transcription_model = 'whisper-1'
end
end
end
32 changes: 30 additions & 2 deletions lib/ruby_llm/content.rb
@@ -34,6 +34,35 @@ def format
to_a
end

# Determine the MIME type based on file extension
def self.mime_type_for(path) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength
ext = File.extname(path).delete('.').downcase

case ext
when 'jpeg', 'jpg'
'image/jpeg'
when 'png'
'image/png'
when 'gif'
'image/gif'
when 'webp'
'image/webp'
when 'mpga', 'mp3', 'mpeg'
'audio/mpeg'
when 'm4a', 'mp4'
'audio/mp4'
when 'wav'
'audio/wav'
when 'ogg'
'audio/ogg'
when 'webm'
'audio/webm'
else
# Default to the extension as the subtype
"application/#{ext}"
end
end
Owner comment on lines +37 to +64: We now have a MimeType module.
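The `MimeType` module the reviewer mentions isn't shown in this diff; a table-driven lookup along those lines might look like the following (the module name and API here are assumptions based on the comment, not the actual implementation):

```ruby
# Hypothetical table-driven replacement for the case statement above.
module MimeType
  TYPES = {
    'jpeg' => 'image/jpeg', 'jpg'  => 'image/jpeg',
    'png'  => 'image/png',  'gif'  => 'image/gif',
    'webp' => 'image/webp',
    'mpga' => 'audio/mpeg', 'mp3'  => 'audio/mpeg', 'mpeg' => 'audio/mpeg',
    'm4a'  => 'audio/mp4',  'mp4'  => 'audio/mp4',
    'wav'  => 'audio/wav',  'ogg'  => 'audio/ogg',
    'webm' => 'audio/webm'
  }.freeze

  def self.for(path)
    ext = File.extname(path).delete('.').downcase
    # Fall back to the extension as the subtype, as the current code does
    TYPES.fetch(ext, "application/#{ext}")
  end
end
```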


private

def attach_image(source) # rubocop:disable Metrics/MethodLength
@@ -97,8 +126,7 @@ def encode_file(source)
end

def mime_type_for(path)
- ext = File.extname(path).delete('.')
- "image/#{ext}"
+ self.class.mime_type_for(path)
end
end
end
3 changes: 2 additions & 1 deletion lib/ruby_llm/model_info.rb
@@ -15,7 +15,8 @@ module RubyLLM
class ModelInfo
attr_reader :id, :created_at, :display_name, :provider, :metadata,
:context_window, :max_tokens, :supports_vision, :supports_functions,
- :supports_json_mode, :input_price_per_million, :output_price_per_million, :type, :family
+ :supports_json_mode, :input_price_per_million, :output_price_per_million,
+ :type, :family
Owner comment on lines -18 to +19: Why did this change?


def initialize(data) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
@id = data[:id]
Expand Down
7 changes: 7 additions & 0 deletions lib/ruby_llm/provider.rb
@@ -51,6 +51,13 @@ def paint(prompt, model:, size:)
parse_image_response(response)
end

def transcribe(audio_file, model:, language: nil)
payload = render_transcription_payload(audio_file, model:, language:)

response = post(transcription_url, payload)
parse_transcription_response(response)
end

def configured?
missing_configs.empty?
end
2 changes: 2 additions & 0 deletions lib/ruby_llm/providers/openai.rb
@@ -14,6 +14,7 @@ module OpenAI
extend OpenAI::Tools
extend OpenAI::Images
extend OpenAI::Media
extend OpenAI::Transcription

def self.extended(base)
base.extend(Provider)
@@ -24,6 +25,7 @@ def self.extended(base)
base.extend(OpenAI::Tools)
base.extend(OpenAI::Images)
base.extend(OpenAI::Media)
base.extend(OpenAI::Transcription)
end

module_function
74 changes: 74 additions & 0 deletions lib/ruby_llm/providers/openai/transcription.rb
@@ -0,0 +1,74 @@
# frozen_string_literal: true

module RubyLLM
  module Providers
    module OpenAI
      # Handles audio transcription functionality for the OpenAI API
      module Transcription
        # Helper methods as module_function
Owner comment: Remove this comment.


        module_function

        def self.extended(base)
          # module_function causes the 'transcribe' method to be private, but we need it to be public
          base.public_class_method :transcribe
        end
Owner comment on lines +12 to +15: No need for this; simply move your transcribe above module_function.
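The visibility issue behind this suggestion can be reproduced in isolation: `module_function` makes subsequently defined methods private instance methods (plus public module methods on the module itself), so an extending object exposes them only privately, while methods defined before the directive stay public. A standalone sketch (names are illustrative):

```ruby
module Demo
  # Defined BEFORE module_function: remains a public instance method,
  # so it is public on any object that extends Demo.
  def entry_point
    "public: #{helper}"
  end

  module_function

  # Defined AFTER module_function: becomes a private instance method
  # (and a public module method on Demo itself).
  def helper
    "helper result"
  end
end

class Host
  extend Demo
end

Host.entry_point # => "public: helper result"
Demo.helper      # => "helper result" (public module function)
# Host.helper would raise NoMethodError: private method called
```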


        def self.transcribe(audio_file, model: nil, language: nil)
          model ||= RubyLLM.config.default_transcription_model
          payload = render_transcription_payload(audio_file, model: model, language: language)

          response = post_multipart(transcription_url, payload)
          parse_transcription_response(response)
        end

        def transcription_url
          "#{api_base}/audio/transcriptions"
        end
Owner comment on lines +25 to +27: Please remove api_base from this method. It should only contain the path, not the host.
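Applied, that suggestion would look something like the following (a sketch of the reviewed change, not the merged code), with the host contributed by the provider's shared connection:

```ruby
module TranscriptionPaths
  # Path only; the provider-level connection supplies the host (api_base).
  def transcription_url
    'audio/transcriptions'
  end
end
```

The full URL would then be assembled where the request is made, for example by joining the provider's `api_base` with this path in the shared connection layer.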


        def api_base
          'https://api.openai.com/v1'
        end
Owner comment on lines +29 to +31: Please remove this. It's at the top level of the provider module.


        def headers
          {
            'Authorization' => "Bearer #{RubyLLM.config.openai_api_key}"
          }
        end
Owner comment on lines +33 to +37: Why is this here?


        def post_multipart(url, payload)
          connection = Faraday.new(url: api_base) do |f|
            f.request :multipart
            f.request :url_encoded
            f.adapter Faraday.default_adapter
          end

          response = connection.post(url) do |req|
            req.headers.merge!(headers)
            req.body = payload
          end

          JSON.parse(response.body)
        end
Owner comment on lines +39 to +52: This really should not be in the Transcription module of the OpenAI provider. This is a generic method that should go in RubyLLM::Connection.
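A sketch of that refactor follows. The `RubyLLM::Connection` name is taken from the comment and the signature is an assumption; the injected transport stands in for the Faraday connection so the example is self-contained:

```ruby
module RubyLLM
  module Connection
    # Generic multipart POST shared by all providers. In the real library
    # this would build a Faraday connection with the :multipart middleware;
    # here the transport is injected so the sketch runs standalone.
    def post_multipart(path, payload, transport:)
      transport.call("#{api_base}/#{path}", payload)
    end
  end
end

# A provider then only supplies its host and its payload rendering:
module FakeProvider
  extend RubyLLM::Connection

  def self.api_base
    'https://api.example.com/v1'
  end
end

echo = ->(url, payload) { { 'url' => url, 'model' => payload[:model] } }
FakeProvider.post_multipart('audio/transcriptions', { model: 'whisper-1' }, transport: echo)
```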


        def render_transcription_payload(audio_file, model:, language: nil)
          file_part = Faraday::Multipart::FilePart.new(audio_file, Content.mime_type_for(audio_file))

          payload = {
            model: model,
            file: file_part
          }

          # Add language if provided
          payload[:language] = language if language

          payload
        end

        def parse_transcription_response(response)
          response['text']
        end
      end
    end
  end
end
53 changes: 53 additions & 0 deletions spec/ruby_llm/ruby_llm_spec.rb
@@ -0,0 +1,53 @@
# frozen_string_literal: true

require 'spec_helper'

RSpec.describe RubyLLM do
  include_context 'with configured RubyLLM'

  let(:audio_path) { File.expand_path('../fixtures/ruby.wav', __dir__) }
  let(:default_model) { 'whisper-1' }

  before do
    allow(described_class.config).to receive(:default_transcription_model).and_return(default_model)
    allow(described_class::Models).to receive(:find).with(default_model)
    allow(described_class::Provider).to receive(:for).with(default_model).and_return(described_class::Providers::OpenAI)
    allow(described_class::Providers::OpenAI).to receive(:transcribe)
  end

  describe '.transcribe' do
    it 'uses the default model from config when no model is specified' do # rubocop:disable RSpec/MultipleExpectations
      described_class.transcribe(audio_path)

      expect(described_class::Provider).to have_received(:for).with(default_model)
      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
        audio_path, model: default_model, language: nil
      )
    end

    it 'validates and uses a custom model when specified' do # rubocop:disable RSpec/MultipleExpectations, RSpec/ExampleLength
      custom_model = 'whisper-large'
      allow(described_class::Models).to receive(:find).with(custom_model)
      allow(described_class::Provider).to receive(:for).with(custom_model)
        .and_return(described_class::Providers::OpenAI)

      described_class.transcribe(audio_path, model: custom_model)

      expect(described_class::Models).to have_received(:find).with(custom_model)
      expect(described_class::Provider).to have_received(:for).with(custom_model)
      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
        audio_path, model: custom_model, language: nil
      )
    end

    it 'passes language parameter to the provider' do
      language = 'en'

      described_class.transcribe(audio_path, language: language)

      expect(described_class::Providers::OpenAI).to have_received(:transcribe).with(
        audio_path, model: default_model, language: language
      )
    end
  end
end
end