@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:

## Usage

- Let's first load the model!
+ ### Audio Instruct Mode
+
+ The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
+
+ ➡️ audio + text instruction
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
- ```
-
- ### Audio Instruct Mode
-
- The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
-
- ➡️ audio + text instruction
- ```python
conversation = [
    {
        "role": "user",
@@ -82,6 +79,15 @@ print("=" * 80)

➡️ multi-audio + text instruction
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -113,6 +119,15 @@ print("=" * 80)

➡️ multi-turn:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -158,6 +173,15 @@ print("=" * 80)

➡️ text only:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -184,6 +208,15 @@ print("=" * 80)

➡️ audio only:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -210,6 +243,15 @@ print("=" * 80)

➡️ batched inference!
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversations = [
    [
        {
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:

Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!

```python
- inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+ inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
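
# Not shown in this hunk: turning the generated ids back into text. A minimal
# sketch, assuming the standard transformers pattern of slicing off the prompt
# tokens before decoding (the doc's later `decoded_outputs` loop suggests this):
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs[0])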