huggingface · blakkd · Jul 25, 2025
diff --git a/docs/source/en/model_doc/voxtral.md b/docs/source/en/model_doc/voxtral.md
@@ -262,7 +262,7 @@ for decoded_output in decoded_outputs:
 Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
 
 ```python
-inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
+inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
 inputs = inputs.to(device, dtype=torch.bfloat16)
 
 outputs = model.generate(**inputs, max_new_tokens=500)

diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py
@@ -242,7 +242,7 @@ def __call__(
         the text. Please refer to the docstring of the above methods for more information.
-        This methods does not support audio. To prepare the audio, please use:
+        This method does not support audio. To prepare the audio, please use:
         1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
-        2. `apply_transcrition_request` [`~VoxtralProcessor.apply_transcrition_request`] method.
+        2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method.
 
         Args:
             text (`str`, `list[str]`, `list[list[str]]`):
@@ -284,7 +284,7 @@ def __call__(
         return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None))
 
     # TODO: @eustlb, this should be moved to mistral_common + testing
-    def apply_transcrition_request(
+    def apply_transcription_request(
         self,
         language: Union[str, list[str]],
         audio: Union[str, list[str], AudioInput],
@@ -306,7 +306,7 @@ def apply_transcrition_request(
         language = "en"
         audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"
 
-        inputs = processor.apply_transcrition_request(language=language, audio=audio, model_id=model_id)
+        inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id)
         ```
 
         Args:

diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py
@@ -492,7 +492,7 @@ def test_transcribe_mode_audio_input(self):
         model = VoxtralForConditionalGeneration.from_pretrained(
             self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device
         )
-        inputs = self.processor.apply_transcrition_request(
+        inputs = self.processor.apply_transcription_request(
             language="en",
             audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
             model_id=self.checkpoint_name,