69 changes: 60 additions & 9 deletions docs/source/en/model_doc/voxtral.md
@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:

## Usage

Let's first load the model!
### Audio Instruct Mode

The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.

➡️ audio + text instruction
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
```

### Audio Instruct Mode

The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.

➡️ audio + text instruction
```python
conversation = [
{
"role": "user",
@@ -82,6 +79,15 @@ print("=" * 80)

➡️ multi-audio + text instruction
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
{
"role": "user",
@@ -113,6 +119,15 @@ print("=" * 80)

➡️ multi-turn:
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
{
"role": "user",
@@ -158,6 +173,15 @@ print("=" * 80)

➡️ text only:
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
{
"role": "user",
@@ -184,6 +208,15 @@ print("=" * 80)

➡️ audio only:
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
{
"role": "user",
@@ -210,6 +243,15 @@ print("=" * 80)

➡️ batched inference!
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

conversations = [
[
{
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:
Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!

```python
inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
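For reference, a minimal sketch of the decoding step that follows generation in the transcription snippet above (assuming `model`, `processor`, `inputs`, and `outputs` as defined there; the prompt tokens are sliced off before decoding):

```python
# Decode only the newly generated tokens, skipping the prompt prefix.
decoded_outputs = processor.batch_decode(
    outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)
print(decoded_outputs[0])
```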
18 changes: 15 additions & 3 deletions src/transformers/models/voxtral/processing_voxtral.py
@@ -14,6 +14,7 @@
# limitations under the License.

import io
import warnings
from typing import Optional, Union

from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging
@@ -242,7 +243,7 @@ def __call__(
the text. Please refer to the docstring of the above methods for more information.
This method does not support audio. To prepare the audio, please use:
1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
2. `apply_transcrition_request` [`~VoxtralProcessor.apply_transcrition_request`] method.
2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method.

Args:
text (`str`, `list[str]`, `list[list[str]]`):
@@ -284,7 +285,7 @@
return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None))

# TODO: @eustlb, this should be moved to mistral_common + testing
def apply_transcrition_request(
def apply_transcription_request(
self,
language: Union[str, list[str]],
audio: Union[str, list[str], AudioInput],
@@ -306,7 +307,7 @@ def apply_transcrition_request(
language = "en"
audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"

inputs = processor.apply_transcrition_request(language=language, audio=audio, model_id=model_id)
inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id)
```

Args:
@@ -431,6 +432,17 @@ def apply_transcrition_request(

return texts

# Deprecated typo'd method for backward compatibility
def apply_transcrition_request(self, *args, **kwargs):
"""
Deprecated typo'd method. Use `apply_transcription_request` instead.
"""
warnings.warn(
"`apply_transcrition_request` is deprecated due to a typo and will be removed in a future release. Please use `apply_transcription_request` instead.",
FutureWarning,
)
return self.apply_transcription_request(*args, **kwargs)

def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please
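A minimal sketch exercising the backward-compatibility shim above (assuming a loaded `processor`; `repo_id` and `audio_url` are hypothetical placeholders), confirming the old name still delegates correctly and emits a `FutureWarning`:

```python
import warnings

# Assumed setup: processor is a loaded VoxtralProcessor;
# repo_id and audio_url are hypothetical placeholders.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    inputs = processor.apply_transcrition_request(
        language="en", audio=audio_url, model_id=repo_id
    )

# The deprecated alias forwards to apply_transcription_request
# and should raise a FutureWarning.
assert any(issubclass(w.category, FutureWarning) for w in caught)
```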
2 changes: 1 addition & 1 deletion tests/models/voxtral/test_modeling_voxtral.py
@@ -493,7 +493,7 @@ def test_transcribe_mode_audio_input(self):
model = VoxtralForConditionalGeneration.from_pretrained(
self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device
)
inputs = self.processor.apply_transcrition_request(
inputs = self.processor.apply_transcription_request(
language="en",
audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
model_id=self.checkpoint_name,