@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:

## Usage

- Let's first load the model!
+ ### Audio Instruct Mode
+
+ The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
+
+ ➡️ audio + text instruction
```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
- ```
-
- ### Audio Instruct Mode
-
- The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
-
- ➡️ audio + text instruction
- ```python
conversation = [
    {
        "role": "user",
@@ -82,6 +79,15 @@ print("=" * 80)

➡️ multi-audio + text instruction
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -113,6 +119,15 @@ print("=" * 80)

➡️ multi-turn:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -158,6 +173,15 @@ print("=" * 80)

➡️ text only:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -184,6 +208,15 @@ print("=" * 80)

➡️ audio only:
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversation = [
    {
        "role": "user",
@@ -210,6 +243,15 @@ print("=" * 80)

➡️ batched inference!
```python
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
conversations = [
    [
        {
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:

Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!

```python
- inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
+ from transformers import VoxtralForConditionalGeneration, AutoProcessor
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+ processor = AutoProcessor.from_pretrained(repo_id)
+ model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+ inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
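
# Not shown in this hunk: turning the generated ids back into text. A minimal
# sketch, assuming the standard transformers pattern of slicing off the prompt
# tokens before decoding (the doc's later `decoded_outputs` loop suggests this):
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(decoded_outputs[0])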