fix(voxtral): correct typo in apply_transcription_request (#39572)
* fix(voxtral): correct typo in apply_transcription_request * temporary wrapper: apply_transcrition_request * Update processing_voxtral.py * style: sort imports in processing_voxtral.py * docs(voxtral): fix typo in voxtral.md * make style * doc update --------- Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com> Co-authored-by: Eustache Le Bihan <eulebihan@gmail.com>
This commit is contained in:
@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:
|
||||
|
||||
## Usage
|
||||
|
||||
Let's first load the model!
|
||||
### Audio Instruct Mode
|
||||
|
||||
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
|
||||
|
||||
➡️ audio + text instruction
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
```
|
||||
|
||||
### Audio Instruct Mode
|
||||
|
||||
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
|
||||
|
||||
➡️ audio + text instruction
|
||||
```python
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -82,6 +79,15 @@ print("=" * 80)
|
||||
|
||||
➡️ multi-audio + text instruction
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -113,6 +119,15 @@ print("=" * 80)
|
||||
|
||||
➡️ multi-turn:
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -158,6 +173,15 @@ print("=" * 80)
|
||||
|
||||
➡️ text only:
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -184,6 +208,15 @@ print("=" * 80)
|
||||
|
||||
➡️ audio only:
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -210,6 +243,15 @@ print("=" * 80)
|
||||
|
||||
➡️ batched inference!
|
||||
```python
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
conversations = [
|
||||
[
|
||||
{
|
||||
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:
|
||||
Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
|
||||
|
||||
```python
|
||||
inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
|
||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||
import torch
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||
|
||||
processor = AutoProcessor.from_pretrained(repo_id)
|
||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||
|
||||
inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
|
||||
inputs = inputs.to(device, dtype=torch.bfloat16)
|
||||
|
||||
outputs = model.generate(**inputs, max_new_tokens=500)
|
||||
|
||||
Reference in New Issue
Block a user