fix(voxtral): correct typo in apply_transcription_request (#39572)

* fix(voxtral): correct typo in apply_transcription_request

* temporary wrapper: apply_transcrition_request

* Update processing_voxtral.py

* style: sort imports in processing_voxtral.py

* docs(voxtral): fix typo in voxtral.md

* make style

* doc update

---------

Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com>
Co-authored-by: Eustache Le Bihan <eulebihan@gmail.com>
This commit is contained in:
revanth
2025-07-25 17:39:44 +05:30
committed by GitHub
parent 2a82cf06ad
commit 3b3f9c0c46
3 changed files with 76 additions and 13 deletions

View File

@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:
## Usage ## Usage
Let's first load the model! ### Audio Instruct Mode
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
➡️ audio + text instruction
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch import torch
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id) processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
```
### Audio Instruct Mode
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
➡️ audio + text instruction
```python
conversation = [ conversation = [
{ {
"role": "user", "role": "user",
@@ -82,6 +79,15 @@ print("=" * 80)
➡️ multi-audio + text instruction ➡️ multi-audio + text instruction
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
conversation = [ conversation = [
{ {
"role": "user", "role": "user",
@@ -113,6 +119,15 @@ print("=" * 80)
➡️ multi-turn: ➡️ multi-turn:
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
conversation = [ conversation = [
{ {
"role": "user", "role": "user",
@@ -158,6 +173,15 @@ print("=" * 80)
➡️ text only: ➡️ text only:
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
conversation = [ conversation = [
{ {
"role": "user", "role": "user",
@@ -184,6 +208,15 @@ print("=" * 80)
➡️ audio only: ➡️ audio only:
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
conversation = [ conversation = [
{ {
"role": "user", "role": "user",
@@ -210,6 +243,15 @@ print("=" * 80)
➡️ batched inference! ➡️ batched inference!
```python ```python
from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
conversations = [ conversations = [
[ [
{ {
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:
Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)! Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
```python ```python
inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3") from transformers import VoxtralForConditionalGeneration, AutoProcessor
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
repo_id = "mistralai/Voxtral-Mini-3B-2507"
processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
inputs = inputs.to(device, dtype=torch.bfloat16) inputs = inputs.to(device, dtype=torch.bfloat16)
outputs = model.generate(**inputs, max_new_tokens=500) outputs = model.generate(**inputs, max_new_tokens=500)

View File

@@ -14,6 +14,7 @@
# limitations under the License. # limitations under the License.
import io import io
import warnings
from typing import Optional, Union from typing import Optional, Union
from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging
@@ -242,7 +243,7 @@ class VoxtralProcessor(ProcessorMixin):
the text. Please refer to the docstring of the above methods for more information. the text. Please refer to the docstring of the above methods for more information.
This method does not support audio. To prepare the audio, please use: This method does not support audio. To prepare the audio, please use:
1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method. 1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
2. `apply_transcrition_request` [`~VoxtralProcessor.apply_transcrition_request`] method. 2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method.
Args: Args:
text (`str`, `list[str]`, `list[list[str]]`): text (`str`, `list[str]`, `list[list[str]]`):
@@ -284,7 +285,7 @@ class VoxtralProcessor(ProcessorMixin):
return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None)) return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None))
# TODO: @eustlb, this should be moved to mistral_common + testing # TODO: @eustlb, this should be moved to mistral_common + testing
def apply_transcrition_request( def apply_transcription_request(
self, self,
language: Union[str, list[str]], language: Union[str, list[str]],
audio: Union[str, list[str], AudioInput], audio: Union[str, list[str], AudioInput],
@@ -306,7 +307,7 @@ class VoxtralProcessor(ProcessorMixin):
language = "en" language = "en"
audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3" audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"
inputs = processor.apply_transcrition_request(language=language, audio=audio, model_id=model_id) inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id)
``` ```
Args: Args:
@@ -431,6 +432,17 @@ class VoxtralProcessor(ProcessorMixin):
return texts return texts
# Deprecated typo'd method for backward compatibility
def apply_transcrition_request(self, *args, **kwargs):
    """
    Deprecated, misspelled alias of `apply_transcription_request`.

    Emits a `FutureWarning` and then forwards every positional and keyword argument, unchanged, to
    `self.apply_transcription_request`.

    Args:
        *args: Positional arguments passed through to `apply_transcription_request`.
        **kwargs: Keyword arguments passed through to `apply_transcription_request`.

    Returns:
        Whatever `self.apply_transcription_request(*args, **kwargs)` returns.
    """
    warnings.warn(
        "`apply_transcrition_request` is deprecated due to a typo and will be removed in a future release. Please use `apply_transcription_request` instead.",
        FutureWarning,
        # Attribute the warning to the caller's line, not to this wrapper, so users can locate
        # the code that still uses the misspelled name.
        stacklevel=2,
    )
    return self.apply_transcription_request(*args, **kwargs)
def batch_decode(self, *args, **kwargs): def batch_decode(self, *args, **kwargs):
""" """
This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please

View File

@@ -493,7 +493,7 @@ class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase):
model = VoxtralForConditionalGeneration.from_pretrained( model = VoxtralForConditionalGeneration.from_pretrained(
self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device
) )
inputs = self.processor.apply_transcrition_request( inputs = self.processor.apply_transcription_request(
language="en", language="en",
audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
model_id=self.checkpoint_name, model_id=self.checkpoint_name,