From 3b3f9c0c46ea2dd1de519cc428e6f27dd2ef4b97 Mon Sep 17 00:00:00 2001 From: revanth <117919399+rev2607@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:39:44 +0530 Subject: [PATCH] fix(voxtral): correct typo in apply_transcription_request (#39572) * fix(voxtral): correct typo in apply_transcription_request * temporary wrapper: apply_transcrition_request * Update processing_voxtral.py * style: sort imports in processing_voxtral.py * docs(voxtral): fix typo in voxtral.md * make style * doc update --------- Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com> Co-authored-by: Eustache Le Bihan --- docs/source/en/model_doc/voxtral.md | 69 ++++++++++++++++--- .../models/voxtral/processing_voxtral.py | 18 ++++- tests/models/voxtral/test_modeling_voxtral.py | 2 +- 3 files changed, 76 insertions(+), 13 deletions(-) diff --git a/docs/source/en/model_doc/voxtral.md b/docs/source/en/model_doc/voxtral.md index 365c19b281..ad15631a96 100644 --- a/docs/source/en/model_doc/voxtral.md +++ b/docs/source/en/model_doc/voxtral.md @@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities: ## Usage -Let's first load the model! +### Audio Instruct Mode + +The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches. + +➡️ audio + text instruction ```python from transformers import VoxtralForConditionalGeneration, AutoProcessor import torch @@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507" processor = AutoProcessor.from_pretrained(repo_id) model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) -``` -### Audio Instruct Mode - -The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches. - -➡️ audio + text instruction -```python conversation = [ { "role": "user", @@ -82,6 +79,15 @@ print("=" * 80) ➡️ multi-audio + text instruction ```python +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + conversation = [ { "role": "user", @@ -113,6 +119,15 @@ print("=" * 80) ➡️ multi-turn: ```python +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + conversation = [ { "role": "user", @@ -158,6 +173,15 @@ print("=" * 80) ➡️ text only: ```python +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + conversation = [ { "role": "user", @@ -184,6 +208,15 @@ print("=" * 80) ➡️ audio only: ```python +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + conversation = [ { "role": "user", @@ -210,6 +243,15 @@ print("=" * 80) ➡️ batched inference! ```python +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + conversations = [ [ { @@ -262,7 +304,16 @@ for decoded_output in decoded_outputs: Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)! ```python -inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3") +from transformers import VoxtralForConditionalGeneration, AutoProcessor +import torch + +device = "cuda" if torch.cuda.is_available() else "cpu" +repo_id = "mistralai/Voxtral-Mini-3B-2507" + +processor = AutoProcessor.from_pretrained(repo_id) +model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device) + +inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id) inputs = inputs.to(device, dtype=torch.bfloat16) outputs = model.generate(**inputs, max_new_tokens=500) diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py index f684466874..598529bf5c 100644 --- a/src/transformers/models/voxtral/processing_voxtral.py +++ b/src/transformers/models/voxtral/processing_voxtral.py @@ -14,6 +14,7 @@ # limitations under the License. import io +import warnings from typing import Optional, Union from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging @@ -242,7 +243,7 @@ class VoxtralProcessor(ProcessorMixin): the text. Please refer to the docstring of the above methods for more information. This methods does not support audio. To prepare the audio, please use: 1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method. - 2. `apply_transcrition_request` [`~VoxtralProcessor.apply_transcrition_request`] method. + 2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method. Args: text (`str`, `list[str]`, `list[list[str]]`): @@ -284,7 +285,7 @@ class VoxtralProcessor(ProcessorMixin): return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None)) # TODO: @eustlb, this should be moved to mistral_common + testing - def apply_transcrition_request( + def apply_transcription_request( self, language: Union[str, list[str]], audio: Union[str, list[str], AudioInput], @@ -306,7 +307,7 @@ class VoxtralProcessor(ProcessorMixin): language = "en" audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3" - inputs = processor.apply_transcrition_request(language=language, audio=audio, model_id=model_id) + inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id) ``` Args: @@ -431,6 +432,17 @@ class VoxtralProcessor(ProcessorMixin): return texts + # Deprecated typo'd method for backward compatibility + def apply_transcrition_request(self, *args, **kwargs): + """ + Deprecated typo'd method. Use `apply_transcription_request` instead. + """ + warnings.warn( + "`apply_transcrition_request` is deprecated due to a typo and will be removed in a future release. Please use `apply_transcription_request` instead.", + FutureWarning, + ) + return self.apply_transcription_request(*args, **kwargs) + def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index 5ad7552c01..5b7ffcd802 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -493,7 +493,7 @@ class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase): model = VoxtralForConditionalGeneration.from_pretrained( self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device ) - inputs = self.processor.apply_transcrition_request( + inputs = self.processor.apply_transcription_request( language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=self.checkpoint_name,