fix(voxtral): correct typo in apply_transcription_request (#39572)
* fix(voxtral): correct typo in apply_transcription_request * temporary wrapper: apply_transcrition_request * Update processing_voxtral.py * style: sort imports in processing_voxtral.py * docs(voxtral): fix typo in voxtral.md * make style * doc update --------- Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com> Co-authored-by: Eustache Le Bihan <eulebihan@gmail.com>
This commit is contained in:
@@ -37,7 +37,11 @@ Voxtral builds on Ministral-3B by adding audio processing capabilities:
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
Let's first load the model!
|
### Audio Instruct Mode
|
||||||
|
|
||||||
|
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
|
||||||
|
|
||||||
|
➡️ audio + text instruction
|
||||||
```python
|
```python
|
||||||
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
import torch
|
import torch
|
||||||
@@ -47,14 +51,7 @@ repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
|||||||
|
|
||||||
processor = AutoProcessor.from_pretrained(repo_id)
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
```
|
|
||||||
|
|
||||||
### Audio Instruct Mode
|
|
||||||
|
|
||||||
The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
|
|
||||||
|
|
||||||
➡️ audio + text instruction
|
|
||||||
```python
|
|
||||||
conversation = [
|
conversation = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -82,6 +79,15 @@ print("=" * 80)
|
|||||||
|
|
||||||
➡️ multi-audio + text instruction
|
➡️ multi-audio + text instruction
|
||||||
```python
|
```python
|
||||||
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
conversation = [
|
conversation = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -113,6 +119,15 @@ print("=" * 80)
|
|||||||
|
|
||||||
➡️ multi-turn:
|
➡️ multi-turn:
|
||||||
```python
|
```python
|
||||||
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
conversation = [
|
conversation = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -158,6 +173,15 @@ print("=" * 80)
|
|||||||
|
|
||||||
➡️ text only:
|
➡️ text only:
|
||||||
```python
|
```python
|
||||||
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
conversation = [
|
conversation = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -184,6 +208,15 @@ print("=" * 80)
|
|||||||
|
|
||||||
➡️ audio only:
|
➡️ audio only:
|
||||||
```python
|
```python
|
||||||
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
conversation = [
|
conversation = [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -210,6 +243,15 @@ print("=" * 80)
|
|||||||
|
|
||||||
➡️ batched inference!
|
➡️ batched inference!
|
||||||
```python
|
```python
|
||||||
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
conversations = [
|
conversations = [
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
@@ -262,7 +304,16 @@ for decoded_output in decoded_outputs:
|
|||||||
Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
|
Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
|
||||||
|
|
||||||
```python
|
```python
|
||||||
inputs = processor.apply_transcrition_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3")
|
from transformers import VoxtralForConditionalGeneration, AutoProcessor
|
||||||
|
import torch
|
||||||
|
|
||||||
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
repo_id = "mistralai/Voxtral-Mini-3B-2507"
|
||||||
|
|
||||||
|
processor = AutoProcessor.from_pretrained(repo_id)
|
||||||
|
model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
|
||||||
|
|
||||||
|
inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
|
||||||
inputs = inputs.to(device, dtype=torch.bfloat16)
|
inputs = inputs.to(device, dtype=torch.bfloat16)
|
||||||
|
|
||||||
outputs = model.generate(**inputs, max_new_tokens=500)
|
outputs = model.generate(**inputs, max_new_tokens=500)
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import io
|
import io
|
||||||
|
import warnings
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging
|
from ...utils import is_mistral_common_available, is_soundfile_available, is_torch_available, logging
|
||||||
@@ -242,7 +243,7 @@ class VoxtralProcessor(ProcessorMixin):
|
|||||||
the text. Please refer to the docstring of the above methods for more information.
|
the text. Please refer to the docstring of the above methods for more information.
|
||||||
This method does not support audio. To prepare the audio, please use:
|
This method does not support audio. To prepare the audio, please use:
|
||||||
1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
|
1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
|
||||||
2. `apply_transcrition_request` [`~VoxtralProcessor.apply_transcrition_request`] method.
|
2. `apply_transcription_request` [`~VoxtralProcessor.apply_transcription_request`] method.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text (`str`, `list[str]`, `list[list[str]]`):
|
text (`str`, `list[str]`, `list[list[str]]`):
|
||||||
@@ -284,7 +285,7 @@ class VoxtralProcessor(ProcessorMixin):
|
|||||||
return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None))
|
return BatchFeature(data=out, tensor_type=common_kwargs.pop("return_tensors", None))
|
||||||
|
|
||||||
# TODO: @eustlb, this should be moved to mistral_common + testing
|
# TODO: @eustlb, this should be moved to mistral_common + testing
|
||||||
def apply_transcrition_request(
|
def apply_transcription_request(
|
||||||
self,
|
self,
|
||||||
language: Union[str, list[str]],
|
language: Union[str, list[str]],
|
||||||
audio: Union[str, list[str], AudioInput],
|
audio: Union[str, list[str], AudioInput],
|
||||||
@@ -306,7 +307,7 @@ class VoxtralProcessor(ProcessorMixin):
|
|||||||
language = "en"
|
language = "en"
|
||||||
audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"
|
audio = "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3"
|
||||||
|
|
||||||
inputs = processor.apply_transcrition_request(language=language, audio=audio, model_id=model_id)
|
inputs = processor.apply_transcription_request(language=language, audio=audio, model_id=model_id)
|
||||||
```
|
```
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -431,6 +432,17 @@ class VoxtralProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
return texts
|
return texts
|
||||||
|
|
||||||
|
# Deprecated typo'd method for backward compatibility
|
||||||
|
def apply_transcrition_request(self, *args, **kwargs):
    """
    Deprecated, misspelled alias of `apply_transcription_request`.

    Emits a `FutureWarning` and then forwards every positional and keyword argument, unchanged, to
    [`~VoxtralProcessor.apply_transcription_request`].

    Args:
        *args: Positional arguments passed through to `apply_transcription_request`.
        **kwargs: Keyword arguments passed through to `apply_transcription_request`.

    Returns:
        Whatever `apply_transcription_request` returns for the same arguments.
    """
    warnings.warn(
        "`apply_transcrition_request` is deprecated due to a typo and will be removed in a future release. Please use `apply_transcription_request` instead.",
        FutureWarning,
        # Attribute the warning to the caller's line (the code still using the misspelled name)
        # rather than to this wrapper, so users can locate what they need to update.
        stacklevel=2,
    )
    return self.apply_transcription_request(*args, **kwargs)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please
|
This method forwards all its arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.batch_decode`]. Please
|
||||||
|
|||||||
@@ -493,7 +493,7 @@ class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase):
|
|||||||
model = VoxtralForConditionalGeneration.from_pretrained(
|
model = VoxtralForConditionalGeneration.from_pretrained(
|
||||||
self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device
|
self.checkpoint_name, torch_dtype=self.dtype, device_map=torch_device
|
||||||
)
|
)
|
||||||
inputs = self.processor.apply_transcrition_request(
|
inputs = self.processor.apply_transcription_request(
|
||||||
language="en",
|
language="en",
|
||||||
audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
|
audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
|
||||||
model_id=self.checkpoint_name,
|
model_id=self.checkpoint_name,
|
||||||
|
|||||||
Reference in New Issue
Block a user