From 1ecd52e50a31e7c344c32564e0484d7e9a0f2256 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Tue, 8 Jul 2025 17:06:12 +0200 Subject: [PATCH] Add torchcodec in docstrings/tests for `datasets` 4.0 (#39156) * fix dataset run_object_detection * bump version * keep same dataset actually * torchcodec in docstrings and testing utils * torchcodec in dockerfiles and requirements * remove duplicate * add torchocodec to all the remaining docker files * fix tests * support torchcodec in audio classification and ASR * [commit to revert] build ci-dev images * [commit to revert] trigger circleci * [commit to revert] build ci-dev images * fix * fix modeling_hubert * backward compatible run_object_detection * revert ci trigger commits * fix mono conversion and support torch tensor as input * revert map_to_array docs + fix it * revert mono * nit in docstring * style * fix modular --------- Co-authored-by: ydshieh --- docker/examples-torch.dockerfile | 4 +-- docker/pipeline-torch.dockerfile | 4 +-- docker/torch-light.dockerfile | 4 +-- docker/transformers-all-latest-gpu/Dockerfile | 2 +- .../Dockerfile | 2 +- .../Dockerfile | 2 +- .../Dockerfile | 2 +- docs/source/en/model_doc/speech_to_text_2.md | 9 ++--- docs/source/en/model_doc/wav2vec2.md | 6 ++-- examples/pytorch/_tests_requirements.txt | 1 + .../pytorch/object-detection/requirements.txt | 2 +- .../object-detection/run_object_detection.py | 5 ++- .../run_object_detection_no_trainer.py | 5 ++- .../modeling_audio_spectrogram_transformer.py | 19 +++++----- src/transformers/models/clvp/modeling_clvp.py | 10 +++--- .../data2vec/modeling_data2vec_audio.py | 21 ++++++----- .../models/hubert/modeling_hubert.py | 15 ++++---- .../models/hubert/modeling_tf_hubert.py | 16 ++++----- .../models/hubert/modular_hubert.py | 8 ++--- .../models/moonshine/modeling_moonshine.py | 15 ++++---- .../models/moonshine/modular_moonshine.py | 15 ++++---- .../qwen2_5_omni/modeling_qwen2_5_omni.py | 14 ++++---- 
.../qwen2_5_omni/modular_qwen2_5_omni.py | 14 ++++---- .../qwen2_audio/modeling_qwen2_audio.py | 14 ++++---- src/transformers/models/sew/modeling_sew.py | 7 ++-- .../models/sew_d/modeling_sew_d.py | 7 ++-- .../modeling_flax_speech_encoder_decoder.py | 13 +++---- .../modeling_speech_encoder_decoder.py | 21 ++++++----- .../speech_to_text/modeling_speech_to_text.py | 25 +++++++------ .../modeling_tf_speech_to_text.py | 23 ++++++------ .../models/speecht5/modeling_speecht5.py | 21 ++++++----- .../models/unispeech/modeling_unispeech.py | 7 ++-- .../unispeech_sat/modeling_unispeech_sat.py | 21 ++++++----- .../models/wav2vec2/modeling_flax_wav2vec2.py | 31 +++++++--------- .../models/wav2vec2/modeling_tf_wav2vec2.py | 17 ++++----- .../models/wav2vec2/modeling_wav2vec2.py | 21 ++++++----- .../wav2vec2_bert/modeling_wav2vec2_bert.py | 35 +++++++++++-------- .../wav2vec2_bert/modular_wav2vec2_bert.py | 35 +++++++++++-------- .../modeling_wav2vec2_conformer.py | 21 ++++++----- .../models/wavlm/modeling_wavlm.py | 21 ++++++----- .../models/whisper/generation_whisper.py | 11 +++--- .../models/whisper/modeling_flax_whisper.py | 19 +++++----- .../models/whisper/modeling_tf_whisper.py | 15 ++++---- .../models/whisper/modeling_whisper.py | 32 +++++++++-------- .../pipelines/audio_classification.py | 25 ++++++++++--- .../pipelines/automatic_speech_recognition.py | 26 +++++++++++--- src/transformers/testing_utils.py | 13 +------ src/transformers/utils/args_doc.py | 7 ++-- ...xtraction_audio_spectrogram_transformer.py | 2 +- .../clap/test_feature_extraction_clap.py | 2 +- .../clvp/test_feature_extraction_clvp.py | 2 +- tests/models/clvp/test_modeling_clvp.py | 9 +++-- .../models/dac/test_feature_extraction_dac.py | 2 +- .../data2vec/test_modeling_data2vec_audio.py | 4 +-- .../models/dia/test_feature_extraction_dia.py | 2 +- tests/models/dia/test_modeling_dia.py | 8 +++-- .../test_feature_extraction_encodec.py | 2 +- .../test_modeling_granite_speech.py | 2 +- 
tests/models/hubert/test_modeling_hubert.py | 4 +-- .../test_modeling_kyutai_speech_to_text.py | 2 +- .../moonshine/test_modeling_moonshine.py | 2 +- .../test_feature_extractor_phi4_multimodal.py | 2 +- .../test_modeling_phi4_multimodal.py | 19 +++++----- tests/models/sew/test_modeling_sew.py | 4 +-- tests/models/sew_d/test_modeling_sew_d.py | 4 +-- .../test_feature_extraction_speech_to_text.py | 2 +- .../test_feature_extraction_speecht5.py | 2 +- .../models/speecht5/test_modeling_speecht5.py | 4 +-- .../unispeech/test_modeling_unispeech.py | 4 +-- .../test_modeling_unispeech_sat.py | 4 +-- .../test_feature_extraction_univnet.py | 2 +- tests/models/univnet/test_modeling_univnet.py | 2 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 4 +-- .../test_feature_extraction_whisper.py | 2 +- tests/models/whisper/test_modeling_whisper.py | 2 +- ..._pipelines_automatic_speech_recognition.py | 2 +- tests/utils/test_audio_utils.py | 2 +- utils/print_env.py | 11 ++++++ 78 files changed, 448 insertions(+), 350 deletions(-) diff --git a/docker/examples-torch.dockerfile b/docker/examples-torch.dockerfile index 1302774ae4..2509c1f05b 100644 --- a/docker/examples-torch.dockerfile +++ b/docker/examples-torch.dockerfile @@ -2,10 +2,10 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root -RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu RUN uv pip 
install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer RUN uv pip uninstall transformers diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile index 155cfcd5ac..dd37683ecf 100644 --- a/docker/pipeline-torch.dockerfile +++ b/docker/pipeline-torch.dockerfile @@ -2,10 +2,10 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root -RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git +RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" RUN uv pip uninstall transformers diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index b46a2d3912..b4e6cdffb3 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -2,10 +2,10 @@ FROM python:3.9-slim ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root -RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs +RUN apt-get update && apt-get 
install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu +RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]" RUN uv pip uninstall transformers diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 04c82e8776..b6f9986b81 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -26,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers && # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. # Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). 
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability RUN python3 -m pip uninstall -y flax jax diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile index 4613706284..b58435087d 100644 --- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile @@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p # Install latest release PyTorch # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile index d71b4ad334..3a8ca97735 100644 --- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile @@ -19,7 +19,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio # Install **nightly** release PyTorch (flag `--pre`) # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.) 
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops) -RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA +RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2' diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index ad9cf891e2..cfc0478016 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -26,7 +26,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; RUN echo torch=$VERSION # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build. # Currently, let's just use their latest releases (when `torch` is installed with a release version) -RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md index 8caf774e73..6d77e5ad39 100644 --- a/docs/source/en/model_doc/speech_to_text_2.md +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -61,19 +61,16 @@ predicted token ids. 
- Step-by-step Speech Translation ```python ->>> import torch >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel >>> from datasets import load_dataset ->>> import soundfile as sf >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") ->>> def map_to_array(batch): -... speech, _ = sf.read(batch["file"]) -... batch["speech"] = speech -... return batch +>>> def map_to_array(example): +... example["speech"] = example["audio"]["array"] +... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/docs/source/en/model_doc/wav2vec2.md b/docs/source/en/model_doc/wav2vec2.md index d884f44b1e..340ac4b193 100644 --- a/docs/source/en/model_doc/wav2vec2.md +++ b/docs/source/en/model_doc/wav2vec2.md @@ -172,9 +172,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) ->>> def map_to_array(batch): -... batch["speech"] = batch["audio"]["array"] -... return batch +>>> def map_to_array(example): +... example["speech"] = example["audio"]["array"] +... 
return example >>> # prepare speech data for batch inference diff --git a/examples/pytorch/_tests_requirements.txt b/examples/pytorch/_tests_requirements.txt index fb86259116..d66ba08f15 100644 --- a/examples/pytorch/_tests_requirements.txt +++ b/examples/pytorch/_tests_requirements.txt @@ -22,6 +22,7 @@ protobuf torch torchvision torchaudio +torchcodec jiwer librosa evaluate >= 0.2.0 diff --git a/examples/pytorch/object-detection/requirements.txt b/examples/pytorch/object-detection/requirements.txt index 7772958dee..8a9ba6ddcd 100644 --- a/examples/pytorch/object-detection/requirements.txt +++ b/examples/pytorch/object-detection/requirements.txt @@ -1,5 +1,5 @@ albumentations >= 1.4.16 timm -datasets +datasets>=4.0 torchmetrics pycocotools diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 25bc7bd548..f534d9a323 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -399,7 +399,10 @@ def main(): dataset["validation"] = split["test"] # Get dataset categories and prepare mappings for label_name <-> label_id - categories = dataset["train"].features["objects"].feature["category"].names + if isinstance(dataset["train"].features["objects"], dict): + categories = dataset["train"].features["objects"]["category"].feature.names + else: # (for old versions of `datasets` that used Sequence({...}) of the objects) + categories = dataset["train"].features["objects"].feature["category"].names id2label = dict(enumerate(categories)) label2id = {v: k for k, v in id2label.items()} diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 1759f9920f..1133435d7b 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ 
-460,7 +460,10 @@ def main(): dataset["validation"] = split["test"] # Get dataset categories and prepare mappings for label_name <-> label_id - categories = dataset["train"].features["objects"].feature["category"].names + if isinstance(dataset["train"].features["objects"], dict): + categories = dataset["train"].features["objects"]["category"].feature.names + else: # (for old versions of `datasets` that used Sequence({...}) of the objects) + categories = dataset["train"].features["objects"].feature["category"].names id2label = dict(enumerate(categories)) label2id = {v: k for k, v in id2label.items()} diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 602de3ff72..37215d4486 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -435,10 +435,12 @@ class ASTModel(ASTPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). 
+ To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~ASTFeatureExtractor.__call__`] """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -525,10 +527,11 @@ class ASTForAudioClassification(ASTPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~ASTFeatureExtractor.__call__`] labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the audio classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 9eb8140103..a40c21932f 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1653,14 +1653,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin): >>> text = "This is an example text." >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + >>> audio = ds.sort("id")["audio"][0] + >>> audio_sample, sr = audio["array"], audio["sampling_rate"] >>> # Define processor and model >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev") >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev") >>> # Generate processor output and model output - >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt") + >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt") >>> speech_embeds = model.get_speech_features( ... input_ids=processor_output["input_ids"], input_features=processor_output["input_features"] ... 
) @@ -1732,14 +1733,15 @@ class ClvpModelForConditionalGeneration(ClvpPreTrainedModel, GenerationMixin): >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + >>> audio = ds.sort("id")["audio"][0] + >>> audio_sample, sr = audio["array"], audio["sampling_rate"] >>> # Define processor and model >>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev") >>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev") >>> # processor outputs and model outputs - >>> processor_output = processor(raw_speech=audio, sampling_rate=sr, text=text, return_tensors="pt") + >>> processor_output = processor(raw_speech=audio_sample, sampling_rate=sr, text=text, return_tensors="pt") >>> outputs = model( ... input_ids=processor_output["input_ids"], ... input_features=processor_output["input_features"], diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 60509f419f..d140c40710 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1022,9 +1022,10 @@ class Data2VecAudioForSequenceClassification(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. 
+ into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1136,9 +1137,10 @@ class Data2VecAudioForAudioFrameClassification(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1318,9 +1320,10 @@ class Data2VecAudioForXVector(Data2VecAudioPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 810279c7ac..4b36a575b8 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -954,16 +954,14 @@ class HubertModel(HubertPreTrainedModel): ```python >>> from transformers import AutoProcessor, HubertModel >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft") >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") - >>> def map_to_array(batch): - ... 
speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -1230,9 +1228,10 @@ class HubertForSequenceClassification(HubertPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index a701252f63..a0de497dc0 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1459,16 +1459,14 @@ class TFHubertModel(TFHubertPreTrainedModel): ```python >>> from transformers import AutoProcessor, TFHubertModel >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft") >>> model = TFHubertModel.from_pretrained("facebook/hubert-large-ls960-ft") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -1571,16 +1569,14 @@ class TFHubertForCTC(TFHubertPreTrainedModel): >>> import tensorflow as tf >>> from transformers import AutoProcessor, TFHubertForCTC >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft") >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... 
return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/src/transformers/models/hubert/modular_hubert.py b/src/transformers/models/hubert/modular_hubert.py index ec13ad6090..29f186f811 100644 --- a/src/transformers/models/hubert/modular_hubert.py +++ b/src/transformers/models/hubert/modular_hubert.py @@ -239,16 +239,14 @@ class HubertModel(Wav2Vec2Model, HubertPreTrainedModel): ```python >>> from transformers import AutoProcessor, HubertModel >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft") >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index c8cfde30b4..2605d0f032 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -540,8 +540,9 @@ class MoonshineEncoder(MoonshinePreTrainedModel): Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). 
To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -922,8 +923,9 @@ class MoonshineModel(MoonshinePreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. Example: @@ -1039,8 +1041,9 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi r""" input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`).
To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 8a5851ec7d..c8d0fe56b1 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -575,8 +575,9 @@ class MoonshineEncoder(MoonshinePreTrainedModel): Args: input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -751,8 +752,9 @@ class MoonshineModel(WhisperModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`).
To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. Example: @@ -852,8 +854,9 @@ class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixi r""" input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`): Float values of the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_values`, the [`AutoFeatureExtractor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index aab4261046..3a55c3a351 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -810,8 +810,9 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): r""" input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of mel features extracted from the raw speech waveform.
Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] feature_lens (`torch.LongTensor` of shape `(batch_size,)`): @@ -1830,10 +1831,11 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`.
+ See [`~WhisperFeatureExtractor.__call__`] pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): The tensors corresponding to the input videos. Pixel values can be obtained using [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index b3d4ae90e8..5ee0f347dd 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -1795,8 +1795,9 @@ class Qwen2_5OmniAudioEncoder(Qwen2_5OmniPreTrainedModel): r""" input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] feature_lens (`torch.LongTensor` of shape `(batch_size,)`): @@ -2276,10 +2277,11 @@ class Qwen2_5OmniThinkerForConditionalGeneration(Qwen2_5OmniPreTrainedModelForCo r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`): Float values mel features extracted from the raw speech waveform.
Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size), *optional*): The tensors corresponding to the input videos. Pixel values can be obtained using [`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([]`NewTaskModelProcessor`] uses diff --git a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py index f90f7ff9cf..e065269320 100644 --- a/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py @@ -362,8 +362,9 @@ class Qwen2AudioEncoder(Qwen2AudioPreTrainedModel): Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). 
To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] attention_mask (`torch.Tensor`)`, *optional*): @@ -742,10 +743,11 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`): Mask to avoid performing attention on padding feature indices.
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 5ca359f0b0..c19571bda2 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -1046,9 +1046,10 @@ class SEWForSequenceClassification(SEWPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 5e00ddcd1f..678cddde30 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1597,9 +1597,10 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`SEWDProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py index d2d23d2900..c809b804c8 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_flax_speech_encoder_decoder.py @@ -86,8 +86,9 @@ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r""" Args: inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac` - or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile - library (`pip install soundfile`). 
To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or + or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* + via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): @@ -128,10 +129,10 @@ SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r""" Args: inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac* - or *.wav* audio file into an array of type *list[float]* or a *numpy.ndarray*, *e.g.* via the soundfile - library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or - [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type - *torch.FloatTensor*. + or *.wav* audio file into an array of type *list[float]* or a *numpy.ndarray*, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used + for padding and conversion into a tensor of type *torch.FloatTensor*. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index b4582949fe..5894c035b7 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -339,8 +339,9 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): r""" inputs (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a `.flac` - or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile - library (`pip install soundfile`). To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or + or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* + via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `inputs`, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): @@ -369,15 +370,17 @@ class SpeechEncoderDecoderModel(PreTrainedModel, GenerationMixin): (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file - into an array of type *list[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install - soundfile*). 
To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should be used for padding - and conversion into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type *list[float]* or a *numpy.ndarray*, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into *input_values*, the [`Wav2Vec2Processor`] should be used for padding and conversion + into a tensor of type *torch.FloatTensor*. See [`Wav2Vec2Processor.__call__`] for details. input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`, *optional*): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained - by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* - via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`Speech2TextFeatureExtractor`] should be used for extracting the fbank features, padding and conversion - into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`] + by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* + via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`Speech2TextFeatureExtractor`] should be used for extracting + the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. 
+ See [`~Speech2TextFeatureExtractor.__call__`] Examples: diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 73e3df2b4a..29debaeaac 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -619,8 +619,9 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel): Args: input_features (`torch.LongTensor` of shape `(batch_size, sequence_length, feature_size)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`] @@ -1096,10 +1097,12 @@ class Speech2TextModel(Speech2TextPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained - by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* - via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a - tensor of type `torch.FloatTensor`.
See [`~Speech2TextFeatureExtractor.__call__`] + by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting + the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~Speech2TextFeatureExtractor.__call__`] decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1258,10 +1261,12 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel, Generation r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained - by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* - via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~Speech2TextFeatureExtractor.__call__`] + by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting + the fbank features, padding and conversion into a tensor of type `torch.FloatTensor`. 
+ See [`~Speech2TextFeatureExtractor.__call__`] decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index fad6a026b1..555fc5659b 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -691,10 +691,12 @@ SPEECH_TO_TEXT_INPUTS_DOCSTRING = r""" Args: input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained - by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* - via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a - tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`] + by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting + the fbank features, padding and conversion into a tensor of floats. + See [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`tf.Tensor` of shape `({0})`, *optional*): Mask to avoid performing attention on padding token indices.
Mask values selected in `[0, 1]`: @@ -847,8 +849,9 @@ class TFSpeech2TextEncoder(keras.layers.Layer): Args: input_features (`tf.Tensor` of shape `(batch_size, sequence_length, feature_size)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a tensor of floats. See [`~Speech2TextFeatureExtractor.__call__`] attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1469,7 +1472,6 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus >>> import tensorflow as tf >>> from transformers import Speech2TextProcessor, TFSpeech2TextForConditionalGeneration >>> from datasets import load_dataset - >>> import soundfile as sf >>> model = TFSpeech2TextForConditionalGeneration.from_pretrained( ... "facebook/s2t-small-librispeech-asr", from_pt=True @@ -1477,10 +1479,9 @@ class TFSpeech2TextForConditionalGeneration(TFSpeech2TextPreTrainedModel, TFCaus >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ...
return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 8d26f7d790..8f46fd00c8 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -2156,8 +2156,9 @@ class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install - soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details. decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -2841,9 +2842,10 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install - soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding - and conversion into a tensor of type `torch.FloatTensor`. 
See [`SpeechT5Processor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into + a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details. decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`): Float values of input mel spectrogram. @@ -2966,10 +2968,11 @@ class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel): input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. - Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]` or - a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array - into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor - of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details. + Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`, + a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) + or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and + conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details. speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): Tensor containing the speaker embeddings. 
attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index e8a43c2826..e982240e26 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1455,9 +1455,10 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 0e2140aee8..18e9d88645 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1450,9 +1450,10 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1564,9 +1565,10 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1746,9 +1748,10 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`UniSpeechSatProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. 
See [`UniSpeechSatProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index ca5eb700ee..472600afd9 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -255,9 +255,10 @@ WAV2VEC2_INPUTS_DOCSTRING = r""" Args: input_values (`jnp.ndarray` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `jnp.ndarray`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `jnp.ndarray`. See [`Wav2Vec2Processor.__call__`] for details. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing convolution and attention on padding token indices. 
Mask values selected in `[0, 1]`: @@ -1064,16 +1065,14 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """ ```python >>> from transformers import AutoProcessor, FlaxWav2Vec2Model >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-lv60") >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -1183,16 +1182,14 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """ >>> import jax.numpy as jnp >>> from transformers import AutoProcessor, FlaxWav2Vec2ForCTC >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-large-960h-lv60") >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -1384,16 +1381,14 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """ >>> from transformers import AutoFeatureExtractor, FlaxWav2Vec2ForPreTraining >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices >>> from datasets import load_dataset - >>> import soundfile as sf >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60") >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... 
batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index 657674c3ff..d4364c3147 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1530,16 +1530,14 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel): ```python >>> from transformers import AutoProcessor, TFWav2Vec2Model >>> from datasets import load_dataset - >>> import soundfile as sf >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") @@ -1642,16 +1640,15 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): >>> import tensorflow as tf >>> from transformers import AutoProcessor, TFWav2Vec2ForCTC >>> from datasets import load_dataset - >>> import soundfile as sf + >>> from torchcodec.decoders import AudioDecoder >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") - >>> def map_to_array(batch): - ... speech, _ = sf.read(batch["file"]) - ... batch["speech"] = speech - ... return batch + >>> def map_to_array(example): + ... example["speech"] = example["audio"]["array"] + ... 
return example >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index be43995e97..c12dc4093a 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1981,9 +1981,10 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -2095,9 +2096,10 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -2277,9 +2279,10 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. 
See [`Wav2Vec2Processor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index e41d7f32ff..a2be15d8ce 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -997,9 +997,10 @@ class Wav2Vec2BertModel(Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict masked extracted features in *config.proj_codevector_dim* space. 
@@ -1094,9 +1095,10 @@ class Wav2Vec2BertForCTC(Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. @@ -1205,9 +1207,10 @@ class Wav2Vec2BertForSequenceClassification(Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. 
+ into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1300,9 +1303,10 @@ class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1463,9 +1467,10 @@ class Wav2Vec2BertForXVector(Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py index d0f375332b..64f4103195 100644 --- a/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py @@ -708,9 +708,10 @@ class Wav2Vec2BertModel(Wav2Vec2Model, Wav2Vec2BertPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict masked extracted features in *config.proj_codevector_dim* space. @@ -779,9 +780,10 @@ class Wav2Vec2BertForCTC(Wav2Vec2ConformerForCTC): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. 
See [`Wav2Vec2BertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*): Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. @@ -875,9 +877,10 @@ class Wav2Vec2BertForSequenceClassification(Wav2Vec2ForSequenceClassification): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2BertProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -950,9 +953,10 @@ class Wav2Vec2BertForAudioFrameClassification(Wav2Vec2ConformerForAudioFrameClas r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. 
Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1016,9 +1020,10 @@ class Wav2Vec2BertForXVector(Wav2Vec2ConformerForXVector): r""" input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). 
+ To prepare the array into `input_features`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index ec88628296..9cc47da5d1 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1579,9 +1579,10 @@ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedMode r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1681,9 +1682,10 @@ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedMo r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1851,9 +1853,10 @@ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. 
See [`Wav2Vec2ConformerProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 5904f05dcb..867e15e923 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1328,9 +1328,10 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. 
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1442,9 +1443,10 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If @@ -1624,9 +1626,10 @@ class WavLMForXVector(WavLMPreTrainedModel): r""" input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). 
To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details. labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 5379e5d59d..1bed6ce27b 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -429,10 +429,11 @@ class WhisperGenerationMixin(GenerationMixin): Parameters: input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. 
+ loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, + *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel + features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] for details. generation_config ([`~generation.GenerationConfig`], *optional*): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If @@ -1598,7 +1599,7 @@ class WhisperGenerationMixin(GenerationMixin): Parameters: input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. 
diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index 63b7f71853..183fdd58f4 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -101,10 +101,12 @@ WHISPER_INPUTS_DOCSTRING = r""" Args: input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`WhisperFeatureExtractor`] should be used for extracting the features, padding and conversion into a - tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting + the features, padding and conversion into a tensor of type `numpy.ndarray`. + See [`~WhisperFeatureExtractor.__call__`] attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but is not used. By default the silence in the input log mel spectrogram are ignored. @@ -138,10 +140,11 @@ WHISPER_ENCODE_INPUTS_DOCSTRING = r""" Args: input_features (`numpy.ndarray` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. 
Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`WhisperFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `numpy.ndarray`. See [`~WhisperFeatureExtractor.__call__`]. + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting + the mel features, padding and conversion into a tensor of type `numpy.ndarray`. + See [`~WhisperFeatureExtractor.__call__`]. attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Whisper does not support masking of the `input_features`, this argument is preserved for compatibility, but is not used. By default the silence in the input log mel spectrogram are ignored. diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index f5ca846b81..32adc6b5af 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -600,10 +600,12 @@ WHISPER_INPUTS_DOCSTRING = r""" Args: input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be obtained - by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* - via the soundfile library (`pip install soundfile`). 
To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a - tensor of type `tf.Tensor`. See [`~WhisperFeatureExtractor.__call__`] + by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a + `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or the soundfile library + (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + fbank features, padding and conversion into a tensor of type `tf.Tensor`. + See [`~WhisperFeatureExtractor.__call__`] decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -728,8 +730,9 @@ class TFWhisperEncoder(keras.layers.Layer): Args: input_features (`tf.Tensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of fbank features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the fbank features, padding and conversion into a tensor of type `tf.Tensor`.
See [`~WhisperFeatureExtractor.__call__`] head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 43f1eccdc0..d9bb321582 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -650,8 +650,9 @@ class WhisperEncoder(WhisperPreTrainedModel): Args: input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be - obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a - `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into + obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a + `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] attention_mask (`torch.Tensor`)`, *optional*): @@ -1096,10 +1097,11 @@ class WhisperModel(WhisperPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`).
To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1266,10 +1268,11 @@ class WhisperForConditionalGeneration(WhisperGenerationMixin, WhisperPreTrainedM r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. 
+ See [`~WhisperFeatureExtractor.__call__`] decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. @@ -1600,10 +1603,11 @@ class WhisperForAudioClassification(WhisperPreTrainedModel): r""" input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`): Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by - loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via - the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the - [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a - tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] + loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via + the torchcodec library (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the + mel features, padding and conversion into a tensor of type `torch.FloatTensor`. + See [`~WhisperFeatureExtractor.__call__`] labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py index 1715d247e5..5ce133def9 100644 --- a/src/transformers/pipelines/audio_classification.py +++ b/src/transformers/pipelines/audio_classification.py @@ -17,7 +17,7 @@ from typing import Any, Union import numpy as np import requests -from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging +from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, is_torchcodec_available, logging from .base import Pipeline, build_pipeline_init_args @@ -174,6 +174,21 @@ class AudioClassificationPipeline(Pipeline): if isinstance(inputs, bytes): inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate) + if is_torch_available(): + import torch + + if isinstance(inputs, torch.Tensor): + inputs = inputs.cpu().numpy() + + if is_torchcodec_available(): + import torch + import torchcodec + + if isinstance(inputs, torchcodec.decoders.AudioDecoder): + _audio_samples = inputs.get_all_samples() + _array = _audio_samples.data + inputs = {"array": _array, "sampling_rate": _audio_samples.sample_rate} + if isinstance(inputs, dict): inputs = inputs.copy() # So we don't mutate the original dictionary outside the pipeline # Accepting `"array"` which is the key defined in `datasets` for @@ -181,7 +196,7 @@ class AudioClassificationPipeline(Pipeline): if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)): raise ValueError( "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a " - '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' + '"raw" key containing the numpy array or torch tensor representing the audio and a "sampling_rate" key, ' "containing the sampling_rate associated with that array" ) @@ -204,11 +219,13 @@ class 
AudioClassificationPipeline(Pipeline): ) inputs = F.resample( - torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate + torch.from_numpy(inputs) if isinstance(inputs, np.ndarray) else inputs, + in_sampling_rate, + self.feature_extractor.sampling_rate, ).numpy() if not isinstance(inputs, np.ndarray): - raise TypeError("We expect a numpy ndarray as input") + raise TypeError("We expect a numpy ndarray or torch tensor as input") if len(inputs.shape) != 1: raise ValueError("We expect a single channel audio input for AudioClassificationPipeline") diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 232ef4463b..a950ab5ee6 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -19,7 +19,7 @@ import requests from ..generation import GenerationConfig from ..tokenization_utils import PreTrainedTokenizer -from ..utils import is_torch_available, is_torchaudio_available, logging +from ..utils import is_torch_available, is_torchaudio_available, is_torchcodec_available, logging from .audio_utils import ffmpeg_read from .base import ChunkPipeline @@ -364,6 +364,21 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): stride = None extra = {} + + if is_torch_available(): + import torch + + if isinstance(inputs, torch.Tensor): + inputs = inputs.cpu().numpy() + + if is_torchcodec_available(): + import torchcodec + + if isinstance(inputs, torchcodec.decoders.AudioDecoder): + _audio_samples = inputs.get_all_samples() + _array = _audio_samples.data + inputs = {"array": _array, "sampling_rate": _audio_samples.sample_rate} + if isinstance(inputs, dict): stride = inputs.pop("stride", None) # Accepting `"array"` which is the key defined in `datasets` for @@ -371,7 +386,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): if not ("sampling_rate" in inputs and ("raw" in inputs or 
"array" in inputs)): raise ValueError( "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a " - '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' + '"raw" key containing the numpy array or torch tensor representing the audio and a "sampling_rate" key, ' "containing the sampling_rate associated with that array" ) @@ -393,7 +408,9 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): ) inputs = F.resample( - torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate + torch.from_numpy(inputs) if isinstance(inputs, np.ndarray) else inputs, + in_sampling_rate, + self.feature_extractor.sampling_rate, ).numpy() ratio = self.feature_extractor.sampling_rate / in_sampling_rate else: @@ -408,7 +425,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): # of the original length in the stride so we can cut properly. stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio))) if not isinstance(inputs, np.ndarray): - raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`") + raise TypeError(f"We expect a numpy ndarray or torch tensor as input, got `{type(inputs)}`") if len(inputs.shape) != 1: raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index f9cc588797..3dc29371ba 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -130,7 +130,6 @@ from .utils import ( is_scipy_available, is_sentencepiece_available, is_seqio_available, - is_soundfile_available, is_spacy_available, is_speech_available, is_spqr_available, @@ -656,7 +655,7 @@ def require_torchcodec(test_case): These tests are skipped when Torchcodec isn't installed.
""" - return unittest.skipUnless(is_torchcodec_available(), "test requires Torchvision")(test_case) + return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case) def require_torch_or_tf(test_case): @@ -1268,16 +1267,6 @@ def require_clearml(test_case): return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) -def require_soundfile(test_case): - """ - Decorator marking a test that requires soundfile - - These tests are skipped when soundfile isn't installed. - - """ - return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case) - - def require_deepspeed(test_case): """ Decorator marking a test that requires deepspeed diff --git a/src/transformers/utils/args_doc.py b/src/transformers/utils/args_doc.py index 61f947516f..5028f4687a 100644 --- a/src/transformers/utils/args_doc.py +++ b/src/transformers/utils/args_doc.py @@ -248,9 +248,10 @@ class ModelArgs: input_values = { "description": """ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file - into an array of type `list[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install - soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and - conversion into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details. + into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library + (`pip install torchcodec`) or the soundfile library (`pip install soundfile`). + To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion + into a tensor of type `torch.FloatTensor`. See [`{processor_class}.__call__`] for details. 
""", "shape": "of shape `(batch_size, sequence_length)`", } diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 77c2ec6cd3..b0ee0066fb 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -154,7 +154,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index b2ccb50171..e349e08119 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -165,7 +165,7 @@ class ClapFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes def _load_datasamples(self, num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py index 78bf38e2df..409e510005 100644 --- a/tests/models/clvp/test_feature_extraction_clvp.py +++ b/tests/models/clvp/test_feature_extraction_clvp.py @@ -215,7 +215,7 @@ class 
ClvpFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=22050)) # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 60c165fbbe..a33d787dc7 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -373,10 +373,12 @@ class ClvpModelForConditionalGenerationTester: ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + audio = ds.sort("id")[0]["audio"] + audio_sample = audio["array"] + sr = audio["sampling_rate"] feature_extractor = ClvpFeatureExtractor() - input_features = feature_extractor(raw_speech=audio, sampling_rate=sr, return_tensors="pt")[ + input_features = feature_extractor(raw_speech=audio_sample, sampling_rate=sr, return_tensors="pt")[ "input_features" ].to(torch_device) @@ -562,7 +564,8 @@ class ClvpIntegrationTest(unittest.TestCase): self.text = "This is an example text." 
ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) - _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() + audio = ds.sort("id")["audio"][0] + self.speech_samples, self.sr = audio["array"], audio["sampling_rate"] self.model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev").to(torch_device) self.model.eval() diff --git a/tests/models/dac/test_feature_extraction_dac.py b/tests/models/dac/test_feature_extraction_dac.py index 13d7232607..c995485d33 100644 --- a/tests/models/dac/test_feature_extraction_dac.py +++ b/tests/models/dac/test_feature_extraction_dac.py @@ -143,7 +143,7 @@ class DacFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + audio_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in audio_samples] diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index e275b8d681..630f6238e7 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -21,7 +21,7 @@ from datasets import load_dataset from tests.test_modeling_common import floats_tensor, ids_tensor, random_attention_mask from transformers import Data2VecAudioConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, _config_zero_init @@ -656,7 +656,7 @@ class 
Data2VecAudioUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class Data2VecAudioModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/dia/test_feature_extraction_dia.py b/tests/models/dia/test_feature_extraction_dia.py index 6243dc4791..9a6f797d53 100644 --- a/tests/models/dia/test_feature_extraction_dia.py +++ b/tests/models/dia/test_feature_extraction_dia.py @@ -145,7 +145,7 @@ class DiaFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + audio_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in audio_samples] diff --git a/tests/models/dia/test_modeling_dia.py b/tests/models/dia/test_modeling_dia.py index f9427160c2..447491f901 100644 --- a/tests/models/dia/test_modeling_dia.py +++ b/tests/models/dia/test_modeling_dia.py @@ -665,8 +665,12 @@ class DiaForConditionalGenerationIntegrationTest(unittest.TestCase): @require_torch_accelerator def test_dia_model_integration_generate_audio_context(self): text = ["[S1] Dia is an open weights text to dialogue model.", "This is a test"] - audio_sample_1 = torchaudio.load(self.audio_prompt_1_path, channels_first=True)[0].squeeze().numpy() - audio_sample_2 = torchaudio.load(self.audio_prompt_2_path, channels_first=True)[0].squeeze().numpy() + audio_sample_1 = ( + torchaudio.load(self.audio_prompt_1_path, channels_first=True, backend="soundfile")[0].squeeze().numpy() + ) + audio_sample_2 = ( + torchaudio.load(self.audio_prompt_2_path, channels_first=True, backend="soundfile")[0].squeeze().numpy() + ) audio = [audio_sample_1, audio_sample_2] processor = DiaProcessor.from_pretrained(self.model_checkpoint) diff --git a/tests/models/encodec/test_feature_extraction_encodec.py 
b/tests/models/encodec/test_feature_extraction_encodec.py index 3dc4c5fbb7..2823b00993 100644 --- a/tests/models/encodec/test_feature_extraction_encodec.py +++ b/tests/models/encodec/test_feature_extraction_encodec.py @@ -139,7 +139,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + audio_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in audio_samples] diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index 67ef91db78..44a8624001 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -340,7 +340,7 @@ class GraniteSpeechForConditionalGenerationIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 905b435bb5..904a04e1f9 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -22,7 +22,7 @@ import unittest import pytest from transformers import HubertConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from 
...test_modeling_common import ( @@ -750,7 +750,7 @@ class HubertUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class HubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index 780658c77a..ad516904ef 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -713,7 +713,7 @@ class KyutaiSpeechToTextForConditionalGenerationIntegrationTests(unittest.TestCa def _load_datasamples(self, num_samples): self._load_dataset() ds = self._dataset - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] @slow diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py index e9d5f8f1c9..e0a617aeaf 100644 --- a/tests/models/moonshine/test_modeling_moonshine.py +++ b/tests/models/moonshine/test_modeling_moonshine.py @@ -443,7 +443,7 @@ class MoonshineModelIntegrationTests(unittest.TestCase): def _load_datasamples(self, num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py index 0163deec33..8d235b5199 100644 --- a/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py +++ 
b/tests/models/phi4_multimodal/test_feature_extractor_phi4_multimodal.py @@ -207,7 +207,7 @@ class Phi4MultimodalFeatureExtractionTest(SequenceFeatureExtractionTestMixin, un def _load_datasamples(self, num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index 6ef9aef962..07fd24577b 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tempfile import unittest import requests @@ -33,13 +32,13 @@ from transformers import ( from transformers.testing_utils import ( Expectations, cleanup, - require_soundfile, require_torch, require_torch_large_accelerator, + require_torchcodec, slow, torch_device, ) -from transformers.utils import is_soundfile_available +from transformers.utils import is_torchcodec_available from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -54,8 +53,8 @@ if is_vision_available(): from PIL import Image -if is_soundfile_available(): - import soundfile +if is_torchcodec_available(): + import torchcodec class Phi4MultimodalModelTester: @@ -296,11 +295,9 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase): self.assistant_token = "<|assistant|>" self.end_token = "<|end|>" self.image = Image.open(requests.get(self.image_url, stream=True).raw) - with tempfile.NamedTemporaryFile(mode="w+b", suffix=".wav") as tmp: - tmp.write(requests.get(self.audio_url, 
stream=True).raw.data) - tmp.flush() - tmp.seek(0) - self.audio, self.sampling_rate = soundfile.read(tmp.name) + audio_bytes = requests.get(self.audio_url, stream=True).raw.data + samples = torchcodec.decoders.AudioDecoder(audio_bytes).get_all_samples() + self.audio, self.sampling_rate = samples.data, samples.sample_rate cleanup(torch_device, gc_collect=True) @@ -378,7 +375,7 @@ class Phi4MultimodalIntegrationTest(unittest.TestCase): self.assertEqual(response, EXPECTED_RESPONSE) - @require_soundfile + @require_torchcodec def test_audio_text_generation(self): model = AutoModelForCausalLM.from_pretrained( self.checkpoint_path, revision=self.revision, torch_dtype=torch.float16, device_map=torch_device diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index 6e049b4fab..270f91bdf6 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -19,7 +19,7 @@ import unittest import pytest from transformers import SEWConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -453,7 +453,7 @@ class SEWUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class SEWModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 4df373e839..86064250b8 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -19,7 +19,7 @@ import unittest import pytest from transformers import SEWDConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import 
require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -464,7 +464,7 @@ class SEWDUtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class SEWDModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 49433e1156..15f0d89a3b 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -294,7 +294,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index c886704a04..9c1a3b524d 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -381,7 +381,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 2255e895ce..9c0fa0fa39 100644 --- 
a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -764,7 +764,7 @@ class SpeechT5ForSpeechToTextIntegrationTests(unittest.TestCase): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] @@ -1792,7 +1792,7 @@ class SpeechT5ForSpeechToSpeechIntegrationTests(unittest.TestCase): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 37da494a96..00614bca7c 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -21,7 +21,7 @@ import pytest from datasets import load_dataset from transformers import UniSpeechConfig, is_torch_available -from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import is_flaky, require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -553,7 +553,7 @@ class UniSpeechRobustModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.T @require_torch -@require_soundfile +@require_torchcodec @slow class UniSpeechModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index 
1b6a1cb804..2c5001fbbc 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -21,7 +21,7 @@ import pytest from datasets import load_dataset from transformers import UniSpeechSatConfig, is_torch_available -from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torchcodec, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ( @@ -807,7 +807,7 @@ class UniSpeechSatRobustModelTest(ModelTesterMixin, unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class UniSpeechSatModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py index 51a5fb7724..e57c40396e 100644 --- a/tests/models/univnet/test_feature_extraction_univnet.py +++ b/tests/models/univnet/test_feature_extraction_univnet.py @@ -330,7 +330,7 @@ class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. 
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate)) # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index 003c63a3e6..00066d89bd 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -216,7 +216,7 @@ class UnivNetModelIntegrationTests(unittest.TestCase): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate)) # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples] diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index cea2801f09..560a8af6d9 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -34,10 +34,10 @@ from transformers.testing_utils import ( is_torchaudio_available, require_flash_attn, require_pyctcdecode, - require_soundfile, require_torch, require_torch_gpu, require_torchaudio, + require_torchcodec, run_test_in_subprocess, slow, torch_device, @@ -1444,7 +1444,7 @@ class Wav2Vec2UtilsTest(unittest.TestCase): @require_torch -@require_soundfile +@require_torchcodec @slow class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def tearDown(self): diff --git a/tests/models/whisper/test_feature_extraction_whisper.py 
b/tests/models/whisper/test_feature_extraction_whisper.py index ec8748b32e..0834edb4e2 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -254,7 +254,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. def _load_datasamples(self, num_samples): ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 25b9c2316e..df442342ee 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1460,7 +1460,7 @@ class WhisperModelIntegrationTests(unittest.TestCase): def _load_datasamples(self, num_samples): self._load_dataset() ds = self._dataset - speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = ds.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] @slow diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index a9977d912c..0e3f2246cc 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -1190,7 +1190,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): num_beams=1, ) - transcription_non_ass = pipe(sample.copy(), generate_kwargs={"assistant_model": assistant_model})["text"] + transcription_non_ass = pipe(sample, generate_kwargs={"assistant_model": assistant_model})["text"] transcription_ass = pipe(sample)["text"] self.assertEqual(transcription_ass, transcription_non_ass) 
diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py index 7147a9c893..4d0459d9a8 100644 --- a/tests/utils/test_audio_utils.py +++ b/tests/utils/test_audio_utils.py @@ -278,7 +278,7 @@ class AudioUtilsFunctionTester(unittest.TestCase): if self._dataset is None: self._dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - speech_samples = self._dataset.sort("id").select(range(num_samples))[:num_samples]["audio"] + speech_samples = self._dataset.sort("id")[:num_samples]["audio"] return [x["array"] for x in speech_samples] def test_spectrogram_impulse(self): diff --git a/utils/print_env.py b/utils/print_env.py index e6d54fff2c..ea2b877355 100644 --- a/utils/print_env.py +++ b/utils/print_env.py @@ -72,3 +72,14 @@ try: print("Number of TF GPUs available:", len(tf.config.list_physical_devices("GPU"))) except ImportError: print("TensorFlow version:", None) + + +try: + import torchcodec + + versions = torchcodec._core.get_ffmpeg_library_versions() + print("FFmpeg version:", versions["ffmpeg_version"]) +except ImportError: + print("FFmpeg version:", None) +except (AttributeError, KeyError): + print("Failed to get FFmpeg version")