From cb38ffcc5e0ae2fac653342ac36dc75c15ea178f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 Feb 2021 17:42:46 +0300 Subject: [PATCH] [PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324) * push to show * small improvement * small improvement * Update src/transformers/feature_extraction_utils.py * Update src/transformers/feature_extraction_utils.py * implement base * add common tests * make all tests pass for wav2vec2 * make padding work & add more tests * finalize feature extractor utils * add call method to feature extraction * finalize feature processor * finish tokenizer * finish general processor design * finish tests * typo * remove bogus file * finish docstring * add docs * finish docs * small fix * correct docs * save intermediate * load changes * apply changes * apply changes to doc * change tests * apply surajs recommend * final changes * Apply suggestions from code review * fix typo * fix import * correct docstring --- docs/source/index.rst | 2 + docs/source/internal/file_utils.rst | 54 ++ docs/source/internal/tokenization_utils.rst | 6 - .../source/main_classes/feature_extractor.rst | 33 + docs/source/model_doc/wav2vec2.rst | 20 +- examples/multiple-choice/run_swag.py | 5 +- src/transformers/__init__.py | 26 +- src/transformers/data/data_collator.py | 9 +- src/transformers/feature_extraction_utils.py | 737 ++++++++++++++++++ src/transformers/file_utils.py | 86 +- .../models/auto/tokenization_auto.py | 4 +- .../models/dpr/tokenization_dpr.py | 8 +- .../models/dpr/tokenization_dpr_fast.py | 8 +- src/transformers/models/rag/retrieval_rag.py | 2 +- .../models/tapas/tokenization_tapas.py | 9 +- src/transformers/models/wav2vec2/__init__.py | 8 +- .../wav2vec2/feature_extraction_wav2vec2.py | 192 +++++ .../models/wav2vec2/modeling_wav2vec2.py | 30 +- .../models/wav2vec2/processing_wav2vec2.py | 142 ++++ .../models/wav2vec2/tokenization_wav2vec2.py | 217 +++++- .../pipelines/question_answering.py 
| 3 +- .../pipelines/table_question_answering.py | 2 +- src/transformers/tokenization_utils.py | 4 +- src/transformers/tokenization_utils_base.py | 112 +-- src/transformers/tokenization_utils_fast.py | 5 +- src/transformers/trainer_utils.py | 2 +- tests/test_feature_extraction_common.py | 284 +++++++ tests/test_feature_extraction_wav2vec2.py | 147 ++++ tests/test_modeling_wav2vec2.py | 21 +- tests/test_pipelines_common.py | 2 +- tests/test_processor_wav2vec2.py | 139 ++++ tests/test_tokenization_wav2vec2.py | 108 ++- utils/check_repo.py | 1 + 33 files changed, 2252 insertions(+), 176 deletions(-) create mode 100644 docs/source/internal/file_utils.rst create mode 100644 docs/source/main_classes/feature_extractor.rst create mode 100644 src/transformers/feature_extraction_utils.py create mode 100644 src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py create mode 100644 src/transformers/models/wav2vec2/processing_wav2vec2.py create mode 100644 tests/test_feature_extraction_common.py create mode 100644 tests/test_feature_extraction_wav2vec2.py create mode 100644 tests/test_processor_wav2vec2.py diff --git a/docs/source/index.rst b/docs/source/index.rst index f2b2463dc7..92293536ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -375,6 +375,7 @@ TensorFlow and/or Flax. main_classes/processors main_classes/tokenizer main_classes/trainer + main_classes/feature_extractor .. toctree:: :maxdepth: 2 @@ -441,3 +442,4 @@ TensorFlow and/or Flax. internal/tokenization_utils internal/trainer_utils internal/generation_utils + internal/file_utils diff --git a/docs/source/internal/file_utils.rst b/docs/source/internal/file_utils.rst new file mode 100644 index 0000000000..5122ed303b --- /dev/null +++ b/docs/source/internal/file_utils.rst @@ -0,0 +1,54 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + +General Utilities +----------------------------------------------------------------------------------------------------------------------- + +This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``. + +Most of those are only useful if you are studying the general code in the library. + + +Enums and namedtuples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.file_utils.ExplicitEnum + +.. autoclass:: transformers.file_utils.PaddingStrategy + +.. autoclass:: transformers.file_utils.TensorType + + +Special Decorators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: transformers.file_utils.add_start_docstrings + +.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward + +.. autofunction:: transformers.file_utils.add_end_docstrings + +.. autofunction:: transformers.file_utils.add_code_sample_docstrings + +.. autofunction:: transformers.file_utils.replace_return_docstrings + + +Special Properties +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.file_utils.cached_property + + +Other Utilities +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.file_utils._BaseLazyModule diff --git a/docs/source/internal/tokenization_utils.rst b/docs/source/internal/tokenization_utils.rst index b82adbf3b3..4198c552c8 100644 --- a/docs/source/internal/tokenization_utils.rst +++ b/docs/source/internal/tokenization_utils.rst @@ -38,12 +38,6 @@ SpecialTokensMixin Enums and namedtuples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum - -.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy - -.. autoclass:: transformers.tokenization_utils_base.TensorType - .. autoclass:: transformers.tokenization_utils_base.TruncationStrategy .. autoclass:: transformers.tokenization_utils_base.CharSpan diff --git a/docs/source/main_classes/feature_extractor.rst b/docs/source/main_classes/feature_extractor.rst new file mode 100644 index 0000000000..6d99cc2504 --- /dev/null +++ b/docs/source/main_classes/feature_extractor.rst @@ -0,0 +1,33 @@ +.. + Copyright 2021 The HuggingFace Team. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + specific language governing permissions and limitations under the License. + + +Feature Extractor +----------------------------------------------------------------------------------------------------------------------- + +A feature extractor is in charge of preparing read-in audio files for a speech model. 
This includes feature extraction, +such as processing audio files to, *e.g.*, Log-Mel Spectrogram features, but also padding, normalization, and +conversion to Numpy, PyTorch, and TensorFlow tensors. + + +PreTrainedFeatureExtractor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.PreTrainedFeatureExtractor + :members: from_pretrained, save_pretrained, pad + + +BatchFeature +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.BatchFeature + :members: diff --git a/docs/source/model_doc/wav2vec2.rst b/docs/source/model_doc/wav2vec2.rst index 3dd6e293b0..7f59639581 100644 --- a/docs/source/model_doc/wav2vec2.rst +++ b/docs/source/model_doc/wav2vec2.rst @@ -34,7 +34,7 @@ Tips: - Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded - using :class:`~transformers.Wav2Vec2Tokenizer`. + using :class:`~transformers.Wav2Vec2CTCTokenizer`. Wav2Vec2Config @@ -44,13 +44,27 @@ Wav2Vec2Config :members: -Wav2Vec2Tokenizer +Wav2Vec2CTCTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.Wav2Vec2Tokenizer +.. autoclass:: transformers.Wav2Vec2CTCTokenizer :members: __call__, save_vocabulary +Wav2Vec2FeatureExtractor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.Wav2Vec2FeatureExtractor + :members: __call__ + + +Wav2Vec2Processor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.Wav2Vec2Processor + :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor + + Wav2Vec2Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/examples/multiple-choice/run_swag.py b/examples/multiple-choice/run_swag.py index efe95247dc..659df6ab90 100755 --- a/examples/multiple-choice/run_swag.py +++ b/examples/multiple-choice/run_swag.py @@ -39,7 +39,8 @@ from transformers import ( default_data_collator, set_seed, ) -from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase +from transformers.file_utils import PaddingStrategy +from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer_utils import get_last_checkpoint, is_main_process @@ -133,7 +134,7 @@ class DataCollatorForMultipleChoice: Args: tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): The tokenizer used for encoding the data. 
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 62451be242..7f85f1f9bb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -88,6 +88,7 @@ _import_structure = { "TF_WEIGHTS_NAME", "TRANSFORMERS_CACHE", "WEIGHTS_NAME", + "TensorType", "add_end_docstrings", "add_start_docstrings", "cached_path", @@ -125,7 +126,14 @@ _import_structure = { ], "models": [], # Models - "models.wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", "Wav2Vec2Tokenizer"], + "models.wav2vec2": [ + "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "Wav2Vec2Config", + "Wav2Vec2CTCTokenizer", + "Wav2Vec2Tokenizer", + "Wav2Vec2FeatureExtractor", + "Wav2Vec2Processor", + ], "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ @@ -234,9 +242,9 @@ _import_structure = { "CharSpan", "PreTrainedTokenizerBase", "SpecialTokensMixin", - "TensorType", "TokenSpan", ], + "feature_extraction_utils": ["PreTrainedFeatureExtractor", "BatchFeature"], "trainer_callback": [ "DefaultFlowCallback", "EarlyStoppingCallback", @@ -1217,6 +1225,9 @@ if TYPE_CHECKING: xnli_tasks_num_labels, ) + # Feature Extractor + from .feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor + # Files and general utilities from .file_utils import ( CONFIG_NAME, @@ -1228,6 +1239,7 @@ if TYPE_CHECKING: TF_WEIGHTS_NAME, TRANSFORMERS_CACHE, WEIGHTS_NAME, + TensorType, add_end_docstrings, add_start_docstrings, cached_path, @@ -1343,7 
+1355,14 @@ if TYPE_CHECKING: TransfoXLCorpus, TransfoXLTokenizer, ) - from .models.wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, Wav2Vec2Tokenizer + from .models.wav2vec2 import ( + WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, + Wav2Vec2Config, + Wav2Vec2CTCTokenizer, + Wav2Vec2FeatureExtractor, + Wav2Vec2Processor, + Wav2Vec2Tokenizer, + ) from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig @@ -1381,7 +1400,6 @@ if TYPE_CHECKING: CharSpan, PreTrainedTokenizerBase, SpecialTokensMixin, - TensorType, TokenSpan, ) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 530d28306c..94eaade7b1 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -20,8 +20,9 @@ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union import torch from torch.nn.utils.rnn import pad_sequence +from ..file_utils import PaddingStrategy from ..modeling_utils import PreTrainedModel -from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase +from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase InputDataClass = NewType("InputDataClass", Any) @@ -89,7 +90,7 @@ class DataCollatorWithPadding: Args: tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): The tokenizer used for encoding the data. 
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -138,7 +139,7 @@ class DataCollatorForTokenClassification: Args: tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): The tokenizer used for encoding the data. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -238,7 +239,7 @@ class DataCollatorForSeq2Seq: prepare the `decoder_input_ids` This is useful when using `label_smoothing` to avoid calculating loss twice. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py new file mode 100644 index 0000000000..250a144313 --- /dev/null +++ b/src/transformers/feature_extraction_utils.py @@ -0,0 +1,737 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Feature extraction common class for python feature extractors. +""" +import copy +import json +import os +from collections import UserDict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from .file_utils import ( + FEATURE_EXTRACTOR_NAME, + PaddingStrategy, + TensorType, + _is_jax, + _is_numpy, + _is_tensorflow, + _is_torch, + _is_torch_device, + cached_path, + hf_bucket_url, + is_flax_available, + is_remote_url, + is_tf_available, + is_torch_available, + to_py_obj, + torch_required, +) +from .utils import logging + + +logger = logging.get_logger(__name__) + + +if TYPE_CHECKING: + if is_torch_available(): + import torch + + +class BatchFeature(UserDict): + r""" + Holds the output of the :meth:`~transformers.PreTrainedFeatureExtractor.pad` and feature extractor specific + ``__call__`` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (:obj:`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + etc.). + tensor_type (:obj:`Union[None, str, TensorType]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. 
+ """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + def __getitem__(self, item: str) -> Union[Any]: + """ + If the key is a string, returns the value of the dict associated to :obj:`key` ('input_values', + 'attention_mask', etc.). + """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based feature extractors") + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + # Copied from transformers.tokenization_utils_base.BatchEncoding.keys + def keys(self): + return self.data.keys() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.values + def values(self): + return self.data.values() + + # Copied from transformers.tokenization_utils_base.BatchEncoding.items + def items(self): + return self.data.items() + + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + + Args: + tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + The type of tensors to use. If :obj:`str`, should be one of the values of the enum + :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." 
+ ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + as_tensor = torch.tensor + is_tensor = torch.is_tensor + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = _is_jax + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_values": + raise ValueError("Unable to create tensor returning overflowing values of different lengths. ") + raise ValueError( + "Unable to create tensor, you should probably activate padding " + "with 'padding=True' to have batched tensors with the same length." + ) + + return self + + @torch_required + # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature + def to(self, device: Union[str, "torch.device"]) -> "BatchFeature": + """ + Send all values to device by calling :obj:`v.to(device)` (PyTorch only). + + Args: + device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. + + Returns: + :class:`~transformers.BatchFeature`: The same instance of :class:`~transformers.BatchFeature` after + modification. 
+ """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.") + return self + + +class PreTrainedFeatureExtractor: + """ + This is a general feature extraction class for speech recognition. + + Args: + feature_size (:obj:`int`): + The feature dimension of the extracted features. + sampling_rate (:obj:`int`): + The sampling rate at which the audio files should be digitized, expressed in Hertz (Hz). + padding_value (:obj:`float`): + The value that is used to fill the padding values / vectors. + """ + + def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs): + self.feature_size = feature_size + self.sampling_rate = sampling_rate + self.padding_value = padding_value + + self.padding_side = kwargs.pop("padding_side", "right") + self.return_attention_mask = kwargs.pop("return_attention_mask", True) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + @classmethod + def from_pretrained( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> "PreTrainedFeatureExtractor": + r""" + Instantiate a :class:`~transformers.PreTrainedFeatureExtractor` (or a derived class) from a pretrained feature + extractor. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. 
Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a feature extractor file saved using the + :func:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g., + ``./my_model_directory/``. + - a path or url to a saved feature extractor JSON `file`, e.g., + ``./my_model_directory/feature_extraction_config.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model feature extractor should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force to (re-)download the feature extractor files and override the cached versions + if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final feature extractor object. 
+ + If :obj:`True`, then this function returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where + `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not feature extractor + attributes: i.e., the part of ``kwargs`` which has not been used to update ``feature_extractor`` and is + otherwise ignored. + kwargs (:obj:`Dict[str, Any]`, `optional`): + The values in kwargs of any keys which are feature extractor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is + controlled by the ``return_unused_kwargs`` keyword parameter. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + + Returns: + :class:`~transformers.PreTrainedFeatureExtractor`: The feature extractor object instantiated from this + pretrained model. + + Examples:: + + # We can't directly instantiate the base class `PreTrainedFeatureExtractor` so let's show the examples on a + # derived class: Wav2Vec2FeatureExtractor + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h') # Download feature_extraction_config from huggingface.co and cache. + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/') # E.g. 
feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')` + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json') + feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False) + assert feature_extractor.return_attention_mask is False + feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, + foo=False, return_unused_kwargs=True) + assert feature_extractor.return_attention_mask is False + assert unused_kwargs == {'foo': False} + + """ + feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(feature_extractor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike]): + """ + Save a feature_extractor object to the directory ``save_directory``, so that it can be re-loaded using the + :func:`~transformers.PreTrainedFeatureExtractor.from_pretrained` class method. + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory where the feature extractor JSON file will be saved (will be created if it does not exist). 
+ """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME) + + self.to_json_file(output_feature_extractor_file) + logger.info(f"Configuration saved in {output_feature_extractor_file}") + + @classmethod + def get_feature_extractor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + :class:`~transformers.PreTrainedFeatureExtractor` using ``from_dict``. + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor + object. 
+ """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + feature_extractor_file = pretrained_model_name_or_path + else: + feature_extractor_file = hf_bucket_url( + pretrained_model_name_or_path, filename=FEATURE_EXTRACTOR_NAME, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_feature_extractor_file = cached_path( + feature_extractor_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + ) + # Load feature_extractor dict + with open(resolved_feature_extractor_file, "r", encoding="utf-8") as reader: + text = reader.read() + feature_extractor_dict = json.loads(text) + + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load feature extractor for '{pretrained_model_name_or_path}'. 
Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {FEATURE_EXTRACTOR_NAME} file\n\n" + ) + raise EnvironmentError(msg) + + except json.JSONDecodeError: + msg = ( + f"Couldn't reach server at '{feature_extractor_file}' to download feature extractor configuration file or " + "feature extractor configuration file is not a valid JSON file. " + f"Please check network or file content here: {resolved_feature_extractor_file}." + ) + raise EnvironmentError(msg) + + if resolved_feature_extractor_file == feature_extractor_file: + logger.info(f"loading feature extractor configuration file {feature_extractor_file}") + else: + logger.info( + f"loading feature extractor configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}" + ) + + return feature_extractor_dict, kwargs + + @classmethod + def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "PreTrainedFeatureExtractor": + """ + Instantiates a :class:`~transformers.PreTrainedFeatureExtractor` from a Python dictionary of parameters. + + Args: + feature_extractor_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + :func:`~transformers.PreTrainedFeatureExtractor.to_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the feature extractor object. + + Returns: + :class:`~transformers.PreTrainedFeatureExtractor`: The feature extractor object instantiated from those + parameters. 
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + feature_extractor = cls(**feature_extractor_dict) + + # Update feature_extractor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(feature_extractor, key): + setattr(feature_extractor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Feature extractor {feature_extractor}") + if return_unused_kwargs: + return feature_extractor, kwargs + else: + return feature_extractor + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance. + """ + output = copy.deepcopy(self.__dict__) + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PreTrainedFeatureExtractor": + """ + Instantiates a :class:`~transformers.PreTrainedFeatureExtractor` from the path to a JSON file of parameters. + + Args: + json_file (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + :class:`~transformers.PreTrainedFeatureExtractor`: The feature_extractor object instantiated from that JSON + file. + + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + feature_extractor_dict = json.loads(text) + return cls(**feature_extractor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + :obj:`str`: String containing all the attributes that make up this feature_extractor instance in JSON + format. + """ + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. 
def pad(
    self,
    processed_features: Union[
        BatchFeature,
        List[BatchFeature],
        Dict[str, BatchFeature],
        Dict[str, List[BatchFeature]],
        List[Dict[str, BatchFeature]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
    """
    Pad input values / input vectors or a batch of input values / input vectors up to a predefined length or to the
    max sequence length in the batch.

    The padding side (left/right) and the padding value are defined at the feature extractor level (with
    ``self.padding_side`` and ``self.padding_value``).

    .. note::

        If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
        the result will use the same type unless you provide a different tensor type with ``return_tensors``. In
        the case of PyTorch tensors, you will lose the specific device of your tensors however.

    Args:
        processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`):
            Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str,
            List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`,
            `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
            preprocessing as well as in a PyTorch Dataloader collate function.

            Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
            tensors), see the note above for the return type.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:

            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding
              if only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length`. A
              :obj:`ValueError` is raised if :obj:`max_length` is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
        return_attention_mask (:obj:`bool`, `optional`):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific feature_extractor's default (``self.return_attention_mask``).

            `What are attention masks? <../glossary.html#attention-mask>`__
        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
            * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
            * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.

    Returns:
        :class:`~transformers.BatchFeature`: The padded features. When the main input is empty, the (possibly
        re-keyed) input mapping is returned as is, with an empty ``attention_mask`` added if one was requested.

    Raises:
        :obj:`ValueError`: If the main model input is missing from ``processed_features``, if the element type of
        the main input is not a python, numpy, PyTorch or TensorFlow object, or if the padding arguments are
        inconsistent (see :meth:`_get_padding_strategies`).
    """
    main_input_name = self.model_input_names[0]

    # If we have a list of dicts, let's convert it in a dict of lists
    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
    if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
        processed_features = {
            key: [example[key] for example in processed_features] for key in processed_features[0].keys()
        }

    # The model's main input name, usually `input_values`, has to be passed for padding
    if main_input_name not in processed_features:
        raise ValueError(
            "You should supply an instance of :class:`~transformers.BatchFeature` or list of :class:`~transformers.BatchFeature` to this method "
            f"that includes {main_input_name}, but you provided {list(processed_features.keys())}"
        )

    required_input = processed_features[main_input_name]
    return_attention_mask = (
        return_attention_mask if return_attention_mask is not None else self.return_attention_mask
    )

    if not required_input:
        if return_attention_mask:
            processed_features["attention_mask"] = []
        return processed_features

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first element of the
        # first non-empty sequence. The search is bounded so that a batch made only of empty sequences no
        # longer runs past the end of the batch (IndexError).
        for sequence in required_input:
            if len(sequence) != 0:
                first_element = sequence[0]
                break
    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (float, int, list, tuple)):
        if is_tf_available() and _is_tensorflow(first_element):
            return_tensors = "tf" if return_tensors is None else return_tensors
        elif is_torch_available() and _is_torch(first_element):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy, pytorch or tensorflow object."
            )

        for key, value in processed_features.items():
            processed_features[key] = to_py_obj(value)

    # Convert padding_strategy in PaddingStrategy
    padding_strategy, max_length, _ = self._get_padding_strategies(padding=padding, max_length=max_length)

    required_input = processed_features[main_input_name]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        # A single (un-batched) example: pad it directly.
        processed_features = self._pad(
            processed_features,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchFeature(processed_features, tensor_type=return_tensors)

    batch_size = len(required_input)
    # NOTE: kept as an assertion so the exception type seen by existing callers does not change.
    assert all(
        len(v) == batch_size for v in processed_features.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding_strategy == PaddingStrategy.LONGEST:
        # Resolve "longest" once for the whole batch, then pad every example to that fixed length.
        max_length = max(len(inputs) for inputs in required_input)
        padding_strategy = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = {k: v[i] for k, v in processed_features.items()}
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in outputs.items():
            batch_outputs.setdefault(key, []).append(value)

    return BatchFeature(batch_outputs, tensor_type=return_tensors)


def _pad(
    self,
    processed_features: Union[Dict[str, List[float]], BatchFeature],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """
    Pad a single example (on left/right and up to predefined length or max length in the batch).

    The ``processed_features`` mapping is updated in place and returned.

    Args:
        processed_features: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`)
        max_length: maximum length of the returned list and optionally padding length (see below)
        padding_strategy: PaddingStrategy to use for padding.

            - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
            - PaddingStrategy.MAX_LENGTH: Pad to the max length
            - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
            The feature_extractor padding sides are defined in self.padding_side:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
        pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
            >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
        return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)

    Raises:
        ValueError: If padding is required and ``self.padding_side`` is neither ``"left"`` nor ``"right"``.
    """
    main_input_name = self.model_input_names[0]
    required_input = processed_features[main_input_name]
    input_length = len(required_input)

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = input_length

    # Round the target length up to the next multiple of `pad_to_multiple_of`.
    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and input_length != max_length

    if needs_to_be_padded:
        # May be negative when the input is longer than `max_length`; every construct below then yields an
        # empty padding, leaving the input untouched.
        difference = max_length - input_length
        if self.feature_size > 1:
            # Build a fresh vector per padded frame so the frames do not alias one shared list object.
            padding_frames = [self.feature_size * [self.padding_value] for _ in range(difference)]
        else:
            padding_frames = [self.padding_value] * difference

        if self.padding_side == "right":
            if return_attention_mask:
                processed_features["attention_mask"] = [1] * input_length + [0] * difference
            processed_features[main_input_name] = required_input + padding_frames
        elif self.padding_side == "left":
            if return_attention_mask:
                processed_features["attention_mask"] = [0] * difference + [1] * input_length
            processed_features[main_input_name] = padding_frames + required_input
        else:
            raise ValueError("Invalid padding side: " + str(self.padding_side))
    elif return_attention_mask and "attention_mask" not in processed_features:
        processed_features["attention_mask"] = [1] * input_length

    return processed_features


def _get_padding_strategies(self, padding=False, max_length=None, pad_to_multiple_of=None, **kwargs):
    """
    Find the correct padding strategy.

    Args:
        padding: :obj:`False` (no padding), :obj:`True` (pad to the longest sequence), a
            :class:`~transformers.file_utils.PaddingStrategy` member or one of its string values.
        max_length: Target length; required when the strategy is ``PaddingStrategy.MAX_LENGTH``.
        pad_to_multiple_of: Accepted for signature compatibility; not used by this method.
        kwargs: Returned untouched as the third element of the result.

    Returns:
        A tuple ``(padding_strategy, max_length, kwargs)``.

    Raises:
        ValueError: If ``padding`` is not a valid strategy, if the strategy is ``MAX_LENGTH`` without a
            ``max_length``, or if padding is requested while ``self.padding_value`` is :obj:`None`.
    """

    # Get padding strategy
    if padding is False:
        padding_strategy = PaddingStrategy.DO_NOT_PAD
    elif padding is True:
        padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
    else:
        # Looking up an existing member returns that member, so this covers both strings and enum members.
        padding_strategy = PaddingStrategy(padding)

    # Set max length if needed
    if max_length is None and padding_strategy == PaddingStrategy.MAX_LENGTH:
        raise ValueError(
            f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
        )

    # Test if we have a padding value
    if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
        raise ValueError(
            "Asking to pad but the feature_extractor does not have a padding value. "
            "Please select a value to use as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
        )

    return padding_strategy, max_length, kwargs
torch.device) + + +def _is_tensorflow(x): + import tensorflow as tf + + return isinstance(x, tf.Tensor) + + +def _is_jax(x): + import jax.numpy as jnp # noqa: F811 + + return isinstance(x, jnp.ndarray) + + +def to_py_obj(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [to_py_obj(o) for o in obj] + elif is_tf_available() and _is_tensorflow(obj): + return obj.numpy().tolist() + elif is_torch_available() and _is_torch(obj): + return obj.detach().cpu().tolist() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj + + class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like @@ -1489,6 +1537,42 @@ class ModelOutput(OrderedDict): return tuple(self[k] for k in self.keys()) +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + "%r is not a valid %s, please select one of %s" + % (value, cls.__name__, str(list(cls._value2member_map_.keys()))) + ) + + +class PaddingStrategy(ExplicitEnum): + """ + Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion + in an IDE. + """ + + LONGEST = "longest" + MAX_LENGTH = "max_length" + DO_NOT_PAD = "do_not_pad" + + +class TensorType(ExplicitEnum): + """ + Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for + tab-completion in an IDE. + """ + + PYTORCH = "pt" + TENSORFLOW = "tf" + NUMPY = "np" + JAX = "jax" + + class _BaseLazyModule(ModuleType): """ Module class that surfaces all objects but only performs associated imports when the objects are requested. 
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 3151c0e971..0e42241f01 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -52,7 +52,7 @@ from ..roberta.tokenization_roberta import RobertaTokenizer from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer from ..tapas.tokenization_tapas import TapasTokenizer from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer -from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2Tokenizer +from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer from ..xlm.tokenization_xlm import XLMTokenizer from .configuration_auto import ( AlbertConfig, @@ -244,7 +244,7 @@ TOKENIZER_MAPPING = OrderedDict( (TapasConfig, (TapasTokenizer, None)), (LEDConfig, (LEDTokenizer, LEDTokenizerFast)), (ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)), - (Wav2Vec2Config, (Wav2Vec2Tokenizer, None)), + (Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)), ] ) diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py index b6416ae528..705fd064a8 100644 --- a/src/transformers/models/dpr/tokenization_dpr.py +++ b/src/transformers/models/dpr/tokenization_dpr.py @@ -18,8 +18,8 @@ import collections from typing import List, Optional, Union -from ...file_utils import add_end_docstrings, add_start_docstrings -from ...tokenization_utils_base import BatchEncoding, TensorType +from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings +from ...tokenization_utils_base import BatchEncoding from ...utils import logging from ..bert.tokenization_bert import BertTokenizer @@ -147,7 +147,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" The passages titles to be encoded. This can be a string or a list of strings if there are several passages. texts (:obj:`str` or :obj:`List[str]`): The passages texts to be encoded. 
This can be a string or a list of strings if there are several passages. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single @@ -177,7 +177,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated. - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. 
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py index 6c854006af..12a990041a 100644 --- a/src/transformers/models/dpr/tokenization_dpr_fast.py +++ b/src/transformers/models/dpr/tokenization_dpr_fast.py @@ -18,8 +18,8 @@ import collections from typing import List, Optional, Union -from ...file_utils import add_end_docstrings, add_start_docstrings -from ...tokenization_utils_base import BatchEncoding, TensorType +from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings +from ...tokenization_utils_base import BatchEncoding from ...utils import logging from ..bert.tokenization_bert_fast import BertTokenizerFast from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer @@ -148,7 +148,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" The passages titles to be encoded. This can be a string or a list of strings if there are several passages. texts (:obj:`str` or :obj:`List[str]`): The passages texts to be encoded. This can be a string or a list of strings if there are several passages. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single @@ -178,7 +178,7 @@ CUSTOM_DPR_READER_DOCSTRING = r""" If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated. 
- return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index ff85560e59..12ad21ac43 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -573,7 +573,7 @@ class RagRetriever: The prefix used by the generator's tokenizer. n_docs (:obj:`int`, `optional`): The number of docs retrieved per query. - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to "pt"): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. 
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 4227ada6eb..6fe7737cc5 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -28,16 +28,13 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union import numpy as np -from ...file_utils import add_end_docstrings, is_pandas_available +from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, BatchEncoding, EncodedInput, - ExplicitEnum, - PaddingStrategy, PreTokenizedInput, - TensorType, TextInput, ) from ...utils import logging @@ -151,7 +148,7 @@ def whitespace_tokenize(text): TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to encode the sequences with the special tokens relative to their model. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a @@ -180,7 +177,7 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" pad_to_multiple_of (:obj:`int`, `optional`): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
- return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/models/wav2vec2/__init__.py b/src/transformers/models/wav2vec2/__init__.py index 22066fadf8..37456c17aa 100644 --- a/src/transformers/models/wav2vec2/__init__.py +++ b/src/transformers/models/wav2vec2/__init__.py @@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_ava _import_structure = { "configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"], - "tokenization_wav2vec2": ["Wav2Vec2Tokenizer"], + "tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"], + "feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"], + "processing_wav2vec2": ["Wav2Vec2Processor"], } if is_torch_available(): @@ -37,7 +39,9 @@ if is_torch_available(): if TYPE_CHECKING: from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config - from .tokenization_wav2vec2 import Wav2Vec2Tokenizer + from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor + from .processing_wav2vec2 import Wav2Vec2Processor + from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer if is_torch_available(): from .modeling_wav2vec2 import ( diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py new file mode 100644 index 0000000000..9fd3f9a4c1 --- /dev/null +++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py @@ -0,0 +1,192 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. 
class Wav2Vec2FeatureExtractor(PreTrainedFeatureExtractor):
    r"""
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from :class:`~transformers.PreTrainedFeatureExtractor` which contains most of the
    main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (:obj:`int`, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (:obj:`int`, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz).
        padding_value (:obj:`float`, defaults to 0.0):
            The value that is used to fill the padding values.
        return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` should return
            :obj:`attention_mask`.

            .. note::

                Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as ``wav2vec2-base``,
                have **not** been trained using :obj:`attention_mask`. For such models, :obj:`input_values` should
                simply be padded with 0 and no :obj:`attention_mask` should be passed.

                For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as
                ``wav2vec2-lv60``, :obj:`attention_mask` should be passed for batched inference.
        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*, ``wav2vec2-lv60``.
    """

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=False,
        do_normalize=True,
        **kwargs
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    @staticmethod
    def zero_mean_unit_var_norm(input_values: List[np.ndarray]) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance
        """
        # The small constant keeps the division finite for constant (zero-variance) inputs.
        return [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in input_values]

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Whether an :obj:`attention_mask` is returned is controlled by the ``return_attention_mask`` attribute set
        at construction time, not by an argument of this method.

        Args:
            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length`.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            sampling_rate (:obj:`int`, `optional`):
                The sampling rate at which the ``raw_speech`` input was sampled. It is strongly recommended to pass
                ``sampling_rate`` at the forward call to prevent silent errors.

        Returns:
            :class:`~transformers.BatchFeature`: The result of :meth:`pad` applied to the (optionally normalized)
            ``input_values``.

        Raises:
            :obj:`ValueError`: If ``sampling_rate`` is given and differs from ``self.sampling_rate``.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of {self.sampling_rate}. "
                    f"Please make sure that the provided `raw_speech` input was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        # A batch is a list/tuple whose first item is itself an array or a list/tuple.
        is_batched = bool(
            isinstance(raw_speech, (list, tuple))
            and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
        )

        # Always work on a list of numpy arrays (a single sequence becomes a batch of one).
        # `np.asarray` leaves existing arrays untouched, so every item can be converted uniformly.
        if is_batched:
            raw_speech = [np.asarray(speech) for speech in raw_speech]
        else:
            raw_speech = [np.asarray(raw_speech)]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            raw_speech = self.zero_mean_unit_var_norm(raw_speech)

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask,
            return_tensors=return_tensors,
        )

        return padded_inputs
+ :meth:`transformers.Wav2Vec2Processor.__call__` for details. attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0, 1]``: @@ -629,8 +629,8 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r""" `What are attention masks? <../glossary.html#attention-mask>`__ .. warning:: - :obj:`attention_mask` should only be passed if the corresponding tokenizer has - ``config.return_attention_mask == True``. For all models whose tokenizer has + :obj:`attention_mask` should only be passed if the corresponding processor has + ``config.return_attention_mask == True``. For all models whose processor has ``config.return_attention_mask == False``, such as `wav2vec2-base `__, :obj:`attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should @@ -682,11 +682,11 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): Example:: - >>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model + >>> from transformers import Wav2Vec2Processor, Wav2Vec2Model >>> from datasets import load_dataset >>> import soundfile as sf - >>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") >>> def map_to_array(batch): @@ -697,7 +697,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) - >>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 >>> hidden_states = model(input_values).last_hidden_state """ output_attentions = output_attentions if output_attentions is not 
None else self.config.output_attentions @@ -780,11 +780,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): Example:: - >>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model + >>> from transformers import Wav2Vec2Processor, Wav2Vec2Model >>> from datasets import load_dataset >>> import soundfile as sf - >>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h") >>> def map_to_array(batch): @@ -795,11 +795,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) - >>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 + >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 >>> logits = model(input_values).logits >>> predicted_ids = torch.argmax(logits, dim=-1) - >>> transcription = tokenizer.decode(predicted_ids[0]) + >>> transcription = processor.decode(predicted_ids[0]) """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -856,11 +856,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): Example:: >>> import torch - >>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC + >>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC >>> from datasets import load_dataset >>> import soundfile as sf - >>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h") + >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") >>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") >>> def map_to_array(batch): @@ -871,11 +871,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") 
class Wav2Vec2Processor:
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.

    :class:`~transformers.Wav2Vec2Processor` offers all the functionalities of
    :class:`~transformers.Wav2Vec2FeatureExtractor` and :class:`~transformers.Wav2Vec2CTCTokenizer`. See the docstring
    of :meth:`~transformers.Wav2Vec2Processor.__call__` and :meth:`~transformers.Wav2Vec2Processor.decode` for more
    information.

    Args:
        feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
            An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
        tokenizer (:obj:`Wav2Vec2CTCTokenizer`):
            An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
    """

    def __init__(self, feature_extractor, tokenizer):
        # Interpolate the expected class itself: `SomeClass.__class__` is the metaclass and would render as
        # "<class 'type'>", which tells the caller nothing about what was expected.
        if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
            raise ValueError(
                f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor}, but is {type(feature_extractor)}"
            )
        if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
            raise ValueError(f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer}, but is {type(tokenizer)}")

        self.feature_extractor = feature_extractor
        self.tokenizer = tokenizer
        # `__call__` dispatches to whichever object is stored here; the feature extractor is the default.
        self.current_processor = self.feature_extractor

    def save_pretrained(self, save_directory):
        """
        Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory ``save_directory``, so
        that it can be re-loaded using the :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.

        .. note::

            This method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            save_directory (:obj:`str` or :obj:`os.PathLike`):
                Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                be created if it does not exist).
        """

        self.feature_extractor.save_pretrained(save_directory)
        self.tokenizer.save_pretrained(save_directory)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        r"""
        Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.

        .. note::

            This class method is simply calling Wav2Vec2FeatureExtractor's
            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Wav2Vec2CTCTokenizer's
            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
            docstrings of the methods above for more information.

        Args:
            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
                This can be either:

                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing a feature extractor file saved using the
                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
                  ``./my_model_directory/``.
                - a path or url to a saved feature extractor JSON `file`, e.g.,
                  ``./my_model_directory/feature_extraction_config.json``.
            **kwargs
                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
                :class:`~transformers.PreTrainedTokenizer`

        Returns:
            A new instance of ``cls`` wrapping the loaded feature extractor and tokenizer.
        """
        # The same name/path and the same kwargs are handed to both loaders.
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
        :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
        :meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
        Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
        the above two methods for more information.
        """
        return self.current_processor(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Wav2Vec2CTCTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to Wav2Vec2CTCTokenizer's
        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
        information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        Wav2Vec2.

        On leaving the ``with`` block, including when the block raises, the feature extractor is made the current
        processor again.
        """
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # Without the `finally`, an exception raised inside the `with` body would leave the processor
            # permanently routing `__call__` to the tokenizer.
            self.current_processor = self.feature_extractor
class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):

    """
    Constructs a Wav2Vec2CTC tokenizer.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
    Users should refer to the superclass for more information regarding such methods.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
            The beginning of sentence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`""`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`""`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`""`):
            The token used for padding, for example when batching sequences of different lengths. It is also the
            token that :meth:`convert_tokens_to_string` removes as the CTC blank.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for defining the end of a word.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to accept lowercase input and lowercase the output when decoding.

        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = {
        "vocab_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
        },
        "tokenizer_config_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
        },
    }
    # Wav2Vec2 has no max input length
    max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
    model_input_names = ["input_ids", "attention_mask"]

    # NOTE(review): the four special-token defaults below are all the empty string, which makes them
    # indistinguishable from one another — confirm that no angle-bracketed token text was lost here.
    def __init__(
        self,
        vocab_file,
        bos_token="",
        eos_token="",
        unk_token="",
        pad_token="",
        word_delimiter_token="|",
        do_lower_case=False,
        **kwargs
    ):
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            do_lower_case=do_lower_case,
            word_delimiter_token=word_delimiter_token,
            **kwargs,
        )

        self._word_delimiter_token = word_delimiter_token

        self.do_lower_case = do_lower_case

        # `encoder` maps token -> id (loaded from the JSON vocabulary); `decoder` is its inverse.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def word_delimiter_token(self) -> Optional[str]:
        """
        :obj:`str`: Word delimiter token. Log an error if used while not having been set (only when
        ``self.verbose`` is true) and return :obj:`None` in that case.
        """
        if self._word_delimiter_token is None:
            # Return None whether or not we log; falling through would return the literal string "None".
            if self.verbose:
                logger.error("Using word_delimiter_token, but it is not set yet.")
            return None
        return str(self._word_delimiter_token)

    @property
    def word_delimiter_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._word_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.word_delimiter_token)

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        self._word_delimiter_token = value

    @word_delimiter_token_id.setter
    def word_delimiter_token_id(self, value):
        # The incoming value is an id, while the backing field stores the token string, so convert id -> token.
        self._word_delimiter_token = self.convert_ids_to_tokens(value) if value is not None else None

    @property
    def vocab_size(self) -> int:
        """:obj:`int`: Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.decoder)

    def get_vocab(self) -> Dict:
        """Return the token -> id mapping of the base vocabulary merged with ``self.added_tokens_encoder``."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer.

        The text is split into single characters, with every space replaced by the word delimiter token.
        """
        if self.do_lower_case:
            # Intentional: with `do_lower_case` the input is upper-cased here and the decoded output is
            # lower-cased again in `convert_tokens_to_string`.
            text = text.upper()

        return list(text.replace(" ", self.word_delimiter_token))

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) in an index (integer) using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(
        self, tokens: List[str], group_tokens: bool = True, spaces_between_special_tokens: bool = False
    ) -> str:
        """
        Converts a connectionist-temporal-classification (CTC) output tokens into a single string.

        Args:
            tokens (:obj:`List[str]`): The token sequence to collapse.
            group_tokens (:obj:`bool`): Whether to merge runs of identical consecutive tokens into one token.
            spaces_between_special_tokens (:obj:`bool`): Whether the remaining tokens are joined with a space
                instead of being concatenated directly.
        """
        # group same tokens into non-repeating tokens in CTC style decoding
        if group_tokens:
            tokens = [token_group[0] for token_group in groupby(tokens)]

        # filter self.pad_token which is used as CTC-blank token
        pad_token = self.pad_token
        filtered_tokens = [token for token in tokens if token != pad_token]

        join_token = " " if spaces_between_special_tokens else ""

        # replace delimiter token
        word_delimiter = self.word_delimiter_token
        string = join_token.join([" " if token == word_delimiter else token for token in filtered_tokens]).strip()

        if self.do_lower_case:
            string = string.lower()
        return string

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """Prefix ``text`` with a space when ``is_split_into_words`` is set; return ``(text, kwargs)``."""
        if is_split_into_words:
            text = " " + text
        return (text, kwargs)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        group_tokens: bool = True,
        spaces_between_special_tokens: bool = False,
    ) -> str:
        """
        special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the
        same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
        the whole token list and not individually on added tokens
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        if skip_special_tokens:
            # `filtered_tokens` holds token strings, so they must be compared against the special *tokens*; a
            # comparison against `all_special_ids` (integers) can never match. Looked up once, outside the loop.
            special_tokens = set(self.all_special_tokens)
            result = [token for token in filtered_tokens if token not in special_tokens]
        else:
            result = list(filtered_tokens)

        text = self.convert_tokens_to_string(
            result, group_tokens=group_tokens, spaces_between_special_tokens=spaces_between_special_tokens
        )

        if clean_up_tokenization_spaces:
            return self.clean_up_tokenization(text)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the base vocabulary as JSON into ``save_directory``.

        Returns a one-element tuple with the path of the written file. If ``save_directory`` is not an existing
        directory, an error is logged and :obj:`None` is returned instead.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        return (vocab_file,)
TableQuestionAnsweringPipeline(Pipeline): Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the inference to be done sequentially to extract relations within sequences, given their conversational nature. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 58aa2848ad..b5f55faf35 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -21,7 +21,7 @@ import re import unicodedata from typing import Any, Dict, List, Optional, Tuple, Union, overload -from .file_utils import add_end_docstrings +from .file_utils import PaddingStrategy, TensorType, add_end_docstrings from .tokenization_utils_base import ( ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, @@ -30,11 +30,9 @@ from .tokenization_utils_base import ( BatchEncoding, EncodedInput, EncodedInputPair, - PaddingStrategy, PreTokenizedInput, PreTokenizedInputPair, PreTrainedTokenizerBase, - TensorType, TextInput, TextInputPair, TruncationStrategy, diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index d4825bcbae..ebe27b6829 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -25,7 +25,6 @@ import warnings from collections import OrderedDict, UserDict from contextlib import contextmanager from dataclasses import dataclass, field -from enum import Enum from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, 
Tuple, Union import numpy as np @@ -33,6 +32,14 @@ import numpy as np import requests from .file_utils import ( + ExplicitEnum, + PaddingStrategy, + TensorType, + _is_jax, + _is_numpy, + _is_tensorflow, + _is_torch, + _is_torch_device, add_end_docstrings, cached_path, hf_bucket_url, @@ -41,6 +48,7 @@ from .file_utils import ( is_tf_available, is_tokenizers_available, is_torch_available, + to_py_obj, torch_required, ) from .utils import logging @@ -55,34 +63,6 @@ if TYPE_CHECKING: import jax.numpy as jnp # noqa: F401 -def _is_numpy(x): - return isinstance(x, np.ndarray) - - -def _is_torch(x): - import torch - - return isinstance(x, torch.Tensor) - - -def _is_torch_device(x): - import torch - - return isinstance(x, torch.device) - - -def _is_tensorflow(x): - import tensorflow as tf - - return isinstance(x, tf.Tensor) - - -def _is_jax(x): - import jax.numpy as jnp # noqa: F811 - - return isinstance(x, jnp.ndarray) - - if is_tokenizers_available(): from tokenizers import AddedToken from tokenizers import Encoding as EncodingFast @@ -134,19 +114,6 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json" FULL_TOKENIZER_FILE = "tokenizer.json" -class ExplicitEnum(Enum): - """ - Enum with more explicit error message for missing values. - """ - - @classmethod - def _missing_(cls, value): - raise ValueError( - "%r is not a valid %s, please select one of %s" - % (value, cls.__name__, str(list(cls._value2member_map_.keys()))) - ) - - class TruncationStrategy(ExplicitEnum): """ Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for @@ -159,29 +126,6 @@ class TruncationStrategy(ExplicitEnum): DO_NOT_TRUNCATE = "do_not_truncate" -class PaddingStrategy(ExplicitEnum): - """ - Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion - in an IDE. 
- """ - - LONGEST = "longest" - MAX_LENGTH = "max_length" - DO_NOT_PAD = "do_not_pad" - - -class TensorType(ExplicitEnum): - """ - Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for - tab-completion in an IDE. - """ - - PYTORCH = "pt" - TENSORFLOW = "tf" - NUMPY = "np" - JAX = "jax" - - class CharSpan(NamedTuple): """ Character span in the original string. @@ -208,24 +152,6 @@ class TokenSpan(NamedTuple): end: int -def to_py_obj(obj): - """ - Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. - """ - if isinstance(obj, (dict, BatchEncoding)): - return {k: to_py_obj(v) for k, v in obj.items()} - elif isinstance(obj, (list, tuple)): - return [to_py_obj(o) for o in obj] - elif is_tf_available() and _is_tensorflow(obj): - return obj.numpy().tolist() - elif is_torch_available() and _is_torch(obj): - return obj.detach().cpu().tolist() - elif isinstance(obj, np.ndarray): - return obj.tolist() - else: - return obj - - class BatchEncoding(UserDict): """ Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and @@ -715,9 +641,9 @@ class BatchEncoding(UserDict): Convert the inner content to tensors. Args: - tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): The type of tensors to use. If :obj:`str`, should be one of the values of the enum - :class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done. + :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`): Whether or not to add the batch dimension during the conversion. 
""" @@ -810,9 +736,7 @@ class BatchEncoding(UserDict): if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): self.data = {k: v.to(device=device) for k, v in self.data.items()} else: - logger.warning( - f"Attempting to cast a BatchEncoding to another type, {str(device)}. This is not supported." - ) + logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self @@ -1321,7 +1245,7 @@ class SpecialTokensMixin: ENCODE_KWARGS_DOCSTRING = r""" add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to encode the sequences with the special tokens relative to their model. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a @@ -1362,7 +1286,7 @@ ENCODE_KWARGS_DOCSTRING = r""" pad_to_multiple_of (:obj:`int`, `optional`): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. @@ -2608,7 +2532,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type. 
- padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: @@ -2630,7 +2554,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are attention masks? <../glossary.html#attention-mask>`__ - return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. @@ -3260,7 +3184,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): max_target_length (:obj:`int`, `optional`): Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set to :obj:`None`, this will use the max_length value. - padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`): + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): Activates and controls padding. Accepts the following values: * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a @@ -3269,7 +3193,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): maximum acceptable input length for the model if that argument is not provided. * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths). 
- return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`): + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): If set, will return tensors instead of list of python integers. Acceptable values are: * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 34cc039908..2d33aa7a4e 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -27,13 +27,12 @@ from tokenizers import Tokenizer as TokenizerFast from tokenizers.decoders import Decoder as DecoderFast from .convert_slow_tokenizer import convert_slow_tokenizer -from .file_utils import add_end_docstrings +from .file_utils import PaddingStrategy, add_end_docstrings from .tokenization_utils import PreTrainedTokenizer from .tokenization_utils_base import ( INIT_TOKENIZER_DOCSTRING, AddedToken, BatchEncoding, - PaddingStrategy, PreTokenizedInput, PreTokenizedInputPair, PreTrainedTokenizerBase, @@ -308,7 +307,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): section. 
Args: - padding_strategy (:class:`~transformers.tokenization_utils_base.PaddingStrategy`): + padding_strategy (:class:`~transformers.file_utils.PaddingStrategy`): The kind of padding that will be applied to the input truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`): The kind of truncation that will be applied to the input diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 5e0f1ae948..76622d34a3 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -29,13 +29,13 @@ from typing import Any, Dict, NamedTuple, Optional, Tuple, Union import numpy as np from .file_utils import ( + ExplicitEnum, is_sagemaker_distributed_available, is_tf_available, is_torch_available, is_torch_cuda_available, is_torch_tpu_available, ) -from .tokenization_utils_base import ExplicitEnum def set_seed(seed: int): diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py new file mode 100644 index 0000000000..77b82019bd --- /dev/null +++ b/tests/test_feature_extraction_common.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json +import os +import tempfile + +import numpy as np + +from transformers import BatchFeature +from transformers.testing_utils import require_tf, require_torch + + +class FeatureExtractionMixin: + + # to overwrite at feature extractor specific tests + feat_extract_tester = None + feature_extraction_class = None + + @property + def feat_extract_dict(self): + return self.feat_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_common_properties(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feat_extract, "feature_size")) + self.assertTrue(hasattr(feat_extract, "sampling_rate")) + self.assertTrue(hasattr(feat_extract, "padding_value")) + + def test_feat_extract_to_json_string(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + obj = json.loads(feat_extract.to_json_string()) + for key, value in self.feat_extract_dict.items(): + self.assertEqual(obj[key], value) + + def test_feat_extract_to_json_file(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "feat_extract.json") + feat_extract_first.to_json_file(json_file_path) + feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path) + + self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) + + def test_feat_extract_from_and_save_pretrained(self): + feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + feat_extract_first.save_pretrained(tmpdirname) + feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname) + + self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict()) + + def test_init_without_params(self): + feat_extract = self.feature_extraction_class() + self.assertIsNotNone(feat_extract) + + def 
test_batch_feature(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name]))) + + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + @require_torch + def test_batch_feature_pt(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: + batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + @require_tf + def test_batch_feature_tf(self): + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True) + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf") + + batch_features_input = processed_features[input_name] + + if len(batch_features_input.shape) < 3: 
+ batch_features_input = batch_features_input[:, :, None] + + self.assertTrue( + batch_features_input.shape + == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size) + ) + + def _check_padding(self, numpify=False): + def _inputs_have_equal_length(input): + length = len(input[0]) + for input_slice in input[1:]: + if len(input_slice) != length: + return False + return True + + def _inputs_are_equal(input_1, input_2): + if len(input_1) != len(input_2): + return False + + for input_slice_1, input_slice_2 in zip(input_1, input_2): + if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3): + return False + return True + + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify) + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + pad_diff = self.feat_extract_tester.seq_length_diff + pad_max_length = self.feat_extract_tester.max_seq_length + pad_diff + pad_min_length = self.feat_extract_tester.min_seq_length + batch_size = self.feat_extract_tester.batch_size + feature_size = self.feat_extract_tester.feature_size + + # test padding for List[int] + numpy + input_1 = feat_extract.pad(processed_features, padding=False)[input_name] + input_2 = feat_extract.pad(processed_features, padding="longest")[input_name] + input_3 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[-1]))[ + input_name + ] + input_4 = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + + # max_length parameter has to be provided when setting `padding="max_length"` + with self.assertRaises(ValueError): + feat_extract.pad(processed_features, padding="max_length")[input_name] + + input_5 = feat_extract.pad( + processed_features, padding="max_length", max_length=pad_max_length, 
return_tensors="np" + )[input_name] + + self.assertFalse(_inputs_have_equal_length(input_1)) + self.assertTrue(_inputs_have_equal_length(input_2)) + self.assertTrue(_inputs_have_equal_length(input_3)) + self.assertTrue(_inputs_are_equal(input_2, input_3)) + self.assertTrue(len(input_1[0]) == pad_min_length) + self.assertTrue(len(input_1[1]) == pad_min_length + pad_diff) + self.assertTrue(input_4.shape[:2] == (batch_size, len(input_3[0]))) + self.assertTrue(input_5.shape[:2] == (batch_size, pad_max_length)) + + if feature_size > 1: + self.assertTrue(input_4.shape[2] == input_5.shape[2] == feature_size) + + # test padding for `pad_to_multiple_of` for List[int] + numpy + input_6 = feat_extract.pad(processed_features, pad_to_multiple_of=10)[input_name] + input_7 = feat_extract.pad(processed_features, padding="longest", pad_to_multiple_of=10)[input_name] + input_8 = feat_extract.pad( + processed_features, padding="max_length", pad_to_multiple_of=10, max_length=pad_max_length + )[input_name] + input_9 = feat_extract.pad( + processed_features, + padding="max_length", + pad_to_multiple_of=10, + max_length=pad_max_length, + return_tensors="np", + )[input_name] + + self.assertTrue(all(len(x) % 10 == 0 for x in input_6)) + self.assertTrue(_inputs_are_equal(input_6, input_7)) + + expected_mult_pad_length = pad_max_length if pad_max_length % 10 == 0 else (pad_max_length // 10 + 1) * 10 + self.assertTrue(all(len(x) == expected_mult_pad_length for x in input_8)) + self.assertEqual(input_9.shape[:2], (batch_size, expected_mult_pad_length)) + + if feature_size > 1: + self.assertTrue(input_9.shape[2] == feature_size) + + # Check padding value is correct + padding_vector_sum = (np.ones(self.feat_extract_tester.feature_size) * feat_extract.padding_value).sum() + self.assertTrue( + abs(np.asarray(input_2[0])[pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) + < 1e-3 + ) + self.assertTrue( + abs( + np.asarray(input_2[1])[pad_min_length + pad_diff :].sum() + 
- padding_vector_sum * (pad_max_length - pad_min_length - pad_diff) + ) + < 1e-3 + ) + self.assertTrue( + abs( + np.asarray(input_2[2])[pad_min_length + 2 * pad_diff :].sum() + - padding_vector_sum * (pad_max_length - pad_min_length - 2 * pad_diff) + ) + < 1e-3 + ) + self.assertTrue( + abs(input_5[0, pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) < 1e-3 + ) + self.assertTrue( + abs(input_9[0, pad_min_length:].sum() - padding_vector_sum * (expected_mult_pad_length - pad_min_length)) + < 1e-3 + ) + + def test_padding_from_list(self): + self._check_padding(numpify=False) + + def test_padding_from_array(self): + self._check_padding(numpify=True) + + @require_torch + def test_padding_accepts_tensors_pt(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name] + + self.assertTrue(abs(input_np.sum() - input_pt.numpy().sum()) < 1e-2) + + @require_tf + def test_padding_accepts_tensors_tf(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_name = feat_extract.model_input_names[0] + + processed_features = BatchFeature({input_name: speech_inputs}) + + input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name] + input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name] + + self.assertTrue(abs(input_np.sum() - input_tf.numpy().sum()) < 1e-2) + + def test_attention_mask(self): + feat_dict = self.feat_extract_dict + feat_dict["return_attention_mask"] = 
True + feat_extract = self.feature_extraction_class(**feat_dict) + speech_inputs = self.feat_extract_tester.prepare_inputs_for_common() + input_lenghts = [len(x) for x in speech_inputs] + input_name = feat_extract.model_input_names[0] + + processed = BatchFeature({input_name: speech_inputs}) + + processed = feat_extract.pad(processed, padding="longest", return_tensors="np") + self.assertIn("attention_mask", processed) + self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2])) + self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts) diff --git a/tests/test_feature_extraction_wav2vec2.py b/tests/test_feature_extraction_wav2vec2.py new file mode 100644 index 0000000000..179bafe613 --- /dev/null +++ b/tests/test_feature_extraction_wav2vec2.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2021 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import itertools +import random +import unittest + +import numpy as np + +from transformers import WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2Config, Wav2Vec2FeatureExtractor +from transformers.testing_utils import slow + +from .test_feature_extraction_common import FeatureExtractionMixin + + +global_rng = random.Random() + + +def floats_list(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + values = [] + for batch_idx in range(shape[0]): + values.append([]) + for _ in range(shape[1]): + values[-1].append(rng.random() * scale) + + return values + + +class Wav2Vec2FeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + min_seq_length=400, + max_seq_length=2000, + feature_size=1, + padding_value=0.0, + sampling_rate=16000, + return_attention_mask=True, + do_normalize=True, + ): + self.parent = parent + self.batch_size = batch_size + self.min_seq_length = min_seq_length + self.max_seq_length = max_seq_length + self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1) + self.feature_size = feature_size + self.padding_value = padding_value + self.sampling_rate = sampling_rate + self.return_attention_mask = return_attention_mask + self.do_normalize = do_normalize + + def prepare_feat_extract_dict(self): + return { + "feature_size": self.feature_size, + "padding_value": self.padding_value, + "sampling_rate": self.sampling_rate, + "return_attention_mask": self.return_attention_mask, + "do_normalize": self.do_normalize, + } + + def prepare_inputs_for_common(self, equal_length=False, numpify=False): + def _flatten(list_of_lists): + return list(itertools.chain(*list_of_lists)) + + if equal_length: + speech_inputs = floats_list((self.batch_size, self.max_seq_length)) + else: + speech_inputs = [ + _flatten(floats_list((x, self.feature_size))) + for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff) 
+ ] + + if numpify: + speech_inputs = [np.asarray(x) for x in speech_inputs] + + return speech_inputs + + +class Wav2Vec2FeatureExtractionTest(FeatureExtractionMixin, unittest.TestCase): + + feature_extraction_class = Wav2Vec2FeatureExtractor + + def setUp(self): + self.feat_extract_tester = Wav2Vec2FeatureExtractionTester(self) + + def test_call(self): + # Tests that all call wrap to encode_plus and batch_encode_plus + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + # create three inputs of length 800, 1000, and 1200 + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs] + + # Test not batched input + encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values + self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3)) + + # Test batched + encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values + encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values + for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2): + self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3)) + + def test_zero_mean_unit_variance_normalization(self): + feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) + speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)] + processed = feat_extract(speech_inputs, padding="longest") + input_values = processed.input_values + + def _check_zero_mean_unit_variance(input_vector): + self.assertTrue(np.abs(np.mean(input_vector)) < 1e-3) + self.assertTrue(np.abs(np.var(input_vector) - 1) < 1e-3) + + _check_zero_mean_unit_variance(input_values[0, :800]) + _check_zero_mean_unit_variance(input_values[1, :1000]) + 
_check_zero_mean_unit_variance(input_values[2]) + + @slow + def test_pretrained_checkpoints_are_set_correctly(self): + # this test makes sure that models that are using + # group norm don't have their feature extractor return the + # attention_mask + for model_id in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST: + config = Wav2Vec2Config.from_pretrained(model_id) + feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(model_id) + + # only "layer" feature extraction norm should make use of + # attention_mask + self.assertEqual(feat_extract.return_attention_mask, config.feat_extract_norm == "layer") diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index 5cb23672e4..cbf0583a3c 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -29,7 +29,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init if is_torch_available(): import torch - from transformers import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2ForMaskedLM, Wav2Vec2Model, Wav2Vec2Tokenizer + from transformers import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2ForMaskedLM, Wav2Vec2Model, Wav2Vec2Processor class Wav2Vec2ModelTester: @@ -324,17 +324,16 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def test_inference_ctc_normal(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") model.to(torch_device) - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) - + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) input_speech = self._load_datasamples(1) - input_values = tokenizer(input_speech, return_tensors="pt").input_values.to(torch_device) + input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device) with torch.no_grad(): logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = tokenizer.batch_decode(predicted_ids) + predicted_trans = 
processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"] self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS) @@ -342,11 +341,11 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def test_inference_ctc_normal_batched(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") model.to(torch_device) - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True) input_speech = self._load_datasamples(2) - inputs = tokenizer(input_speech, return_tensors="pt", padding=True, truncation=True) + inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True) input_values = inputs.input_values.to(torch_device) @@ -354,7 +353,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = tokenizer.batch_decode(predicted_ids) + predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", @@ -364,11 +363,11 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def test_inference_ctc_robust_batched(self): model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(torch_device) - tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) + processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True) input_speech = self._load_datasamples(4) - inputs = tokenizer(input_speech, return_tensors="pt", padding=True, truncation=True) + inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True) input_values = inputs.input_values.to(torch_device) attention_mask = inputs.attention_mask.to(torch_device) @@ -377,7 +376,7 @@ class 
Wav2Vec2ModelIntegrationTest(unittest.TestCase): logits = model(input_values, attention_mask=attention_mask).logits predicted_ids = torch.argmax(logits, dim=-1) - predicted_trans = tokenizer.batch_decode(predicted_ids) + predicted_trans = processor.batch_decode(predicted_ids) EXPECTED_TRANSCRIPTIONS = [ "a man said to the universe sir i exist", diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index c8a66053a3..bcd9f97e53 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -16,9 +16,9 @@ from typing import List, Optional from unittest import mock from transformers import is_tf_available, is_torch_available, pipeline +from transformers.file_utils import to_py_obj from transformers.pipelines import Pipeline from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow -from transformers.tokenization_utils_base import to_py_obj VALID_INPUTS = ["A simple string", ["list of strings"]] diff --git a/tests/test_processor_wav2vec2.py b/tests/test_processor_wav2vec2.py new file mode 100644 index 0000000000..7d30b06934 --- /dev/null +++ b/tests/test_processor_wav2vec2.py @@ -0,0 +1,139 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import shutil +import tempfile +import unittest + +from transformers.file_utils import FEATURE_EXTRACTOR_NAME +from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor +from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES + +from .test_feature_extraction_wav2vec2 import floats_list + + +class Wav2Vec2ProcessorTest(unittest.TestCase): + def setUp(self): + vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") + vocab_tokens = dict(zip(vocab, range(len(vocab)))) + + self.add_kwargs_tokens_map = { + "pad_token": "<pad>", + "unk_token": "<unk>", + "bos_token": "<s>", + "eos_token": "</s>", + } + feature_extractor_map = { + "feature_size": 1, + "padding_value": 0.0, + "sampling_rate": 16000, + "return_attention_mask": False, + "do_normalize": True, + } + + self.tmpdirname = tempfile.mkdtemp() + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + + with open(self.feature_extraction_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(feature_extractor_map) + "\n") + + def get_tokenizer(self, **kwargs): + kwargs.update(self.add_kwargs_tokens_map) + return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_feature_extractor(self, **kwargs): + return Wav2Vec2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_save_load_pretrained_default(self): + tokenizer = self.get_tokenizer() + feature_extractor = self.get_feature_extractor() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + processor.save_pretrained(self.tmpdirname) + processor = Wav2Vec2Processor.from_pretrained(self.tmpdirname) + + 
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab()) + self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) + + def test_save_load_pretrained_additional_features(self): + processor = Wav2Vec2Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0) + + processor = Wav2Vec2Processor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer) + + self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor) + + def test_feature_extractor(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + raw_speech = floats_list((3, 1000)) + + input_feat_extract = feature_extractor(raw_speech, return_tensors="np") + input_processor = processor(raw_speech, return_tensors="np") + + for key in input_feat_extract.keys(): + self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2) + + def test_tokenizer(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + input_str = "This is a test 
string" + + with processor.as_target_processor(): + encoded_processor = processor(input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_tokenizer_decode(self): + feature_extractor = self.get_feature_extractor() + tokenizer = self.get_tokenizer() + + processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) diff --git a/tests/test_tokenization_wav2vec2.py b/tests/test_tokenization_wav2vec2.py index 9b7c0c33b4..f7a5e4da16 100644 --- a/tests/test_tokenization_wav2vec2.py +++ b/tests/test_tokenization_wav2vec2.py @@ -23,11 +23,17 @@ import unittest import numpy as np -from transformers import WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST -from transformers.models.wav2vec2 import Wav2Vec2Config, Wav2Vec2Tokenizer +from transformers import ( + WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, + Wav2Vec2Config, + Wav2Vec2CTCTokenizer, + Wav2Vec2Tokenizer, +) from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES from transformers.testing_utils import slow +from .test_tokenization_common import TokenizerTesterMixin + global_rng = random.Random() @@ -345,3 +351,101 @@ class Wav2Vec2TokenizerTest(unittest.TestCase): # only "layer" feature extraction norm should make use of # attention_mask self.assertEqual(tokenizer.return_attention_mask, config.feat_extract_norm == "layer") + + +class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = Wav2Vec2CTCTokenizer + test_rust_tokenizer = False + + def setUp(self): + super().setUp() + + vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ") + vocab_tokens = dict(zip(vocab, 
range(len(vocab)))) + + self.special_tokens_map = {"pad_token": "<pad>", "unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"} + + self.tmpdirname = tempfile.mkdtemp() + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(vocab_tokens) + "\n") + + def get_tokenizer(self, **kwargs): + kwargs.update(self.special_tokens_map) + return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def test_tokenizer_decode(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + tokens = tokenizer.decode(sample_ids[0]) + batch_tokens = tokenizer.batch_decode(sample_ids) + self.assertEqual(tokens, batch_tokens[0]) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_special(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + + sample_ids = [ + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77], + ] + sample_ids_2 = [ + [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98], + [ + 24, + 22, + 5, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.pad_token_id, + tokenizer.word_delimiter_token_id, + 24, + 22, + 5, + 77, + tokenizer.word_delimiter_token_id, + ], + ] + + batch_tokens = tokenizer.batch_decode(sample_ids) + batch_tokens_2 = tokenizer.batch_decode(sample_ids_2) + self.assertEqual(batch_tokens, batch_tokens_2) + self.assertEqual(batch_tokens, ["HELLO", "BYE BYE"]) + + def test_tokenizer_decode_added_tokens(self): + tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") + tokenizer.add_tokens(["!", "?"]) + tokenizer.add_special_tokens({"cls_token": "$$$"}) + + sample_ids = [ + [ + 11, + 5, + 15, + 
tokenizer.pad_token_id, + 15, + 8, + 98, + 32, + 32, + 33, + tokenizer.word_delimiter_token_id, + 32, + 32, + 33, + 34, + 34, + ], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], + ] + batch_tokens = tokenizer.batch_decode(sample_ids) + + self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + + def test_pretrained_model_lists(self): + # Wav2Vec2Model has no max model length => no testing + pass diff --git a/utils/check_repo.py b/utils/check_repo.py index f9c25dabc1..c8881baa65 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -372,6 +372,7 @@ DEPRECATED_OBJECTS = [ "TextDataset", "TextDatasetForNextSentencePrediction", "Wav2Vec2ForMaskedLM", + "Wav2Vec2Tokenizer", "glue_compute_metrics", "glue_convert_examples_to_features", "glue_output_modes",