[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324)

* push to show

* small improvement

* small improvement

* Update src/transformers/feature_extraction_utils.py

* Update src/transformers/feature_extraction_utils.py

* implement base

* add common tests

* make all tests pass for wav2vec2

* make padding work & add more tests

* finalize feature extractor utils

* add call method to feature extraction

* finalize feature processor

* finish tokenizer

* finish general processor design

* finish tests

* typo

* remove bogus file

* finish docstring

* add docs

* finish docs

* small fix

* correct docs

* save intermediate

* load changes

* apply changes

* apply changes to doc

* change tests

* apply surajs recommend

* final changes

* Apply suggestions from code review

* fix typo

* fix import

* correct docstring
This commit is contained in:
Patrick von Platen
2021-02-25 17:42:46 +03:00
committed by GitHub
parent 9dc7825744
commit cb38ffcc5e
33 changed files with 2252 additions and 176 deletions

View File

@@ -25,7 +25,6 @@ import warnings
from collections import OrderedDict, UserDict
from contextlib import contextmanager
from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
import numpy as np
@@ -33,6 +32,14 @@ import numpy as np
import requests
from .file_utils import (
ExplicitEnum,
PaddingStrategy,
TensorType,
_is_jax,
_is_numpy,
_is_tensorflow,
_is_torch,
_is_torch_device,
add_end_docstrings,
cached_path,
hf_bucket_url,
@@ -41,6 +48,7 @@ from .file_utils import (
is_tf_available,
is_tokenizers_available,
is_torch_available,
to_py_obj,
torch_required,
)
from .utils import logging
@@ -55,34 +63,6 @@ if TYPE_CHECKING:
import jax.numpy as jnp # noqa: F401
def _is_numpy(x):
return isinstance(x, np.ndarray)
def _is_torch(x):
import torch
return isinstance(x, torch.Tensor)
def _is_torch_device(x):
import torch
return isinstance(x, torch.device)
def _is_tensorflow(x):
import tensorflow as tf
return isinstance(x, tf.Tensor)
def _is_jax(x):
import jax.numpy as jnp # noqa: F811
return isinstance(x, jnp.ndarray)
if is_tokenizers_available():
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast
@@ -134,19 +114,6 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
FULL_TOKENIZER_FILE = "tokenizer.json"
class ExplicitEnum(Enum):
"""
Enum with more explicit error message for missing values.
"""
@classmethod
def _missing_(cls, value):
raise ValueError(
"%r is not a valid %s, please select one of %s"
% (value, cls.__name__, str(list(cls._value2member_map_.keys())))
)
class TruncationStrategy(ExplicitEnum):
"""
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
@@ -159,29 +126,6 @@ class TruncationStrategy(ExplicitEnum):
DO_NOT_TRUNCATE = "do_not_truncate"
class PaddingStrategy(ExplicitEnum):
"""
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
in an IDE.
"""
LONGEST = "longest"
MAX_LENGTH = "max_length"
DO_NOT_PAD = "do_not_pad"
class TensorType(ExplicitEnum):
"""
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
tab-completion in an IDE.
"""
PYTORCH = "pt"
TENSORFLOW = "tf"
NUMPY = "np"
JAX = "jax"
class CharSpan(NamedTuple):
"""
Character span in the original string.
@@ -208,24 +152,6 @@ class TokenSpan(NamedTuple):
end: int
def to_py_obj(obj):
"""
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
"""
if isinstance(obj, (dict, BatchEncoding)):
return {k: to_py_obj(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple)):
return [to_py_obj(o) for o in obj]
elif is_tf_available() and _is_tensorflow(obj):
return obj.numpy().tolist()
elif is_torch_available() and _is_torch(obj):
return obj.detach().cpu().tolist()
elif isinstance(obj, np.ndarray):
return obj.tolist()
else:
return obj
class BatchEncoding(UserDict):
"""
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
@@ -715,9 +641,9 @@ class BatchEncoding(UserDict):
Convert the inner content to tensors.
Args:
tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
The type of tensors to use. If :obj:`str`, should be one of the values of the enum
:class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
:class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
Whether or not to add the batch dimension during the conversion.
"""
@@ -810,9 +736,7 @@ class BatchEncoding(UserDict):
if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
self.data = {k: v.to(device=device) for k, v in self.data.items()}
else:
logger.warning(
f"Attempting to cast a BatchEncoding to another type, {str(device)}. This is not supported."
)
logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
return self
@@ -1321,7 +1245,7 @@ class SpecialTokensMixin:
ENCODE_KWARGS_DOCSTRING = r"""
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to encode the sequences with the special tokens relative to their model.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
@@ -1362,7 +1286,7 @@ ENCODE_KWARGS_DOCSTRING = r"""
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
@@ -2608,7 +2532,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
see the note above for the return type.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
@@ -2630,7 +2554,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
`What are attention masks? <../glossary.html#attention-mask>`__
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
@@ -3260,7 +3184,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
max_target_length (:obj:`int`, `optional`):
Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
to :obj:`None`, this will use the max_length value.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
@@ -3269,7 +3193,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.