[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324)
* push to show * small improvement * small improvement * Update src/transformers/feature_extraction_utils.py * Update src/transformers/feature_extraction_utils.py * implement base * add common tests * make all tests pass for wav2vec2 * make padding work & add more tests * finalize feature extractor utils * add call method to feature extraction * finalize feature processor * finish tokenizer * finish general processor design * finish tests * typo * remove bogus file * finish docstring * add docs * finish docs * small fix * correct docs * save intermediate * load changes * apply changes * apply changes to doc * change tests * apply surajs recommend * final changes * Apply suggestions from code review * fix typo * fix import * correct docstring
This commit is contained in:
committed by
GitHub
parent
9dc7825744
commit
cb38ffcc5e
@@ -25,7 +25,6 @@ import warnings
|
||||
from collections import OrderedDict, UserDict
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -33,6 +32,14 @@ import numpy as np
|
||||
import requests
|
||||
|
||||
from .file_utils import (
|
||||
ExplicitEnum,
|
||||
PaddingStrategy,
|
||||
TensorType,
|
||||
_is_jax,
|
||||
_is_numpy,
|
||||
_is_tensorflow,
|
||||
_is_torch,
|
||||
_is_torch_device,
|
||||
add_end_docstrings,
|
||||
cached_path,
|
||||
hf_bucket_url,
|
||||
@@ -41,6 +48,7 @@ from .file_utils import (
|
||||
is_tf_available,
|
||||
is_tokenizers_available,
|
||||
is_torch_available,
|
||||
to_py_obj,
|
||||
torch_required,
|
||||
)
|
||||
from .utils import logging
|
||||
@@ -55,34 +63,6 @@ if TYPE_CHECKING:
|
||||
import jax.numpy as jnp # noqa: F401
|
||||
|
||||
|
||||
def _is_numpy(x):
|
||||
return isinstance(x, np.ndarray)
|
||||
|
||||
|
||||
def _is_torch(x):
|
||||
import torch
|
||||
|
||||
return isinstance(x, torch.Tensor)
|
||||
|
||||
|
||||
def _is_torch_device(x):
|
||||
import torch
|
||||
|
||||
return isinstance(x, torch.device)
|
||||
|
||||
|
||||
def _is_tensorflow(x):
|
||||
import tensorflow as tf
|
||||
|
||||
return isinstance(x, tf.Tensor)
|
||||
|
||||
|
||||
def _is_jax(x):
|
||||
import jax.numpy as jnp # noqa: F811
|
||||
|
||||
return isinstance(x, jnp.ndarray)
|
||||
|
||||
|
||||
if is_tokenizers_available():
|
||||
from tokenizers import AddedToken
|
||||
from tokenizers import Encoding as EncodingFast
|
||||
@@ -134,19 +114,6 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
|
||||
FULL_TOKENIZER_FILE = "tokenizer.json"
|
||||
|
||||
|
||||
class ExplicitEnum(Enum):
|
||||
"""
|
||||
Enum with more explicit error message for missing values.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def _missing_(cls, value):
|
||||
raise ValueError(
|
||||
"%r is not a valid %s, please select one of %s"
|
||||
% (value, cls.__name__, str(list(cls._value2member_map_.keys())))
|
||||
)
|
||||
|
||||
|
||||
class TruncationStrategy(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
||||
@@ -159,29 +126,6 @@ class TruncationStrategy(ExplicitEnum):
|
||||
DO_NOT_TRUNCATE = "do_not_truncate"
|
||||
|
||||
|
||||
class PaddingStrategy(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
|
||||
in an IDE.
|
||||
"""
|
||||
|
||||
LONGEST = "longest"
|
||||
MAX_LENGTH = "max_length"
|
||||
DO_NOT_PAD = "do_not_pad"
|
||||
|
||||
|
||||
class TensorType(ExplicitEnum):
|
||||
"""
|
||||
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
||||
tab-completion in an IDE.
|
||||
"""
|
||||
|
||||
PYTORCH = "pt"
|
||||
TENSORFLOW = "tf"
|
||||
NUMPY = "np"
|
||||
JAX = "jax"
|
||||
|
||||
|
||||
class CharSpan(NamedTuple):
|
||||
"""
|
||||
Character span in the original string.
|
||||
@@ -208,24 +152,6 @@ class TokenSpan(NamedTuple):
|
||||
end: int
|
||||
|
||||
|
||||
def to_py_obj(obj):
|
||||
"""
|
||||
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
|
||||
"""
|
||||
if isinstance(obj, (dict, BatchEncoding)):
|
||||
return {k: to_py_obj(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, (list, tuple)):
|
||||
return [to_py_obj(o) for o in obj]
|
||||
elif is_tf_available() and _is_tensorflow(obj):
|
||||
return obj.numpy().tolist()
|
||||
elif is_torch_available() and _is_torch(obj):
|
||||
return obj.detach().cpu().tolist()
|
||||
elif isinstance(obj, np.ndarray):
|
||||
return obj.tolist()
|
||||
else:
|
||||
return obj
|
||||
|
||||
|
||||
class BatchEncoding(UserDict):
|
||||
"""
|
||||
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
|
||||
@@ -715,9 +641,9 @@ class BatchEncoding(UserDict):
|
||||
Convert the inner content to tensors.
|
||||
|
||||
Args:
|
||||
tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
||||
tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||
The type of tensors to use. If :obj:`str`, should be one of the values of the enum
|
||||
:class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
|
||||
:class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
|
||||
prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not to add the batch dimension during the conversion.
|
||||
"""
|
||||
@@ -810,9 +736,7 @@ class BatchEncoding(UserDict):
|
||||
if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
|
||||
self.data = {k: v.to(device=device) for k, v in self.data.items()}
|
||||
else:
|
||||
logger.warning(
|
||||
f"Attempting to cast a BatchEncoding to another type, {str(device)}. This is not supported."
|
||||
)
|
||||
logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
|
||||
return self
|
||||
|
||||
|
||||
@@ -1321,7 +1245,7 @@ class SpecialTokensMixin:
|
||||
ENCODE_KWARGS_DOCSTRING = r"""
|
||||
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether or not to encode the sequences with the special tokens relative to their model.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||
Activates and controls padding. Accepts the following values:
|
||||
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||
@@ -1362,7 +1286,7 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.0 (Volta).
|
||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
||||
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
|
||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||
@@ -2608,7 +2532,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
|
||||
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
||||
see the note above for the return type.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
||||
index) among:
|
||||
|
||||
@@ -2630,7 +2554,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
||||
|
||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
||||
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
|
||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||
@@ -3260,7 +3184,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
max_target_length (:obj:`int`, `optional`):
|
||||
Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or set
|
||||
to :obj:`None`, this will use the max_length value.
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||
Activates and controls padding. Accepts the following values:
|
||||
|
||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||
@@ -3269,7 +3193,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
maximum acceptable input length for the model if that argument is not provided.
|
||||
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
||||
different lengths).
|
||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
||||
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||
|
||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||
|
||||
Reference in New Issue
Block a user