[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324)

* push to show

* small improvement

* small improvement

* Update src/transformers/feature_extraction_utils.py

* Update src/transformers/feature_extraction_utils.py

* implement base

* add common tests

* make all tests pass for wav2vec2

* make padding work & add more tests

* finalize feature extractor utils

* add call method to feature extraction

* finalize feature processor

* finish tokenizer

* finish general processor design

* finish tests

* typo

* remove bogus file

* finish docstring

* add docs

* finish docs

* small fix

* correct docs

* save intermediate

* load changes

* apply changes

* apply changes to doc

* change tests

* apply surajs recommend

* final changes

* Apply suggestions from code review

* fix typo

* fix import

* correct docstring
This commit is contained in:
Patrick von Platen
2021-02-25 17:42:46 +03:00
committed by GitHub
parent 9dc7825744
commit cb38ffcc5e
33 changed files with 2252 additions and 176 deletions

View File

@@ -52,7 +52,7 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer
from ..tapas.tokenization_tapas import TapasTokenizer
from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2Tokenizer
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
from ..xlm.tokenization_xlm import XLMTokenizer
from .configuration_auto import (
AlbertConfig,
@@ -244,7 +244,7 @@ TOKENIZER_MAPPING = OrderedDict(
(TapasConfig, (TapasTokenizer, None)),
(LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
(ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)),
(Wav2Vec2Config, (Wav2Vec2Tokenizer, None)),
(Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)),
]
)

View File

@@ -18,8 +18,8 @@
import collections
from typing import List, Optional, Union
from ...file_utils import add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding, TensorType
from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding
from ...utils import logging
from ..bert.tokenization_bert import BertTokenizer
@@ -147,7 +147,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
texts (:obj:`str` or :obj:`List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
@@ -177,7 +177,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.

View File

@@ -18,8 +18,8 @@
import collections
from typing import List, Optional, Union
from ...file_utils import add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding, TensorType
from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
from ...tokenization_utils_base import BatchEncoding
from ...utils import logging
from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
@@ -148,7 +148,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
texts (:obj:`str` or :obj:`List[str]`):
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
@@ -178,7 +178,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
length is required by one of the truncation/padding parameters. If the model has no specific maximum
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.

View File

@@ -573,7 +573,7 @@ class RagRetriever:
The prefix used by the generator's tokenizer.
n_docs (:obj:`int`, `optional`):
The number of docs retrieved per query.
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to "pt"):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.

View File

@@ -28,16 +28,13 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union
import numpy as np
from ...file_utils import add_end_docstrings, is_pandas_available
from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...tokenization_utils_base import (
ENCODE_KWARGS_DOCSTRING,
BatchEncoding,
EncodedInput,
ExplicitEnum,
PaddingStrategy,
PreTokenizedInput,
TensorType,
TextInput,
)
from ...utils import logging
@@ -151,7 +148,7 @@ def whitespace_tokenize(text):
TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not to encode the sequences with the special tokens relative to their model.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
@@ -180,7 +177,7 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.

View File

@@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_ava
_import_structure = {
"configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"],
"tokenization_wav2vec2": ["Wav2Vec2Tokenizer"],
"tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"],
"feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"],
"processing_wav2vec2": ["Wav2Vec2Processor"],
}
if is_torch_available():
@@ -37,7 +39,9 @@ if is_torch_available():
if TYPE_CHECKING:
from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config
from .tokenization_wav2vec2 import Wav2Vec2Tokenizer
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .processing_wav2vec2 import Wav2Vec2Processor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer
if is_torch_available():
from .modeling_wav2vec2 import (

View File

@@ -0,0 +1,192 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Feature extractor class for Wav2Vec2
"""
from typing import List, Optional, Union
import numpy as np
from ...feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor
from ...file_utils import PaddingStrategy, TensorType
from ...utils import logging
logger = logging.get_logger(__name__)
class Wav2Vec2FeatureExtractor(PreTrainedFeatureExtractor):
r"""
Constructs a Wav2Vec2 feature extractor.
This feature extractor inherits from :class:`~transformers.Wav2Vec2FeatureExtractor` which contains most of the
main methods. Users should refer to this superclass for more information regarding those methods.
Args:
feature_size (:obj:`int`, defaults to 1):
The feature dimension of the extracted features.
sampling_rate (:obj:`int`, defaults to 16000):
The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
padding_value (:obj:`float`, defaults to 0.0):
The value that is used to fill the padding values.
do_normalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
improve the performance for some models, *e.g.*, `wav2vec2-lv60
<https://huggingface.co/models?search=lv60>`__.
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not :meth:`~transformers.Wav2Vec2Tokenizer.__call__` should return :obj:`attention_mask`.
.. note::
Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
:obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
:obj:`attention_mask` should be passed.
For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
<https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
passed for batched inference.
"""
model_input_names = ["input_values", "attention_mask"]
def __init__(
self,
feature_size=1,
sampling_rate=16000,
padding_value=0.0,
return_attention_mask=False,
do_normalize=True,
**kwargs
):
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
self.return_attention_mask = return_attention_mask
self.do_normalize = do_normalize
@staticmethod
def zero_mean_unit_var_norm(input_values: List[np.ndarray]) -> List[np.ndarray]:
"""
Every array in the list is normalized to have zero mean and unit variance
"""
return [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in input_values]
def __call__(
self,
raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
padding: Union[bool, str, PaddingStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
sampling_rate: Optional[int] = None,
**kwargs
) -> BatchFeature:
"""
Main method to featurize and prepare for the model one or several sequence(s). sequences.
Args:
raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
values, a list of numpy arrays or a list of list of float values.
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
single sequence if provided).
* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
maximum acceptable input length for the model if that argument is not provided.
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
different lengths).
max_length (:obj:`int`, `optional`):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
return_attention_mask (:obj:`bool`, `optional`):
Whether to return the attention mask. If left to the default, will return the attention mask according
to the specific feature_extractor's default.
`What are attention masks? <../glossary.html#attention-mask>`__
.. note::
Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
:obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
:obj:`attention_mask` should be passed.
For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
<https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
passed for batched inference.
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
sampling_rate (:obj:`int`, `optional`):
The sampling rate at which the ``raw_speech`` input was sampled. It is strongly recommended to pass
``sampling_rate`` at the forward call to prevent silent errors.
padding_value (:obj:`float`, defaults to 0.0):
"""
if sampling_rate is not None:
if sampling_rate != self.sampling_rate:
raise ValueError(
f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of {self.sampling_rate}."
f"Please make sure that the provided `raw_speech` input was sampled with {self.sampling_rate} and not {sampling_rate}."
)
else:
logger.warning(
"It is strongly recommended to pass the ``sampling_rate`` argument to this function."
"Failing to do so can result in silent errors that might be hard to debug."
)
is_batched = bool(
isinstance(raw_speech, (list, tuple))
and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
)
# make sure input is in list format
if is_batched and not isinstance(raw_speech[0], np.ndarray):
raw_speech = [np.asarray(speech) for speech in raw_speech]
elif not is_batched and not isinstance(raw_speech, np.ndarray):
raw_speech = np.asarray(raw_speech)
# always return batch
if not is_batched:
raw_speech = [raw_speech]
# zero-mean and unit-variance normalization
if self.do_normalize:
raw_speech = self.zero_mean_unit_var_norm(raw_speech)
# convert into correct format for padding
encoded_inputs = BatchFeature({"input_values": raw_speech})
padded_inputs = self.pad(
encoded_inputs,
padding=padding,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=self.return_attention_mask,
return_tensors=return_tensors,
)
return padded_inputs

View File

@@ -616,9 +616,9 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Tokenizer` should
soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
:meth:`transformers.Wav2Vec2Tokenizer.__call__` for details.
:meth:`transformers.Wav2Vec2Processor.__call__` for details.
attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
1]``:
@@ -629,8 +629,8 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
`What are attention masks? <../glossary.html#attention-mask>`__
.. warning::
:obj:`attention_mask` should only be passed if the corresponding tokenizer has
``config.return_attention_mask == True``. For all models whose tokenizer has
:obj:`attention_mask` should only be passed if the corresponding processor has
``config.return_attention_mask == True``. For all models whose processor has
``config.return_attention_mask == False``, such as `wav2vec2-base
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, :obj:`attention_mask` should **not** be passed
to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should
@@ -682,11 +682,11 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
Example::
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model
>>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
@@ -697,7 +697,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> hidden_states = model(input_values).last_hidden_state
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -780,11 +780,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
Example::
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model
>>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
@@ -795,11 +795,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> logits = model(input_values).logits
>>> predicted_ids = torch.argmax(logits, dim=-1)
>>> transcription = tokenizer.decode(predicted_ids[0])
>>> transcription = processor.decode(predicted_ids[0])
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -856,11 +856,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
Example::
>>> import torch
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
>>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import soundfile as sf
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
>>> def map_to_array(batch):
@@ -871,11 +871,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
>>> ds = ds.map(map_to_array)
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
>>> logits = model(input_values).logits
>>> predicted_ids = torch.argmax(logits, dim=-1)
>>> transcription = tokenizer.decode(predicted_ids[0])
>>> transcription = processor.decode(predicted_ids[0])
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@@ -0,0 +1,142 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Speech processor class for Wav2Vec2
"""
from contextlib import contextmanager
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
class Wav2Vec2Processor:
r"""
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
processor.
:class:`~transformers.Wav2Vec2Processor` offers all the functionalities of
:class:`~transformers.Wav2Vec2FeatureExtractor` and :class:`~transformers.Wav2Vec2CTCTokenizer`. See the docstring
of :meth:`~transformers.Wav2Vec2Processor.__call__` and :meth:`~transformers.Wav2Vec2Processor.decode` for more
information.
Args:
feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
tokenizer (:obj:`Wav2Vec2CTCTokenizer`):
An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
"""
def __init__(self, feature_extractor, tokenizer):
if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
raise ValueError(
f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
)
if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
raise ValueError(
f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer.__class__}, but is {type(tokenizer)}"
)
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
self.current_processor = self.feature_extractor
def save_pretrained(self, save_directory):
"""
Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory ``save_directory``, so
that it can be re-loaded using the :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.
.. note::
This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
docstrings of the methods above for more information.
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`):
Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
be created if it does not exist).
"""
self.feature_extractor.save_pretrained(save_directory)
self.tokenizer.save_pretrained(save_directory)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r"""
Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.
.. note::
This class method is simply calling Wav2Vec2FeatureExtractor's
:meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Wav2Vec2CTCTokenizer's
:meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
docstrings of the methods above for more information.
Args:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
This can be either:
- a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing a feature extractor file saved using the
:meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
``./my_model_directory/``.
- a path or url to a saved feature extractor JSON `file`, e.g.,
``./my_model_directory/feature_extraction_config.json``.
**kwargs
Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
:class:`~transformers.PreTrainedTokenizer`
"""
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
def __call__(self, *args, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
:meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
:meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the doctsring of
the above two methods for more information.
"""
return self.current_processor(*args, **kwargs)
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Wav2Vec2CTCTokenizer's
:meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Wav2Vec2CTCTokenizer's
:meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
information.
"""
return self.tokenizer.decode(*args, **kwargs)
@contextmanager
def as_target_processor(self):
"""
Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
Wav2Vec2.
"""
self.current_processor = self.tokenizer
yield
self.current_processor = self.feature_extractor

View File

@@ -16,14 +16,16 @@
import json
import os
import sys
import warnings
from itertools import groupby
from typing import Dict, List, Optional, Tuple, Union
import numpy as np
from ...file_utils import add_end_docstrings
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType
from ...tokenization_utils_base import BatchEncoding
from ...utils import logging
@@ -37,7 +39,7 @@ VOCAB_FILES_NAMES = {
WAV2VEC2_KWARGS_DOCSTRING = r"""
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
Activates and controls padding. Accepts the following values:
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
@@ -55,7 +57,7 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
pad_to_multiple_of (:obj:`int`, `optional`):
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
If set, will return tensors instead of list of python integers. Acceptable values are:
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
@@ -66,6 +68,207 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
"""
class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2CTC tokenizer.
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
Users should refer to the superclass for more information regarding such methods.
Args:
vocab_file (:obj:`str`):
File containing the vocabulary.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
The beginning of sentence token.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
The end of sentence token.
unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
The token used for defining the end of a word.
do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not to accept lowercase input and lowercase the output when decoding.
**kwargs
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
},
"tokenizer_config_file": {
"facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
},
}
# Wav2Vec2 has no max input length
max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
word_delimiter_token="|",
do_lower_case=False,
**kwargs
):
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
do_lower_case=do_lower_case,
word_delimiter_token=word_delimiter_token,
**kwargs,
)
self._word_delimiter_token = word_delimiter_token
self.do_lower_case = do_lower_case
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.encoder.items()}
@property
def word_delimiter_token(self) -> str:
"""
:obj:`str`: Padding token. Log an error if used while not having been set.
"""
if self._word_delimiter_token is None and self.verbose:
logger.error("Using word_delimiter_token, but it is not set yet.")
return None
return str(self._word_delimiter_token)
@property
def word_delimiter_token_id(self) -> Optional[int]:
"""
:obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
not been set.
"""
if self._word_delimiter_token is None:
return None
return self.convert_tokens_to_ids(self.word_delimiter_token)
@word_delimiter_token.setter
def word_delimiter_token(self, value):
self._word_delimiter_token = value
@word_delimiter_token_id.setter
def word_delimiter_token_id(self, value):
self._word_delimiter_token = self.convert_tokens_to_ids(value)
@property
def vocab_size(self) -> int:
return len(self.decoder)
def get_vocab(self) -> Dict:
return dict(self.encoder, **self.added_tokens_encoder)
def _tokenize(self, text, **kwargs):
"""
Converts a string in a sequence of tokens (string), using the tokenizer.
"""
if self.do_lower_case:
text = text.upper()
return list(text.replace(" ", self.word_delimiter_token))
def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an index (integer) using the vocab."""
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index: int) -> str:
"""Converts an index (integer) in a token (str) using the vocab."""
result = self.decoder.get(index, self.unk_token)
return result
def convert_tokens_to_string(
self, tokens: List[str], group_tokens: bool = True, spaces_between_special_tokens: bool = False
) -> str:
"""
Converts a connectionist-temporal-classification (CTC) output tokens into a single string.
"""
# group same tokens into non-repeating tokens in CTC style decoding
if group_tokens:
tokens = [token_group[0] for token_group in groupby(tokens)]
# filter self.pad_token which is used as CTC-blank token
filtered_tokens = list(filter(lambda token: token != self.pad_token, tokens))
if spaces_between_special_tokens:
join_token = " "
else:
join_token = ""
# replace delimiter token
string = join_token.join(
[" " if token == self.word_delimiter_token else token for token in filtered_tokens]
).strip()
if self.do_lower_case:
string = string.lower()
return string
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
if is_split_into_words:
text = " " + text
return (text, kwargs)
def _decode(
self,
token_ids: List[int],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
group_tokens: bool = True,
spaces_between_special_tokens: bool = False,
) -> str:
"""
special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the
same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
the whole token list and not individually on added tokens
"""
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
result = []
for token in filtered_tokens:
if skip_special_tokens and token in self.all_special_ids:
continue
result.append(token)
text = self.convert_tokens_to_string(
result, group_tokens=group_tokens, spaces_between_special_tokens=spaces_between_special_tokens
)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
return clean_text
else:
return text
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
return (vocab_file,)
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
"""
Constructs a Wav2Vec2 tokenizer.
@@ -146,6 +349,12 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
word_delimiter_token=word_delimiter_token,
**kwargs,
)
warnings.warn(
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
FutureWarning,
)
self._word_delimiter_token = word_delimiter_token
self.do_lower_case = do_lower_case