[PretrainedFeatureExtractor] + Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Tokenizer (#10324)
* push to show * small improvement * small improvement * Update src/transformers/feature_extraction_utils.py * Update src/transformers/feature_extraction_utils.py * implement base * add common tests * make all tests pass for wav2vec2 * make padding work & add more tests * finalize feature extractor utils * add call method to feature extraction * finalize feature processor * finish tokenizer * finish general processor design * finish tests * typo * remove bogus file * finish docstring * add docs * finish docs * small fix * correct docs * save intermediate * load changes * apply changes * apply changes to doc * change tests * apply surajs recommend * final changes * Apply suggestions from code review * fix typo * fix import * correct docstring
This commit is contained in:
committed by
GitHub
parent
9dc7825744
commit
cb38ffcc5e
@@ -375,6 +375,7 @@ TensorFlow and/or Flax.
|
|||||||
main_classes/processors
|
main_classes/processors
|
||||||
main_classes/tokenizer
|
main_classes/tokenizer
|
||||||
main_classes/trainer
|
main_classes/trainer
|
||||||
|
main_classes/feature_extractor
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
:maxdepth: 2
|
:maxdepth: 2
|
||||||
@@ -441,3 +442,4 @@ TensorFlow and/or Flax.
|
|||||||
internal/tokenization_utils
|
internal/tokenization_utils
|
||||||
internal/trainer_utils
|
internal/trainer_utils
|
||||||
internal/generation_utils
|
internal/generation_utils
|
||||||
|
internal/file_utils
|
||||||
|
|||||||
54
docs/source/internal/file_utils.rst
Normal file
54
docs/source/internal/file_utils.rst
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
..
|
||||||
|
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||||
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
|
General Utilities
|
||||||
|
-----------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
This page lists all of the general utility functions of Transformers that are found in the file ``file_utils.py``.
|
||||||
|
|
||||||
|
Most of those are only useful if you are studying the general code in the library.
|
||||||
|
|
||||||
|
|
||||||
|
Enums and namedtuples
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.file_utils.ExplicitEnum
|
||||||
|
|
||||||
|
.. autoclass:: transformers.file_utils.PaddingStrategy
|
||||||
|
|
||||||
|
.. autoclass:: transformers.file_utils.TensorType
|
||||||
|
|
||||||
|
|
||||||
|
Special Decorators
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autofunction:: transformers.file_utils.add_start_docstrings
|
||||||
|
|
||||||
|
.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
|
||||||
|
|
||||||
|
.. autofunction:: transformers.file_utils.add_end_docstrings
|
||||||
|
|
||||||
|
.. autofunction:: transformers.file_utils.add_code_sample_docstrings
|
||||||
|
|
||||||
|
.. autofunction:: transformers.file_utils.replace_return_docstrings
|
||||||
|
|
||||||
|
|
||||||
|
Special Properties
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.file_utils.cached_property
|
||||||
|
|
||||||
|
|
||||||
|
Other Utilities
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.file_utils._BaseLazyModule
|
||||||
@@ -38,12 +38,6 @@ SpecialTokensMixin
|
|||||||
Enums and namedtuples
|
Enums and namedtuples
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
|
|
||||||
|
|
||||||
.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
|
|
||||||
|
|
||||||
.. autoclass:: transformers.tokenization_utils_base.TensorType
|
|
||||||
|
|
||||||
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
|
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
|
||||||
|
|
||||||
.. autoclass:: transformers.tokenization_utils_base.CharSpan
|
.. autoclass:: transformers.tokenization_utils_base.CharSpan
|
||||||
|
|||||||
33
docs/source/main_classes/feature_extractor.rst
Normal file
33
docs/source/main_classes/feature_extractor.rst
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
..
|
||||||
|
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||||
|
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||||
|
specific language governing permissions and limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
Feature Extractor
|
||||||
|
-----------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
A feature extractor is in charge of preparing read-in audio files for a speech model. This includes feature extraction,
|
||||||
|
such as processing audio files to, *e.g.*, Log-Mel Spectrogram features, but also padding, normalization, and
|
||||||
|
conversion to Numpy, PyTorch, and TensorFlow tensors.
|
||||||
|
|
||||||
|
|
||||||
|
PreTrainedFeatureExtractor
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.PreTrainedFeatureExtractor
|
||||||
|
:members: from_pretrained, save_pretrained, pad
|
||||||
|
|
||||||
|
|
||||||
|
BatchFeature
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.BatchFeature
|
||||||
|
:members:
|
||||||
@@ -34,7 +34,7 @@ Tips:
|
|||||||
|
|
||||||
- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
|
- Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
|
||||||
- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
|
- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
|
||||||
using :class:`~transformers.Wav2Vec2Tokenizer`.
|
using :class:`~transformers.Wav2Vec2CTCTokenizer`.
|
||||||
|
|
||||||
|
|
||||||
Wav2Vec2Config
|
Wav2Vec2Config
|
||||||
@@ -44,13 +44,27 @@ Wav2Vec2Config
|
|||||||
:members:
|
:members:
|
||||||
|
|
||||||
|
|
||||||
Wav2Vec2Tokenizer
|
Wav2Vec2CTCTokenizer
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
.. autoclass:: transformers.Wav2Vec2Tokenizer
|
.. autoclass:: transformers.Wav2Vec2CTCTokenizer
|
||||||
:members: __call__, save_vocabulary
|
:members: __call__, save_vocabulary
|
||||||
|
|
||||||
|
|
||||||
|
Wav2Vec2FeatureExtractor
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.Wav2Vec2FeatureExtractor
|
||||||
|
:members: __call__
|
||||||
|
|
||||||
|
|
||||||
|
Wav2Vec2Processor
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
.. autoclass:: transformers.Wav2Vec2Processor
|
||||||
|
:members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
|
||||||
|
|
||||||
|
|
||||||
Wav2Vec2Model
|
Wav2Vec2Model
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,8 @@ from transformers import (
|
|||||||
default_data_collator,
|
default_data_collator,
|
||||||
set_seed,
|
set_seed,
|
||||||
)
|
)
|
||||||
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
|
from transformers.file_utils import PaddingStrategy
|
||||||
|
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
|
||||||
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
from transformers.trainer_utils import get_last_checkpoint, is_main_process
|
||||||
|
|
||||||
|
|
||||||
@@ -133,7 +134,7 @@ class DataCollatorForMultipleChoice:
|
|||||||
Args:
|
Args:
|
||||||
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||||
The tokenizer used for encoding the data.
|
The tokenizer used for encoding the data.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||||
among:
|
among:
|
||||||
|
|
||||||
|
|||||||
@@ -88,6 +88,7 @@ _import_structure = {
|
|||||||
"TF_WEIGHTS_NAME",
|
"TF_WEIGHTS_NAME",
|
||||||
"TRANSFORMERS_CACHE",
|
"TRANSFORMERS_CACHE",
|
||||||
"WEIGHTS_NAME",
|
"WEIGHTS_NAME",
|
||||||
|
"TensorType",
|
||||||
"add_end_docstrings",
|
"add_end_docstrings",
|
||||||
"add_start_docstrings",
|
"add_start_docstrings",
|
||||||
"cached_path",
|
"cached_path",
|
||||||
@@ -125,7 +126,14 @@ _import_structure = {
|
|||||||
],
|
],
|
||||||
"models": [],
|
"models": [],
|
||||||
# Models
|
# Models
|
||||||
"models.wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config", "Wav2Vec2Tokenizer"],
|
"models.wav2vec2": [
|
||||||
|
"WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||||
|
"Wav2Vec2Config",
|
||||||
|
"Wav2Vec2CTCTokenizer",
|
||||||
|
"Wav2Vec2Tokenizer",
|
||||||
|
"Wav2Vec2FeatureExtractor",
|
||||||
|
"Wav2Vec2Processor",
|
||||||
|
],
|
||||||
"models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
|
"models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
|
||||||
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
|
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
|
||||||
"models.auto": [
|
"models.auto": [
|
||||||
@@ -234,9 +242,9 @@ _import_structure = {
|
|||||||
"CharSpan",
|
"CharSpan",
|
||||||
"PreTrainedTokenizerBase",
|
"PreTrainedTokenizerBase",
|
||||||
"SpecialTokensMixin",
|
"SpecialTokensMixin",
|
||||||
"TensorType",
|
|
||||||
"TokenSpan",
|
"TokenSpan",
|
||||||
],
|
],
|
||||||
|
"feature_extraction_utils": ["PreTrainedFeatureExtractor", "BatchFeature"],
|
||||||
"trainer_callback": [
|
"trainer_callback": [
|
||||||
"DefaultFlowCallback",
|
"DefaultFlowCallback",
|
||||||
"EarlyStoppingCallback",
|
"EarlyStoppingCallback",
|
||||||
@@ -1217,6 +1225,9 @@ if TYPE_CHECKING:
|
|||||||
xnli_tasks_num_labels,
|
xnli_tasks_num_labels,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Feature Extractor
|
||||||
|
from .feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor
|
||||||
|
|
||||||
# Files and general utilities
|
# Files and general utilities
|
||||||
from .file_utils import (
|
from .file_utils import (
|
||||||
CONFIG_NAME,
|
CONFIG_NAME,
|
||||||
@@ -1228,6 +1239,7 @@ if TYPE_CHECKING:
|
|||||||
TF_WEIGHTS_NAME,
|
TF_WEIGHTS_NAME,
|
||||||
TRANSFORMERS_CACHE,
|
TRANSFORMERS_CACHE,
|
||||||
WEIGHTS_NAME,
|
WEIGHTS_NAME,
|
||||||
|
TensorType,
|
||||||
add_end_docstrings,
|
add_end_docstrings,
|
||||||
add_start_docstrings,
|
add_start_docstrings,
|
||||||
cached_path,
|
cached_path,
|
||||||
@@ -1343,7 +1355,14 @@ if TYPE_CHECKING:
|
|||||||
TransfoXLCorpus,
|
TransfoXLCorpus,
|
||||||
TransfoXLTokenizer,
|
TransfoXLTokenizer,
|
||||||
)
|
)
|
||||||
from .models.wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config, Wav2Vec2Tokenizer
|
from .models.wav2vec2 import (
|
||||||
|
WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||||
|
Wav2Vec2Config,
|
||||||
|
Wav2Vec2CTCTokenizer,
|
||||||
|
Wav2Vec2FeatureExtractor,
|
||||||
|
Wav2Vec2Processor,
|
||||||
|
Wav2Vec2Tokenizer,
|
||||||
|
)
|
||||||
from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer
|
from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer
|
||||||
from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
|
from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
|
||||||
from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
|
from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
|
||||||
@@ -1381,7 +1400,6 @@ if TYPE_CHECKING:
|
|||||||
CharSpan,
|
CharSpan,
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
SpecialTokensMixin,
|
SpecialTokensMixin,
|
||||||
TensorType,
|
|
||||||
TokenSpan,
|
TokenSpan,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -20,8 +20,9 @@ from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
|
|||||||
import torch
|
import torch
|
||||||
from torch.nn.utils.rnn import pad_sequence
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
|
|
||||||
|
from ..file_utils import PaddingStrategy
|
||||||
from ..modeling_utils import PreTrainedModel
|
from ..modeling_utils import PreTrainedModel
|
||||||
from ..tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase
|
from ..tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
|
||||||
InputDataClass = NewType("InputDataClass", Any)
|
InputDataClass = NewType("InputDataClass", Any)
|
||||||
@@ -89,7 +90,7 @@ class DataCollatorWithPadding:
|
|||||||
Args:
|
Args:
|
||||||
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||||
The tokenizer used for encoding the data.
|
The tokenizer used for encoding the data.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||||
among:
|
among:
|
||||||
|
|
||||||
@@ -138,7 +139,7 @@ class DataCollatorForTokenClassification:
|
|||||||
Args:
|
Args:
|
||||||
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
|
||||||
The tokenizer used for encoding the data.
|
The tokenizer used for encoding the data.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||||
among:
|
among:
|
||||||
|
|
||||||
@@ -238,7 +239,7 @@ class DataCollatorForSeq2Seq:
|
|||||||
prepare the `decoder_input_ids`
|
prepare the `decoder_input_ids`
|
||||||
|
|
||||||
This is useful when using `label_smoothing` to avoid calculating loss twice.
|
This is useful when using `label_smoothing` to avoid calculating loss twice.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
|
||||||
among:
|
among:
|
||||||
|
|
||||||
|
|||||||
737
src/transformers/feature_extraction_utils.py
Normal file
737
src/transformers/feature_extraction_utils.py
Normal file
@@ -0,0 +1,737 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Feature extraction common class for python feature extractors.
|
||||||
|
"""
|
||||||
|
import copy
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from collections import UserDict
|
||||||
|
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from .file_utils import (
|
||||||
|
FEATURE_EXTRACTOR_NAME,
|
||||||
|
PaddingStrategy,
|
||||||
|
TensorType,
|
||||||
|
_is_jax,
|
||||||
|
_is_numpy,
|
||||||
|
_is_tensorflow,
|
||||||
|
_is_torch,
|
||||||
|
_is_torch_device,
|
||||||
|
cached_path,
|
||||||
|
hf_bucket_url,
|
||||||
|
is_flax_available,
|
||||||
|
is_remote_url,
|
||||||
|
is_tf_available,
|
||||||
|
is_torch_available,
|
||||||
|
to_py_obj,
|
||||||
|
torch_required,
|
||||||
|
)
|
||||||
|
from .utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
if is_torch_available():
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class BatchFeature(UserDict):
    r"""
    Holds the output of the :meth:`~transformers.PreTrainedFeatureExtractor.pad` and feature extractor specific
    ``__call__`` methods.

    This class is derived from a python dictionary and can be used as a dictionary.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask',
            etc.).
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
            You can give a tensor_type here to convert the lists of values into PyTorch/TensorFlow/Numpy tensors at
            initialization.
    """

    def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
        super().__init__(data)
        self.convert_to_tensors(tensor_type=tensor_type)

    def __getitem__(self, item: str) -> Any:
        """
        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_values',
        'attention_mask', etc.).

        Raises:
            :obj:`KeyError`: If :obj:`item` is not a string, or is a string that is not a key of the batch.
        """
        if isinstance(item, str):
            return self.data[item]
        else:
            raise KeyError("Indexing with integers is not available when using Python based feature extractors")

    def __getattr__(self, item: str):
        """
        Expose the entries of the batch as attributes (``batch.input_values``).

        Only called when regular attribute lookup fails.

        Raises:
            :obj:`AttributeError`: If :obj:`item` is not a key of the batch.
        """
        # `data` itself is missing while the instance is being rebuilt (e.g. by copy/pickle machinery). Looking it
        # up through `self.data` here would re-enter `__getattr__` forever, so fail fast instead.
        if item == "data":
            raise AttributeError(item)
        try:
            return self.data[item]
        except KeyError:
            # Name the missing attribute; the KeyError adds nothing for the caller.
            raise AttributeError(item) from None

    def __getstate__(self):
        """Return the picklable state: only the underlying ``data`` dictionary."""
        return {"data": self.data}

    def __setstate__(self, state):
        """Restore the underlying ``data`` dictionary from a state produced by ``__getstate__``."""
        if "data" in state:
            self.data = state["data"]

    # Copied from transformers.tokenization_utils_base.BatchEncoding.keys
    def keys(self):
        return self.data.keys()

    # Copied from transformers.tokenization_utils_base.BatchEncoding.values
    def values(self):
        return self.data.values()

    # Copied from transformers.tokenization_utils_base.BatchEncoding.items
    def items(self):
        return self.data.items()

    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
        """
        Convert the inner content to tensors.

        Args:
            tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
                :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.

        Returns:
            The same instance, with every value that was not already a tensor of the requested framework converted
            in place.

        Raises:
            :obj:`ImportError`: If the requested framework is not installed.
            :obj:`ValueError`: If a value cannot be converted to a tensor.
        """
        if tensor_type is None:
            return self

        # Convert to TensorType
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Get a function reference for the correct framework
        if tensor_type == TensorType.TENSORFLOW:
            if not is_tf_available():
                raise ImportError(
                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
                )
            import tensorflow as tf

            as_tensor = tf.constant
            is_tensor = tf.is_tensor
        elif tensor_type == TensorType.PYTORCH:
            if not is_torch_available():
                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
            import torch

            as_tensor = torch.tensor
            is_tensor = torch.is_tensor
        elif tensor_type == TensorType.JAX:
            if not is_flax_available():
                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
            import jax.numpy as jnp  # noqa: F811

            as_tensor = jnp.array
            is_tensor = _is_jax
        else:
            as_tensor = np.asarray
            is_tensor = _is_numpy

        # Do the tensor conversion in batch
        for key, value in self.items():
            try:
                if not is_tensor(value):
                    # Re-assigning an existing key does not resize the dict, so this is safe while iterating.
                    self[key] = as_tensor(value)
            except Exception as err:
                # Catch `Exception` rather than everything, so KeyboardInterrupt/SystemExit still propagate, and
                # chain the original error so the real cause stays visible in the traceback.
                if key == "overflowing_values":
                    raise ValueError(
                        "Unable to create tensor returning overflowing values of different lengths. "
                    ) from err
                raise ValueError(
                    "Unable to create tensor, you should probably activate padding "
                    "with 'padding=True' to have batched tensors with the same length."
                ) from err

        return self

    @torch_required
    # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature
    def to(self, device: Union[str, "torch.device"]) -> "BatchFeature":
        """
        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).

        Args:
            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.

        Returns:
            :class:`~transformers.BatchFeature`: The same instance of :class:`~transformers.BatchFeature` after
            modification.
        """

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchFeature to type {str(device)}. This is not supported.")
        return self
|
||||||
|
|
||||||
|
|
||||||
|
class PreTrainedFeatureExtractor:
|
||||||
|
"""
|
||||||
|
This is a general feature extraction class for speech recognition.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
feature_size (:obj:`int`):
|
||||||
|
The feature dimension of the extracted features.
|
||||||
|
sampling_rate (:obj:`int`):
|
||||||
|
The sampling rate at which the audio files should be digitalized, expressed in Hertz (Hz).
|
||||||
|
padding_value (:obj:`float`):
|
||||||
|
The value that is used to fill the padding values / vectors.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs):
|
||||||
|
self.feature_size = feature_size
|
||||||
|
self.sampling_rate = sampling_rate
|
||||||
|
self.padding_value = padding_value
|
||||||
|
|
||||||
|
self.padding_side = kwargs.pop("padding_side", "right")
|
||||||
|
self.return_attention_mask = kwargs.pop("return_attention_mask", True)
|
||||||
|
|
||||||
|
# Additional attributes without default values
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
try:
|
||||||
|
setattr(self, key, value)
|
||||||
|
except AttributeError as err:
|
||||||
|
logger.error(f"Can't set {key} with value {value} for {self}")
|
||||||
|
raise err
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "PreTrainedFeatureExtractor":
    r"""
    Instantiate a :class:`~transformers.PreTrainedFeatureExtractor` (or a derived class) from a pretrained feature
    extractor.

    Args:
        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            This can be either:

            - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing a feature extractor file saved using the
              :func:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
              ``./my_model_directory/``.
            - a path or url to a saved feature extractor JSON `file`, e.g.,
              ``./my_model_directory/preprocessor_config.json``.
        cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
            Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
            standard cache should not be used.
        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to force to (re-)download the feature extractor files and override the cached versions
            if they exist.
        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
            exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
        use_auth_token (:obj:`str` or `bool`, `optional`):
            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
        revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`False`, then this function returns just the final feature extractor object.

            If :obj:`True`, then this function returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where
            `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not feature extractor
            attributes: i.e., the part of ``kwargs`` which has not been used to update ``feature_extractor`` and is
            otherwise ignored.
        kwargs (:obj:`Dict[str, Any]`, `optional`):
            The values in kwargs of any keys which are feature extractor attributes will be used to override the
            loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
            controlled by the ``return_unused_kwargs`` keyword parameter.

    .. note::

        Passing :obj:`use_auth_token=True` is required when you want to use a private model.


    Returns:
        :class:`~transformers.PreTrainedFeatureExtractor`: The feature extractor object instantiated from this
        pretrained model.

    Examples::

        # We can't instantiate directly the base class `PreTrainedFeatureExtractor` so let's show the examples on a
        # derived class: Wav2Vec2FeatureExtractor
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')    # Download the feature extractor configuration from huggingface.co and cache it.
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
        assert feature_extractor.return_attention_mask is False
        feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
                                                           foo=False, return_unused_kwargs=True)
        assert feature_extractor.return_attention_mask is False
        assert unused_kwargs == {'foo': False}

    """
    # Resolve the identifier to a parameter dict; the helper also returns the kwargs to forward to `from_dict`.
    feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)

    return cls.from_dict(feature_extractor_dict, **kwargs)
|
||||||
|
|
||||||
|
def save_pretrained(self, save_directory: Union[str, os.PathLike]):
    """
    Save a feature_extractor object to the directory ``save_directory``, so that it can be re-loaded using the
    :func:`~transformers.PreTrainedFeatureExtractor.from_pretrained` class method.

    Args:
        save_directory (:obj:`str` or :obj:`os.PathLike`):
            Directory where the feature extractor JSON file will be saved (will be created if it does not exist).

    Raises:
        AssertionError: If ``save_directory`` points to an existing regular file.
    """
    if os.path.isfile(save_directory):
        raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
    os.makedirs(save_directory, exist_ok=True)

    # Saving under the predefined file name is what lets `from_pretrained` find the file again.
    output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME)
    self.to_json_file(output_feature_extractor_file)

    # NOTE: the message says "Configuration" although the feature extractor file is what was written.
    logger.info(f"Configuration saved in {output_feature_extractor_file}")
|
||||||
|
|
||||||
|
@classmethod
def get_feature_extractor_dict(
    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
    :class:`~transformers.PreTrainedFeatureExtractor` using ``from_dict``.

    Parameters:
        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
        kwargs:
            ``cache_dir``, ``force_download``, ``resume_download``, ``proxies``, ``use_auth_token``,
            ``local_files_only`` and ``revision`` are consumed here; everything else is returned untouched.

    Returns:
        :obj:`Tuple[Dict, Dict]`: The parsed feature extractor dictionary and the remaining keyword arguments.

    Raises:
        EnvironmentError: If the file cannot be resolved/read, or if its content is not valid JSON. The original
        exception is attached as ``__cause__``.
    """
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    use_auth_token = kwargs.pop("use_auth_token", None)
    local_files_only = kwargs.pop("local_files_only", False)
    revision = kwargs.pop("revision", None)

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    if os.path.isdir(pretrained_model_name_or_path):
        # Local directory: look for the predefined file name inside it.
        feature_extractor_file = os.path.join(pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME)
    elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
        # Direct path or URL to the JSON file itself.
        feature_extractor_file = pretrained_model_name_or_path
    else:
        # Anything else is treated as a model identifier.
        feature_extractor_file = hf_bucket_url(
            pretrained_model_name_or_path, filename=FEATURE_EXTRACTOR_NAME, revision=revision, mirror=None
        )

    try:
        # Load from URL or cache if already cached
        resolved_feature_extractor_file = cached_path(
            feature_extractor_file,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            local_files_only=local_files_only,
            use_auth_token=use_auth_token,
        )
        # Load feature_extractor dict
        with open(resolved_feature_extractor_file, "r", encoding="utf-8") as reader:
            text = reader.read()
        feature_extractor_dict = json.loads(text)

    except EnvironmentError as err:
        logger.error(err)
        msg = (
            f"Can't load feature extractor for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
            f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
            f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {FEATURE_EXTRACTOR_NAME} file\n\n"
        )
        # Chain explicitly so the root cause stays attached to the re-raised error.
        raise EnvironmentError(msg) from err

    except json.JSONDecodeError as err:
        # Only `json.loads` raises this, so `resolved_feature_extractor_file` is always bound here.
        msg = (
            f"Couldn't reach server at '{feature_extractor_file}' to download feature extractor configuration file or "
            "feature extractor configuration file is not a valid JSON file. "
            f"Please check network or file content here: {resolved_feature_extractor_file}."
        )
        raise EnvironmentError(msg) from err

    if resolved_feature_extractor_file == feature_extractor_file:
        logger.info(f"loading feature extractor configuration file {feature_extractor_file}")
    else:
        logger.info(
            f"loading feature extractor configuration file {feature_extractor_file} from cache at {resolved_feature_extractor_file}"
        )

    return feature_extractor_dict, kwargs
|
||||||
|
|
||||||
|
@classmethod
def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> "PreTrainedFeatureExtractor":
    """
    Instantiates a :class:`~transformers.PreTrainedFeatureExtractor` from a Python dictionary of parameters.

    Args:
        feature_extractor_dict (:obj:`Dict[str, Any]`):
            Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
            retrieved from a pretrained checkpoint by leveraging the
            :func:`~transformers.PreTrainedFeatureExtractor.to_dict` method.
        kwargs (:obj:`Dict[str, Any]`):
            Additional parameters from which to initialize the feature extractor object. Keys matching an existing
            attribute override it; the special key ``return_unused_kwargs`` controls the return value.

    Returns:
        :class:`~transformers.PreTrainedFeatureExtractor`: The feature extractor object instantiated from those
        parameters. When ``return_unused_kwargs`` is truthy, a ``(feature_extractor, unused_kwargs)`` tuple is
        returned instead.
    """
    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)

    feature_extractor = cls(**feature_extractor_dict)

    # Override attributes with matching kwargs. Iterate over a snapshot of the keys because consumed entries
    # are popped from ``kwargs`` while looping.
    for key in list(kwargs):
        if hasattr(feature_extractor, key):
            setattr(feature_extractor, key, kwargs.pop(key))

    logger.info(f"Feature extractor {feature_extractor}")
    if return_unused_kwargs:
        return feature_extractor, kwargs
    return feature_extractor
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict[str, Any]:
    """
    Serializes this instance to a Python dictionary.

    Returns:
        :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance. It is
        a deep copy of the instance ``__dict__``, so mutating it does not affect the instance.
    """
    return copy.deepcopy(self.__dict__)
|
||||||
|
|
||||||
|
@classmethod
def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PreTrainedFeatureExtractor":
    """
    Instantiates a :class:`~transformers.PreTrainedFeatureExtractor` from the path to a JSON file of parameters.

    Args:
        json_file (:obj:`str` or :obj:`os.PathLike`):
            Path to the JSON file containing the parameters.

    Returns:
        :class:`~transformers.PreTrainedFeatureExtractor`: The feature_extractor object instantiated from that JSON
        file.
    """
    with open(json_file, "r", encoding="utf-8") as reader:
        feature_extractor_dict = json.load(reader)
    # The top-level JSON object's keys are passed straight to the constructor as keyword arguments.
    return cls(**feature_extractor_dict)
|
||||||
|
|
||||||
|
def to_json_string(self) -> str:
    """
    Serializes this instance to a JSON string.

    Returns:
        :obj:`str`: String containing all the attributes that make up this feature_extractor instance in JSON
        format (2-space indent, keys sorted, trailing newline).
    """
    serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
    return serialized + "\n"
|
||||||
|
|
||||||
|
def to_json_file(self, json_file_path: Union[str, os.PathLike]):
    """
    Save this instance to a JSON file.

    Args:
        json_file_path (:obj:`str` or :obj:`os.PathLike`):
            Path to the JSON file in which this feature_extractor instance's parameters will be saved. An existing
            file is overwritten.
    """
    with open(json_file_path, "w", encoding="utf-8") as writer:
        writer.write(self.to_json_string())
|
||||||
|
|
||||||
|
def __repr__(self):
    """Return the class name followed by the JSON serialization of the instance."""
    return f"{type(self).__name__} {self.to_json_string()}"
|
||||||
|
|
||||||
|
def pad(
    self,
    processed_features: Union[
        BatchFeature,
        List[BatchFeature],
        Dict[str, BatchFeature],
        Dict[str, List[BatchFeature]],
        List[Dict[str, BatchFeature]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
    """
    Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the
    max sequence length in the batch.

    Padding side (left/right) padding values are defined at the feature extractor level (with
    ``self.padding_side``, ``self.padding_value``)

    .. note::

        If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
        the result will use the same type unless you provide a different tensor type with ``return_tensors``. In
        the case of PyTorch tensors, you will lose the specific device of your tensors however.

    Args:
        processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`):
            Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str,
            List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`,
            `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
            preprocessing as well as in a PyTorch Dataloader collate function.

            Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
            tensors), see the note above for the return type.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding
            index) among:

            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding
              if only a single sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length`. An
              error is raised if :obj:`max_length` is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
            >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
        return_attention_mask (:obj:`bool`, `optional`):
            Whether to return the attention mask. If left to the default, will return the attention mask according
            to the specific feature_extractor's default.

            `What are attention masks? <../glossary.html#attention-mask>`__
        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
            If set, will return tensors instead of list of python integers. Acceptable values are:

            * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
            * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
            * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.

    Returns:
        :class:`~transformers.BatchFeature`: The padded features. When the main input is empty, the (possibly
        converted) ``processed_features`` mapping is returned as is.

    Raises:
        ValueError: If the main model input is missing or its element type is not supported.
    """
    # If we have a list of dicts, let's convert it in a dict of lists
    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
    if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
        processed_features = {
            key: [example[key] for example in processed_features] for key in processed_features[0].keys()
        }

    # The model's main input name, usually `input_values`, has to be passed for padding
    main_input_name = self.model_input_names[0]
    if main_input_name not in processed_features:
        raise ValueError(
            "You should supply an instance of :class:`~transformers.BatchFeature` or list of :class:`~transformers.BatchFeature` to this method "
            f"that includes {main_input_name}, but you provided {list(processed_features.keys())}"
        )

    required_input = processed_features[main_input_name]
    return_attention_mask = (
        return_attention_mask if return_attention_mask is not None else self.return_attention_mask
    )

    # Explicit emptiness test: the truth value of a multi-element numpy array / tensor is ambiguous and raises.
    if required_input is None or len(required_input) == 0:
        if return_attention_mask:
            processed_features["attention_mask"] = []
        return processed_features

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
        # The scan is bounded so that a batch made only of empty samples does not run off the end.
        index = 0
        while index < len(required_input) and len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]
    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (float, int, list, tuple)):
        if is_tf_available() and _is_tensorflow(first_element):
            return_tensors = "tf" if return_tensors is None else return_tensors
        elif is_torch_available() and _is_torch(first_element):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                "Should be one of a python, numpy, pytorch or tensorflow object."
            )

        # Only values are replaced (no key is added or removed), so updating while iterating is safe.
        for key, value in processed_features.items():
            processed_features[key] = to_py_obj(value)

    # Convert padding_strategy in PaddingStrategy
    padding_strategy, max_length, _ = self._get_padding_strategies(padding=padding, max_length=max_length)

    required_input = processed_features[main_input_name]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        # Single, un-batched example: pad it directly.
        processed_features = self._pad(
            processed_features,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchFeature(processed_features, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in processed_features.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding_strategy == PaddingStrategy.LONGEST:
        # Resolve "longest" once for the whole batch, then pad every example to that fixed length.
        max_length = max(len(inputs) for inputs in required_input)
        padding_strategy = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = {k: v[i] for k, v in processed_features.items()}
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in outputs.items():
            batch_outputs.setdefault(key, []).append(value)

    return BatchFeature(batch_outputs, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
def _pad(
    self,
    processed_features: Union[Dict[str, List[float]], BatchFeature],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """
    Pad inputs (on left/right and up to predefined length or max length in the batch)

    The passed ``processed_features`` mapping is updated in place and returned.

    Args:
        processed_features: Dictionary of input values (`List[float]`) / input vectors (`List[List[float]]`) or batch of inputs values (`List[List[int]]`) / input vectors (`List[List[List[int]]]`)
        max_length: maximum length of the returned list and optionally padding length (see below)
        padding_strategy: PaddingStrategy to use for padding.

            - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
            - PaddingStrategy.MAX_LENGTH: Pad to the max length
            - PaddingStrategy.DO_NOT_PAD: Do not pad (default)
            The feature_extractor padding sides are defined in self.padding_side:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
        pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
            >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
        return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)

    Raises:
        ValueError: If padding is needed and ``self.padding_side`` is neither ``"left"`` nor ``"right"``.
    """
    input_name = self.model_input_names[0]
    required_input = processed_features[input_name]

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = len(required_input)

    # Round the target length up to the next multiple when requested.
    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

    if needs_to_be_padded:
        difference = max_length - len(required_input)
        # One padding "frame": a vector for multi-dimensional features, a scalar otherwise.
        padding_vector = self.feature_size * [self.padding_value] if self.feature_size > 1 else self.padding_value
        if self.padding_side == "right":
            if return_attention_mask:
                processed_features["attention_mask"] = [1] * len(required_input) + [0] * difference
            processed_features[input_name] = required_input + [padding_vector for _ in range(difference)]
        elif self.padding_side == "left":
            if return_attention_mask:
                processed_features["attention_mask"] = [0] * difference + [1] * len(required_input)
            processed_features[input_name] = [padding_vector for _ in range(difference)] + required_input
        else:
            # The offending value is the padding *side*, not the strategy.
            raise ValueError("Invalid padding side: " + str(self.padding_side))
    elif return_attention_mask and "attention_mask" not in processed_features:
        # Nothing to pad: every position is a real input.
        processed_features["attention_mask"] = [1] * len(required_input)

    return processed_features
|
||||||
|
|
||||||
|
def _get_padding_strategies(self, padding=False, max_length=None, pad_to_multiple_of=None, **kwargs):
    """
    Find the correct padding strategy

    Args:
        padding: :obj:`False` (no padding), :obj:`True` (pad to the longest sequence), a
            :class:`PaddingStrategy` member, or a value accepted by the :class:`PaddingStrategy` constructor.
        max_length: Target length; mandatory when the resolved strategy is ``PaddingStrategy.MAX_LENGTH``.
        pad_to_multiple_of: Accepted for signature compatibility; not used here.
        kwargs: Returned unchanged as the third element of the result.

    Returns:
        A ``(padding_strategy, max_length, kwargs)`` tuple.

    Raises:
        ValueError: If ``MAX_LENGTH`` padding is requested without ``max_length``, or if padding is requested but
        ``self.padding_value`` is :obj:`None`.
    """
    # Get padding strategy
    if padding is False:
        padding_strategy = PaddingStrategy.DO_NOT_PAD
    elif padding is True:
        padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
    elif isinstance(padding, PaddingStrategy):
        padding_strategy = padding
    else:
        padding_strategy = PaddingStrategy(padding)

    # Set max length if needed
    if max_length is None and padding_strategy == PaddingStrategy.MAX_LENGTH:
        raise ValueError(
            f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
        )

    # Test if we have a padding value
    if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
        raise ValueError(
            "Asking to pad but the feature_extractor does not have a padding value. "
            "Please select a value to use as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
        )

    return padding_strategy, max_length, kwargs
|
||||||
@@ -27,9 +27,10 @@ import shutil
|
|||||||
import sys
|
import sys
|
||||||
import tarfile
|
import tarfile
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict, UserDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import fields
|
from dataclasses import fields
|
||||||
|
from enum import Enum
|
||||||
from functools import partial, wraps
|
from functools import partial, wraps
|
||||||
from hashlib import sha256
|
from hashlib import sha256
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -211,6 +212,7 @@ TF2_WEIGHTS_NAME = "tf_model.h5"
|
|||||||
TF_WEIGHTS_NAME = "model.ckpt"
|
TF_WEIGHTS_NAME = "model.ckpt"
|
||||||
FLAX_WEIGHTS_NAME = "flax_model.msgpack"
|
FLAX_WEIGHTS_NAME = "flax_model.msgpack"
|
||||||
CONFIG_NAME = "config.json"
|
CONFIG_NAME = "config.json"
|
||||||
|
FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
|
||||||
MODEL_CARD_NAME = "modelcard.json"
|
MODEL_CARD_NAME = "modelcard.json"
|
||||||
|
|
||||||
SENTENCEPIECE_UNDERLINE = "▁"
|
SENTENCEPIECE_UNDERLINE = "▁"
|
||||||
@@ -1400,6 +1402,52 @@ def is_tensor(x):
|
|||||||
return isinstance(x, np.ndarray)
|
return isinstance(x, np.ndarray)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_numpy(x):
    """Return ``True`` if ``x`` is a :obj:`np.ndarray`."""
    return isinstance(x, np.ndarray)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_torch(x):
    """Return ``True`` if ``x`` is a :obj:`torch.Tensor`."""
    # Imported lazily so that importing this module does not require PyTorch.
    import torch

    return isinstance(x, torch.Tensor)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_torch_device(x):
    """Return ``True`` if ``x`` is a :obj:`torch.device`."""
    # Imported lazily so that importing this module does not require PyTorch.
    import torch

    return isinstance(x, torch.device)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_tensorflow(x):
    """Return ``True`` if ``x`` is a :obj:`tf.Tensor`."""
    # Imported lazily so that importing this module does not require TensorFlow.
    import tensorflow as tf

    return isinstance(x, tf.Tensor)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_jax(x):
    """Return ``True`` if ``x`` is a :obj:`jnp.ndarray`."""
    # Imported lazily so that importing this module does not require JAX.
    import jax.numpy as jnp  # noqa: F811

    return isinstance(x, jnp.ndarray)
|
||||||
|
|
||||||
|
|
||||||
|
def to_py_obj(obj):
    """
    Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.

    Dictionaries (including :obj:`UserDict`) are converted value by value into a plain :obj:`dict`; lists and
    tuples are converted element by element into a :obj:`list`. NumPy scalars are converted to the matching
    Python scalar. Any other object is returned unchanged.
    """
    if isinstance(obj, (dict, UserDict)):
        return {k: to_py_obj(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return [to_py_obj(o) for o in obj]
    elif is_tf_available() and _is_tensorflow(obj):
        return obj.numpy().tolist()
    elif is_torch_available() and _is_torch(obj):
        # Detach and move to CPU first so tensors that require grad or live on an accelerator convert cleanly.
        return obj.detach().cpu().tolist()
    elif isinstance(obj, (np.ndarray, np.number)):
        # `tolist` also works on NumPy scalars and 0-d arrays, yielding a Python scalar.
        return obj.tolist()
    else:
        return obj
|
||||||
|
|
||||||
|
|
||||||
class ModelOutput(OrderedDict):
|
class ModelOutput(OrderedDict):
|
||||||
"""
|
"""
|
||||||
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
|
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
|
||||||
@@ -1489,6 +1537,42 @@ class ModelOutput(OrderedDict):
|
|||||||
return tuple(self[k] for k in self.keys())
|
return tuple(self[k] for k in self.keys())
|
||||||
|
|
||||||
|
|
||||||
|
class ExplicitEnum(Enum):
    """
    Enum with more explicit error message for missing values.
    """

    @classmethod
    def _missing_(cls, value):
        # Called by ``Enum`` when ``value`` matches no member; list the accepted values in the error.
        valid_values = list(cls._value2member_map_.keys())
        raise ValueError(f"{value!r} is not a valid {cls.__name__}, please select one of {valid_values}")
|
||||||
|
|
||||||
|
|
||||||
|
class PaddingStrategy(ExplicitEnum):
    """
    Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
    in an IDE.
    """

    # Pad to the longest sequence in the batch.
    LONGEST = "longest"
    # Pad to an explicitly provided maximum length.
    MAX_LENGTH = "max_length"
    # Leave the sequences as they are.
    DO_NOT_PAD = "do_not_pad"
|
||||||
|
|
||||||
|
|
||||||
|
class TensorType(ExplicitEnum):
    """
    Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
    tab-completion in an IDE.
    """

    # The values are the short framework codes accepted as strings for ``return_tensors``.
    PYTORCH = "pt"
    TENSORFLOW = "tf"
    NUMPY = "np"
    JAX = "jax"
|
||||||
|
|
||||||
|
|
||||||
class _BaseLazyModule(ModuleType):
|
class _BaseLazyModule(ModuleType):
|
||||||
"""
|
"""
|
||||||
Module class that surfaces all objects but only performs associated imports when the objects are requested.
|
Module class that surfaces all objects but only performs associated imports when the objects are requested.
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ from ..roberta.tokenization_roberta import RobertaTokenizer
|
|||||||
from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer
|
from ..squeezebert.tokenization_squeezebert import SqueezeBertTokenizer
|
||||||
from ..tapas.tokenization_tapas import TapasTokenizer
|
from ..tapas.tokenization_tapas import TapasTokenizer
|
||||||
from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer
|
from ..transfo_xl.tokenization_transfo_xl import TransfoXLTokenizer
|
||||||
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2Tokenizer
|
from ..wav2vec2.tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
|
||||||
from ..xlm.tokenization_xlm import XLMTokenizer
|
from ..xlm.tokenization_xlm import XLMTokenizer
|
||||||
from .configuration_auto import (
|
from .configuration_auto import (
|
||||||
AlbertConfig,
|
AlbertConfig,
|
||||||
@@ -244,7 +244,7 @@ TOKENIZER_MAPPING = OrderedDict(
|
|||||||
(TapasConfig, (TapasTokenizer, None)),
|
(TapasConfig, (TapasTokenizer, None)),
|
||||||
(LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
|
(LEDConfig, (LEDTokenizer, LEDTokenizerFast)),
|
||||||
(ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)),
|
(ConvBertConfig, (ConvBertTokenizer, ConvBertTokenizerFast)),
|
||||||
(Wav2Vec2Config, (Wav2Vec2Tokenizer, None)),
|
(Wav2Vec2Config, (Wav2Vec2CTCTokenizer, None)),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -18,8 +18,8 @@
|
|||||||
import collections
|
import collections
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...file_utils import add_end_docstrings, add_start_docstrings
|
from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
|
||||||
from ...tokenization_utils_base import BatchEncoding, TensorType
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..bert.tokenization_bert import BertTokenizer
|
from ..bert.tokenization_bert import BertTokenizer
|
||||||
|
|
||||||
@@ -147,7 +147,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
|
|||||||
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
|
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
|
||||||
texts (:obj:`str` or :obj:`List[str]`):
|
texts (:obj:`str` or :obj:`List[str]`):
|
||||||
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
|
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
||||||
@@ -177,7 +177,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
|
|||||||
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
|
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
|
||||||
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
||||||
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
|
|||||||
@@ -18,8 +18,8 @@
|
|||||||
import collections
|
import collections
|
||||||
from typing import List, Optional, Union
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
from ...file_utils import add_end_docstrings, add_start_docstrings
|
from ...file_utils import TensorType, add_end_docstrings, add_start_docstrings
|
||||||
from ...tokenization_utils_base import BatchEncoding, TensorType
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ..bert.tokenization_bert_fast import BertTokenizerFast
|
from ..bert.tokenization_bert_fast import BertTokenizerFast
|
||||||
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
|
from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
|
||||||
@@ -148,7 +148,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
|
|||||||
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
|
The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
|
||||||
texts (:obj:`str` or :obj:`List[str]`):
|
texts (:obj:`str` or :obj:`List[str]`):
|
||||||
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
|
The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
||||||
@@ -178,7 +178,7 @@ CUSTOM_DPR_READER_DOCSTRING = r"""
|
|||||||
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
|
If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
|
||||||
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
length is required by one of the truncation/padding parameters. If the model has no specific maximum
|
||||||
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
input length (like XLNet) truncation/padding to a maximum length will be deactivated.
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
|
|||||||
@@ -573,7 +573,7 @@ class RagRetriever:
|
|||||||
The prefix used by the generator's tokenizer.
|
The prefix used by the generator's tokenizer.
|
||||||
n_docs (:obj:`int`, `optional`):
|
n_docs (:obj:`int`, `optional`):
|
||||||
The number of docs retrieved per query.
|
The number of docs retrieved per query.
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`, defaults to "pt"):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to "pt"):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
|
|||||||
@@ -28,16 +28,13 @@ from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ...file_utils import add_end_docstrings, is_pandas_available
|
from ...file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
|
||||||
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
|
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
|
||||||
from ...tokenization_utils_base import (
|
from ...tokenization_utils_base import (
|
||||||
ENCODE_KWARGS_DOCSTRING,
|
ENCODE_KWARGS_DOCSTRING,
|
||||||
BatchEncoding,
|
BatchEncoding,
|
||||||
EncodedInput,
|
EncodedInput,
|
||||||
ExplicitEnum,
|
|
||||||
PaddingStrategy,
|
|
||||||
PreTokenizedInput,
|
PreTokenizedInput,
|
||||||
TensorType,
|
|
||||||
TextInput,
|
TextInput,
|
||||||
)
|
)
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
@@ -151,7 +148,7 @@ def whitespace_tokenize(text):
|
|||||||
TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
||||||
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to encode the sequences with the special tokens relative to their model.
|
Whether or not to encode the sequences with the special tokens relative to their model.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||||
@@ -180,7 +177,7 @@ TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
|
|||||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||||
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
|
|||||||
@@ -22,7 +22,9 @@ from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_ava
|
|||||||
|
|
||||||
_import_structure = {
|
_import_structure = {
|
||||||
"configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"],
|
"configuration_wav2vec2": ["WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Wav2Vec2Config"],
|
||||||
"tokenization_wav2vec2": ["Wav2Vec2Tokenizer"],
|
"tokenization_wav2vec2": ["Wav2Vec2CTCTokenizer", "Wav2Vec2Tokenizer"],
|
||||||
|
"feature_extraction_wav2vec2": ["Wav2Vec2FeatureExtractor"],
|
||||||
|
"processing_wav2vec2": ["Wav2Vec2Processor"],
|
||||||
}
|
}
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -37,7 +39,9 @@ if is_torch_available():
|
|||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config
|
from .configuration_wav2vec2 import WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2Config
|
||||||
from .tokenization_wav2vec2 import Wav2Vec2Tokenizer
|
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
|
||||||
|
from .processing_wav2vec2 import Wav2Vec2Processor
|
||||||
|
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2Tokenizer
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
from .modeling_wav2vec2 import (
|
from .modeling_wav2vec2 import (
|
||||||
|
|||||||
192
src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
Normal file
192
src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Feature extractor class for Wav2Vec2
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from ...feature_extraction_utils import BatchFeature, PreTrainedFeatureExtractor
|
||||||
|
from ...file_utils import PaddingStrategy, TensorType
|
||||||
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2FeatureExtractor(PreTrainedFeatureExtractor):
    r"""
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from :class:`~transformers.PreTrainedFeatureExtractor` which contains most of the
    main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (:obj:`int`, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (:obj:`int`, defaults to 16000):
            The sampling rate at which the audio files should be digitalized, expressed in Hertz (Hz).
        padding_value (:obj:`float`, defaults to 0.0):
            The value that is used to fill the padding values.
        return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` should return
            :obj:`attention_mask`.

            .. note::

                Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
                <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
                :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
                :obj:`attention_mask` should be passed.

                For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
                <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
                passed for batched inference.
        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*, `wav2vec2-lv60
            <https://huggingface.co/models?search=lv60>`__.
    """

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size=1,
        sampling_rate=16000,
        padding_value=0.0,
        return_attention_mask=False,
        do_normalize=True,
        **kwargs
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    @staticmethod
    def zero_mean_unit_var_norm(input_values: List[np.ndarray]) -> List[np.ndarray]:
        """
        Normalize every array in the list independently to zero mean and unit variance.

        Args:
            input_values (:obj:`List[np.ndarray]`):
                The arrays to normalize. Each one is normalized with its own mean and variance.

        Returns:
            :obj:`List[np.ndarray]`: A new list holding the normalized arrays, in the same order.
        """
        # 1e-5 is added to the variance so the division stays finite for constant (zero-variance) input.
        return [(x - np.mean(x)) / np.sqrt(np.var(x) + 1e-5) for x in input_values]

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence if provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Maximum length of the returned list and optionally padding length (see above).
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            sampling_rate (:obj:`int`, `optional`):
                The sampling rate at which the ``raw_speech`` input was sampled. It is strongly recommended to pass
                ``sampling_rate`` at the forward call to prevent silent errors.
            **kwargs:
                Accepted for call-site compatibility; not used by this method.

        .. note::

            Whether an :obj:`attention_mask` is returned is not an argument of this method: the
            ``return_attention_mask`` value given to the constructor is forwarded to :meth:`pad`.

            `What are attention masks? <../glossary.html#attention-mask>`__

            Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
            <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
            :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
            :obj:`attention_mask` should be passed.

            For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
            <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
            passed for batched inference.

        Returns:
            :obj:`BatchFeature`: The result of :meth:`pad` applied to the (optionally normalized) ``input_values``.

        Raises:
            :obj:`ValueError`: If ``sampling_rate`` is given and differs from ``self.sampling_rate``.
        """

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                # The trailing space keeps the two sentences from running together once the literals are joined.
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of {self.sampling_rate}. "
                    f"Please make sure that the provided `raw_speech` input was sampled with {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        # A list/tuple whose first element is itself a sequence is treated as a batch of sequences.
        is_batched = isinstance(raw_speech, (list, tuple)) and isinstance(raw_speech[0], (np.ndarray, tuple, list))

        if is_batched:
            # Convert element by element so that a batch mixing arrays and plain lists ends up uniform.
            raw_speech = [speech if isinstance(speech, np.ndarray) else np.asarray(speech) for speech in raw_speech]
        else:
            if not isinstance(raw_speech, np.ndarray):
                raw_speech = np.asarray(raw_speech)
            # always return batch
            raw_speech = [raw_speech]

        # zero-mean and unit-variance normalization
        if self.do_normalize:
            raw_speech = self.zero_mean_unit_var_norm(raw_speech)

        # convert into correct format for padding
        encoded_inputs = BatchFeature({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=self.return_attention_mask,
            return_tensors=return_tensors,
        )

        return padded_inputs
|
||||||
@@ -616,9 +616,9 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
|
|||||||
input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
|
input_values (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`):
|
||||||
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
|
Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
|
||||||
into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
|
into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
|
||||||
soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Tokenizer` should
|
soundfile`). To prepare the array into `input_values`, the :class:`~transformers.Wav2Vec2Processor` should
|
||||||
be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
|
be used for padding and conversion into a tensor of type `torch.FloatTensor`. See
|
||||||
:meth:`transformers.Wav2Vec2Tokenizer.__call__` for details.
|
:meth:`transformers.Wav2Vec2Processor.__call__` for details.
|
||||||
attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
|
||||||
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
|
Mask to avoid performing convolution and attention on padding token indices. Mask values selected in ``[0,
|
||||||
1]``:
|
1]``:
|
||||||
@@ -629,8 +629,8 @@ WAV_2_VEC_2_INPUTS_DOCSTRING = r"""
|
|||||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||||
|
|
||||||
.. warning::
|
.. warning::
|
||||||
:obj:`attention_mask` should only be passed if the corresponding tokenizer has
|
:obj:`attention_mask` should only be passed if the corresponding processor has
|
||||||
``config.return_attention_mask == True``. For all models whose tokenizer has
|
``config.return_attention_mask == True``. For all models whose processor has
|
||||||
``config.return_attention_mask == False``, such as `wav2vec2-base
|
``config.return_attention_mask == False``, such as `wav2vec2-base
|
||||||
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, :obj:`attention_mask` should **not** be passed
|
<https://huggingface.co/facebook/wav2vec2-base-960h>`__, :obj:`attention_mask` should **not** be passed
|
||||||
to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should
|
to avoid degraded performance when doing batched inference. For such models :obj:`input_values` should
|
||||||
@@ -682,11 +682,11 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model
|
>>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
|
||||||
>>> from datasets import load_dataset
|
>>> from datasets import load_dataset
|
||||||
>>> import soundfile as sf
|
>>> import soundfile as sf
|
||||||
|
|
||||||
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
>>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
@@ -697,7 +697,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
||||||
>>> ds = ds.map(map_to_array)
|
>>> ds = ds.map(map_to_array)
|
||||||
|
|
||||||
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
||||||
>>> hidden_states = model(input_values).last_hidden_state
|
>>> hidden_states = model(input_values).last_hidden_state
|
||||||
"""
|
"""
|
||||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
@@ -780,11 +780,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2Model
|
>>> from transformers import Wav2Vec2Processor, Wav2Vec2Model
|
||||||
>>> from datasets import load_dataset
|
>>> from datasets import load_dataset
|
||||||
>>> import soundfile as sf
|
>>> import soundfile as sf
|
||||||
|
|
||||||
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
>>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
@@ -795,11 +795,11 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
|
|||||||
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
||||||
>>> ds = ds.map(map_to_array)
|
>>> ds = ds.map(map_to_array)
|
||||||
|
|
||||||
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
||||||
>>> logits = model(input_values).logits
|
>>> logits = model(input_values).logits
|
||||||
|
|
||||||
>>> predicted_ids = torch.argmax(logits, dim=-1)
|
>>> predicted_ids = torch.argmax(logits, dim=-1)
|
||||||
>>> transcription = tokenizer.decode(predicted_ids[0])
|
>>> transcription = processor.decode(predicted_ids[0])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
@@ -856,11 +856,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
|
|||||||
Example::
|
Example::
|
||||||
|
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
|
>>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
||||||
>>> from datasets import load_dataset
|
>>> from datasets import load_dataset
|
||||||
>>> import soundfile as sf
|
>>> import soundfile as sf
|
||||||
|
|
||||||
>>> tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
|
|
||||||
>>> def map_to_array(batch):
|
>>> def map_to_array(batch):
|
||||||
@@ -871,11 +871,11 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
|
|||||||
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
>>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
||||||
>>> ds = ds.map(map_to_array)
|
>>> ds = ds.map(map_to_array)
|
||||||
|
|
||||||
>>> input_values = tokenizer(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
>>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1
|
||||||
>>> logits = model(input_values).logits
|
>>> logits = model(input_values).logits
|
||||||
|
|
||||||
>>> predicted_ids = torch.argmax(logits, dim=-1)
|
>>> predicted_ids = torch.argmax(logits, dim=-1)
|
||||||
>>> transcription = tokenizer.decode(predicted_ids[0])
|
>>> transcription = processor.decode(predicted_ids[0])
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|||||||
142
src/transformers/models/wav2vec2/processing_wav2vec2.py
Normal file
142
src/transformers/models/wav2vec2/processing_wav2vec2.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 The HuggingFace Inc. team.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
Speech processor class for Wav2Vec2
|
||||||
|
"""
|
||||||
|
from contextlib import contextmanager
|
||||||
|
|
||||||
|
from .feature_extraction_wav2vec2 import Wav2Vec2FeatureExtractor
|
||||||
|
from .tokenization_wav2vec2 import Wav2Vec2CTCTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2Processor:
|
||||||
|
r"""
|
||||||
|
Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
|
||||||
|
processor.
|
||||||
|
|
||||||
|
:class:`~transformers.Wav2Vec2Processor` offers all the functionalities of
|
||||||
|
:class:`~transformers.Wav2Vec2FeatureExtractor` and :class:`~transformers.Wav2Vec2CTCTokenizer`. See the docstring
|
||||||
|
of :meth:`~transformers.Wav2Vec2Processor.__call__` and :meth:`~transformers.Wav2Vec2Processor.decode` for more
|
||||||
|
information.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
|
||||||
|
An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
|
||||||
|
tokenizer (:obj:`Wav2Vec2CTCTokenizer`):
|
||||||
|
An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, feature_extractor, tokenizer):
    """
    Store the two wrapped components after checking their types.

    Args:
        feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
            The feature extractor to wrap.
        tokenizer (:obj:`Wav2Vec2CTCTokenizer`):
            The tokenizer to wrap.

    Raises:
        :obj:`ValueError`: If either argument is not an instance of the expected class.
    """
    # Interpolate the expected class itself: `SomeClass.__class__` is the metaclass (`type`), which would make
    # the message read "has to be of type <class 'type'>".
    if not isinstance(feature_extractor, Wav2Vec2FeatureExtractor):
        raise ValueError(
            f"`feature_extractor` has to be of type {Wav2Vec2FeatureExtractor}, but is {type(feature_extractor)}"
        )
    if not isinstance(tokenizer, Wav2Vec2CTCTokenizer):
        raise ValueError(f"`tokenizer` has to be of type {Wav2Vec2CTCTokenizer}, but is {type(tokenizer)}")

    self.feature_extractor = feature_extractor
    self.tokenizer = tokenizer
    # Starts out pointing at the feature extractor.
    # NOTE(review): presumably the component `__call__` dispatches to -- confirm against `__call__`.
    self.current_processor = self.feature_extractor
|
||||||
|
|
||||||
|
def save_pretrained(self, save_directory):
    """
    Write both wrapped components (the Wav2Vec2 feature extractor and the Wav2Vec2 tokenizer) to
    ``save_directory``, so that the processor can be re-loaded using the
    :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.

    .. note::

        This method only delegates to :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
        docstrings of those methods for more information.

    Args:
        save_directory (:obj:`str` or :obj:`os.PathLike`):
            Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
            be created if it does not exist).
    """
    # Same order as before: feature extractor first, then tokenizer.
    for component in (self.feature_extractor, self.tokenizer):
        component.save_pretrained(save_directory)
|
||||||
|
|
||||||
|
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
    r"""
    Build a :class:`~transformers.Wav2Vec2Processor` by loading its feature extractor and its tokenizer from the
    same pretrained location.

    .. note::

        This class method only delegates to Wav2Vec2FeatureExtractor's
        :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Wav2Vec2CTCTokenizer's
        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
        docstrings of those methods for more information.

    Args:
        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            This can be either:

            - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing a feature extractor file saved using the
              :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
              ``./my_model_directory/``.
            - a path or url to a saved feature extractor JSON `file`, e.g.,
              ``./my_model_directory/feature_extraction_config.json``.
        **kwargs
            Additional keyword arguments; the same set is passed along to both
            :class:`~transformers.PreTrainedFeatureExtractor` and :class:`~transformers.PreTrainedTokenizer`.

    Returns:
        A new instance of ``cls`` wrapping the two loaded components.
    """
    # Keyword arguments are evaluated left to right, so the feature extractor is still loaded before the tokenizer.
    return cls(
        feature_extractor=Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs),
        tokenizer=Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs),
    )
|
||||||
|
|
||||||
|
def __call__(self, *args, **kwargs):
    """
    Forward all arguments to whichever component is currently active and return its result.

    The active component is ``self.current_processor``. Outside of
    :meth:`~transformers.Wav2Vec2Processor.as_target_processor` this is the feature extractor; inside that context
    it is the tokenizer. Please refer to the docstring of the respective ``__call__`` for more information.
    """
    active_processor = self.current_processor
    return active_processor(*args, **kwargs)
|
||||||
|
|
||||||
|
def batch_decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's ``batch_decode``.

    Every positional and keyword argument is passed through untouched and the tokenizer's return value is returned
    as is. Please refer to :meth:`~transformers.PreTrainedTokenizer.batch_decode` for more information.
    """
    tokenizer = self.tokenizer
    return tokenizer.batch_decode(*args, **kwargs)
|
||||||
|
|
||||||
|
def decode(self, *args, **kwargs):
    """
    Delegate to the wrapped tokenizer's ``decode``.

    Every positional and keyword argument is passed through untouched and the tokenizer's return value is returned
    as is. Please refer to :meth:`~transformers.PreTrainedTokenizer.decode` for more information.
    """
    tokenizer = self.tokenizer
    return tokenizer.decode(*args, **kwargs)
|
||||||
|
|
||||||
|
@contextmanager
def as_target_processor(self):
    """
    Temporarily route ``__call__`` to the tokenizer instead of the feature extractor.

    Useful for encoding the labels when fine-tuning Wav2Vec2. On leaving the ``with`` block -- normally or through
    an exception -- ``self.current_processor`` is set back to ``self.feature_extractor``.
    """
    self.current_processor = self.tokenizer
    try:
        yield
    finally:
        # Without the ``finally`` an exception raised inside the ``with`` body would leave the processor
        # permanently switched to the tokenizer.
        self.current_processor = self.feature_extractor
|
||||||
@@ -16,14 +16,16 @@
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
|
import warnings
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ...file_utils import add_end_docstrings
|
from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
|
||||||
from ...tokenization_utils import PreTrainedTokenizer
|
from ...tokenization_utils import PreTrainedTokenizer
|
||||||
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType
|
from ...tokenization_utils_base import BatchEncoding
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
|
|
||||||
|
|
||||||
@@ -37,7 +39,7 @@ VOCAB_FILES_NAMES = {
|
|||||||
|
|
||||||
|
|
||||||
WAV2VEC2_KWARGS_DOCSTRING = r"""
|
WAV2VEC2_KWARGS_DOCSTRING = r"""
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||||
@@ -55,7 +57,7 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
|
|||||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||||
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
@@ -66,6 +68,207 @@ WAV2VEC2_KWARGS_DOCSTRING = r"""
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
    """
    Constructs a Wav2Vec2CTC tokenizer.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
    Users should refer to the superclass for more information regarding such methods.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sentence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for defining the end of a word.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to accept lowercase input and lowercase the output when decoding.

        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = {
        "vocab_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
        },
        "tokenizer_config_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
        },
    }
    # Wav2Vec2 has no max input length
    max_model_input_sizes = {"facebook/wav2vec2-base-960h": sys.maxsize}
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False,
        **kwargs
    ):
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            do_lower_case=do_lower_case,
            word_delimiter_token=word_delimiter_token,
            **kwargs,
        )

        self._word_delimiter_token = word_delimiter_token

        self.do_lower_case = do_lower_case

        # The vocabulary file is a JSON object mapping token -> id; the reverse mapping is built once here.
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def word_delimiter_token(self) -> Optional[str]:
        """
        :obj:`str`: Word delimiter token. Returns :obj:`None` if it has not been set, logging an error when
        ``self.verbose`` is true.
        """
        if self._word_delimiter_token is None:
            # Always return None for an unset token; only the log message depends on verbosity. Falling through
            # to ``str(None)`` would hand callers the literal string "None".
            if self.verbose:
                logger.error("Using word_delimiter_token, but it is not set yet.")
            return None
        return str(self._word_delimiter_token)

    @property
    def word_delimiter_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._word_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.word_delimiter_token)

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        self._word_delimiter_token = value

    @word_delimiter_token_id.setter
    def word_delimiter_token_id(self, value):
        # ``value`` is an id, so it has to be mapped back to its token string before being stored;
        # ``None`` keeps its meaning of "unset".
        self._word_delimiter_token = None if value is None else self.convert_ids_to_tokens(value)

    @property
    def vocab_size(self) -> int:
        """:obj:`int`: Number of entries in the base vocabulary (added tokens are not counted)."""
        return len(self.decoder)

    def get_vocab(self) -> Dict:
        """Return the base vocabulary merged with the added tokens as a single token -> id dictionary."""
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text, **kwargs):
        """
        Converts a string in a sequence of tokens (string), using the tokenizer.

        Each character becomes one token and every space is replaced by the word delimiter token.
        """
        if self.do_lower_case:
            # NOTE(review): upper-casing here mirrors the ``lower()`` applied when decoding; presumably the
            # vocabulary itself is upper-case -- confirm against the vocab file.
            text = text.upper()

        return list(text.replace(" ", self.word_delimiter_token))

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) in an index (integer) using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(
        self, tokens: List[str], group_tokens: bool = True, spaces_between_special_tokens: bool = False
    ) -> str:
        """
        Converts a connectionist-temporal-classification (CTC) output tokens into a single string.

        Args:
            tokens (:obj:`List[str]`):
                The token sequence to convert.
            group_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether to collapse runs of identical consecutive tokens into one token first.
            spaces_between_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether to join the remaining tokens with a space instead of the empty string.

        Returns:
            :obj:`str`: The joined, stripped string (lower-cased when ``do_lower_case`` is set).
        """
        # group same tokens into non-repeating tokens in CTC style decoding
        if group_tokens:
            tokens = [token_group[0] for token_group in groupby(tokens)]

        # filter self.pad_token which is used as CTC-blank token
        filtered_tokens = [token for token in tokens if token != self.pad_token]

        join_token = " " if spaces_between_special_tokens else ""

        # replace delimiter token
        string = join_token.join(
            " " if token == self.word_delimiter_token else token for token in filtered_tokens
        ).strip()

        if self.do_lower_case:
            string = string.lower()
        return string

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        """Prepend a space to ``text`` when ``is_split_into_words`` is set and return ``(text, kwargs)``."""
        if is_split_into_words:
            text = " " + text
        return (text, kwargs)

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
        group_tokens: bool = True,
        spaces_between_special_tokens: bool = False,
    ) -> str:
        """
        special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the
        same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
        the whole token list and not individually on added tokens
        """
        # ``skip_special_tokens`` is forwarded here. The former per-token re-check compared token *strings*
        # against ``all_special_ids`` (ints) and could never match, so it has been removed.
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        text = self.convert_tokens_to_string(
            list(filtered_tokens),
            group_tokens=group_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )

        if clean_up_tokenization_spaces:
            return self.clean_up_tokenization(text)
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the base vocabulary (``self.encoder``) as JSON into ``save_directory``.

        Args:
            save_directory (:obj:`str`):
                Existing directory to write into. If it is not a directory an error is logged and nothing is
                written (the method then returns :obj:`None`).
            filename_prefix (:obj:`str`, `optional`):
                When given, ``"<prefix>-"`` is prepended to the vocabulary file name.

        Returns:
            :obj:`Tuple[str]`: One-element tuple holding the path of the written vocabulary file.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        return (vocab_file,)
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
||||||
"""
|
"""
|
||||||
Constructs a Wav2Vec2 tokenizer.
|
Constructs a Wav2Vec2 tokenizer.
|
||||||
@@ -146,6 +349,12 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
|
|||||||
word_delimiter_token=word_delimiter_token,
|
word_delimiter_token=word_delimiter_token,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
warnings.warn(
|
||||||
|
"The class `Wav2Vec2Tokenizer` is deprecated and will be removed in version 5 of Transformers. Please use `Wav2Vec2Processor` or `Wav2Vec2CTCTokenizer` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
self._word_delimiter_token = word_delimiter_token
|
self._word_delimiter_token = word_delimiter_token
|
||||||
|
|
||||||
self.do_lower_case = do_lower_case
|
self.do_lower_case = do_lower_case
|
||||||
|
|||||||
@@ -4,10 +4,9 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
|
from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
|
||||||
from ..file_utils import add_end_docstrings, is_tf_available, is_torch_available
|
from ..file_utils import PaddingStrategy, add_end_docstrings, is_tf_available, is_torch_available
|
||||||
from ..modelcard import ModelCard
|
from ..modelcard import ModelCard
|
||||||
from ..tokenization_utils import PreTrainedTokenizer
|
from ..tokenization_utils import PreTrainedTokenizer
|
||||||
from ..tokenization_utils_base import PaddingStrategy
|
|
||||||
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
|
|||||||
Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
|
Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
|
||||||
inference to be done sequentially to extract relations within sequences, given their conversational
|
inference to be done sequentially to extract relations within sequences, given their conversational
|
||||||
nature.
|
nature.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import re
|
|||||||
import unicodedata
|
import unicodedata
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union, overload
|
from typing import Any, Dict, List, Optional, Tuple, Union, overload
|
||||||
|
|
||||||
from .file_utils import add_end_docstrings
|
from .file_utils import PaddingStrategy, TensorType, add_end_docstrings
|
||||||
from .tokenization_utils_base import (
|
from .tokenization_utils_base import (
|
||||||
ENCODE_KWARGS_DOCSTRING,
|
ENCODE_KWARGS_DOCSTRING,
|
||||||
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
|
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
|
||||||
@@ -30,11 +30,9 @@ from .tokenization_utils_base import (
|
|||||||
BatchEncoding,
|
BatchEncoding,
|
||||||
EncodedInput,
|
EncodedInput,
|
||||||
EncodedInputPair,
|
EncodedInputPair,
|
||||||
PaddingStrategy,
|
|
||||||
PreTokenizedInput,
|
PreTokenizedInput,
|
||||||
PreTokenizedInputPair,
|
PreTokenizedInputPair,
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
TensorType,
|
|
||||||
TextInput,
|
TextInput,
|
||||||
TextInputPair,
|
TextInputPair,
|
||||||
TruncationStrategy,
|
TruncationStrategy,
|
||||||
|
|||||||
@@ -25,7 +25,6 @@ import warnings
|
|||||||
from collections import OrderedDict, UserDict
|
from collections import OrderedDict, UserDict
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from enum import Enum
|
|
||||||
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -33,6 +32,14 @@ import numpy as np
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from .file_utils import (
|
from .file_utils import (
|
||||||
|
ExplicitEnum,
|
||||||
|
PaddingStrategy,
|
||||||
|
TensorType,
|
||||||
|
_is_jax,
|
||||||
|
_is_numpy,
|
||||||
|
_is_tensorflow,
|
||||||
|
_is_torch,
|
||||||
|
_is_torch_device,
|
||||||
add_end_docstrings,
|
add_end_docstrings,
|
||||||
cached_path,
|
cached_path,
|
||||||
hf_bucket_url,
|
hf_bucket_url,
|
||||||
@@ -41,6 +48,7 @@ from .file_utils import (
|
|||||||
is_tf_available,
|
is_tf_available,
|
||||||
is_tokenizers_available,
|
is_tokenizers_available,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
|
to_py_obj,
|
||||||
torch_required,
|
torch_required,
|
||||||
)
|
)
|
||||||
from .utils import logging
|
from .utils import logging
|
||||||
@@ -55,34 +63,6 @@ if TYPE_CHECKING:
|
|||||||
import jax.numpy as jnp # noqa: F401
|
import jax.numpy as jnp # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
def _is_numpy(x):
|
|
||||||
return isinstance(x, np.ndarray)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_torch(x):
|
|
||||||
import torch
|
|
||||||
|
|
||||||
return isinstance(x, torch.Tensor)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_torch_device(x):
|
|
||||||
import torch
|
|
||||||
|
|
||||||
return isinstance(x, torch.device)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_tensorflow(x):
|
|
||||||
import tensorflow as tf
|
|
||||||
|
|
||||||
return isinstance(x, tf.Tensor)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_jax(x):
|
|
||||||
import jax.numpy as jnp # noqa: F811
|
|
||||||
|
|
||||||
return isinstance(x, jnp.ndarray)
|
|
||||||
|
|
||||||
|
|
||||||
if is_tokenizers_available():
|
if is_tokenizers_available():
|
||||||
from tokenizers import AddedToken
|
from tokenizers import AddedToken
|
||||||
from tokenizers import Encoding as EncodingFast
|
from tokenizers import Encoding as EncodingFast
|
||||||
@@ -134,19 +114,6 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
|
|||||||
FULL_TOKENIZER_FILE = "tokenizer.json"
|
FULL_TOKENIZER_FILE = "tokenizer.json"
|
||||||
|
|
||||||
|
|
||||||
class ExplicitEnum(Enum):
|
|
||||||
"""
|
|
||||||
Enum with more explicit error message for missing values.
|
|
||||||
"""
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def _missing_(cls, value):
|
|
||||||
raise ValueError(
|
|
||||||
"%r is not a valid %s, please select one of %s"
|
|
||||||
% (value, cls.__name__, str(list(cls._value2member_map_.keys())))
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TruncationStrategy(ExplicitEnum):
|
class TruncationStrategy(ExplicitEnum):
|
||||||
"""
|
"""
|
||||||
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
||||||
@@ -159,29 +126,6 @@ class TruncationStrategy(ExplicitEnum):
|
|||||||
DO_NOT_TRUNCATE = "do_not_truncate"
|
DO_NOT_TRUNCATE = "do_not_truncate"
|
||||||
|
|
||||||
|
|
||||||
class PaddingStrategy(ExplicitEnum):
|
|
||||||
"""
|
|
||||||
Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
|
|
||||||
in an IDE.
|
|
||||||
"""
|
|
||||||
|
|
||||||
LONGEST = "longest"
|
|
||||||
MAX_LENGTH = "max_length"
|
|
||||||
DO_NOT_PAD = "do_not_pad"
|
|
||||||
|
|
||||||
|
|
||||||
class TensorType(ExplicitEnum):
|
|
||||||
"""
|
|
||||||
Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
|
|
||||||
tab-completion in an IDE.
|
|
||||||
"""
|
|
||||||
|
|
||||||
PYTORCH = "pt"
|
|
||||||
TENSORFLOW = "tf"
|
|
||||||
NUMPY = "np"
|
|
||||||
JAX = "jax"
|
|
||||||
|
|
||||||
|
|
||||||
class CharSpan(NamedTuple):
|
class CharSpan(NamedTuple):
|
||||||
"""
|
"""
|
||||||
Character span in the original string.
|
Character span in the original string.
|
||||||
@@ -208,24 +152,6 @@ class TokenSpan(NamedTuple):
|
|||||||
end: int
|
end: int
|
||||||
|
|
||||||
|
|
||||||
def to_py_obj(obj):
|
|
||||||
"""
|
|
||||||
Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list.
|
|
||||||
"""
|
|
||||||
if isinstance(obj, (dict, BatchEncoding)):
|
|
||||||
return {k: to_py_obj(v) for k, v in obj.items()}
|
|
||||||
elif isinstance(obj, (list, tuple)):
|
|
||||||
return [to_py_obj(o) for o in obj]
|
|
||||||
elif is_tf_available() and _is_tensorflow(obj):
|
|
||||||
return obj.numpy().tolist()
|
|
||||||
elif is_torch_available() and _is_torch(obj):
|
|
||||||
return obj.detach().cpu().tolist()
|
|
||||||
elif isinstance(obj, np.ndarray):
|
|
||||||
return obj.tolist()
|
|
||||||
else:
|
|
||||||
return obj
|
|
||||||
|
|
||||||
|
|
||||||
class BatchEncoding(UserDict):
|
class BatchEncoding(UserDict):
|
||||||
"""
|
"""
|
||||||
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
|
Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
|
||||||
@@ -715,9 +641,9 @@ class BatchEncoding(UserDict):
|
|||||||
Convert the inner content to tensors.
|
Convert the inner content to tensors.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tensor_type (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
The type of tensors to use. If :obj:`str`, should be one of the values of the enum
|
The type of tensors to use. If :obj:`str`, should be one of the values of the enum
|
||||||
:class:`~transformers.tokenization_utils_base.TensorType`. If :obj:`None`, no modification is done.
|
:class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
|
||||||
prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
|
prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
|
||||||
Whether or not to add the batch dimension during the conversion.
|
Whether or not to add the batch dimension during the conversion.
|
||||||
"""
|
"""
|
||||||
@@ -810,9 +736,7 @@ class BatchEncoding(UserDict):
|
|||||||
if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
|
if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
|
||||||
self.data = {k: v.to(device=device) for k, v in self.data.items()}
|
self.data = {k: v.to(device=device) for k, v in self.data.items()}
|
||||||
else:
|
else:
|
||||||
logger.warning(
|
logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
|
||||||
f"Attempting to cast a BatchEncoding to another type, {str(device)}. This is not supported."
|
|
||||||
)
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
@@ -1321,7 +1245,7 @@ class SpecialTokensMixin:
|
|||||||
ENCODE_KWARGS_DOCSTRING = r"""
|
ENCODE_KWARGS_DOCSTRING = r"""
|
||||||
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||||
Whether or not to encode the sequences with the special tokens relative to their model.
|
Whether or not to encode the sequences with the special tokens relative to their model.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||||
@@ -1362,7 +1286,7 @@ ENCODE_KWARGS_DOCSTRING = r"""
|
|||||||
pad_to_multiple_of (:obj:`int`, `optional`):
|
pad_to_multiple_of (:obj:`int`, `optional`):
|
||||||
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
|
||||||
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
@@ -2608,7 +2532,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
|
|
||||||
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
|
||||||
see the note above for the return type.
|
see the note above for the return type.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
||||||
index) among:
|
index) among:
|
||||||
|
|
||||||
@@ -2630,7 +2554,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
|
||||||
|
|
||||||
`What are attention masks? <../glossary.html#attention-mask>`__
|
`What are attention masks? <../glossary.html#attention-mask>`__
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
@@ -3260,7 +3184,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
max_target_length (:obj:`int`, `optional`):
|
max_target_length (:obj:`int`, `optional`):
|
||||||
Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
|
Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
|
||||||
to :obj:`None`, this will use the max_length value.
|
to :obj:`None`, this will use the max_length value.
|
||||||
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
|
||||||
Activates and controls padding. Accepts the following values:
|
Activates and controls padding. Accepts the following values:
|
||||||
|
|
||||||
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
|
||||||
@@ -3269,7 +3193,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
maximum acceptable input length for the model if that argument is not provided.
|
maximum acceptable input length for the model if that argument is not provided.
|
||||||
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
|
||||||
different lengths).
|
different lengths).
|
||||||
return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
|
return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
|
||||||
If set, will return tensors instead of list of python integers. Acceptable values are:
|
If set, will return tensors instead of list of python integers. Acceptable values are:
|
||||||
|
|
||||||
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
|
||||||
|
|||||||
@@ -27,13 +27,12 @@ from tokenizers import Tokenizer as TokenizerFast
|
|||||||
from tokenizers.decoders import Decoder as DecoderFast
|
from tokenizers.decoders import Decoder as DecoderFast
|
||||||
|
|
||||||
from .convert_slow_tokenizer import convert_slow_tokenizer
|
from .convert_slow_tokenizer import convert_slow_tokenizer
|
||||||
from .file_utils import add_end_docstrings
|
from .file_utils import PaddingStrategy, add_end_docstrings
|
||||||
from .tokenization_utils import PreTrainedTokenizer
|
from .tokenization_utils import PreTrainedTokenizer
|
||||||
from .tokenization_utils_base import (
|
from .tokenization_utils_base import (
|
||||||
INIT_TOKENIZER_DOCSTRING,
|
INIT_TOKENIZER_DOCSTRING,
|
||||||
AddedToken,
|
AddedToken,
|
||||||
BatchEncoding,
|
BatchEncoding,
|
||||||
PaddingStrategy,
|
|
||||||
PreTokenizedInput,
|
PreTokenizedInput,
|
||||||
PreTokenizedInputPair,
|
PreTokenizedInputPair,
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
@@ -308,7 +307,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
|||||||
section.
|
section.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
padding_strategy (:class:`~transformers.tokenization_utils_base.PaddingStrategy`):
|
padding_strategy (:class:`~transformers.file_utils.PaddingStrategy`):
|
||||||
The kind of padding that will be applied to the input
|
The kind of padding that will be applied to the input
|
||||||
truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
|
truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
|
||||||
The kind of truncation that will be applied to the input
|
The kind of truncation that will be applied to the input
|
||||||
|
|||||||
@@ -29,13 +29,13 @@ from typing import Any, Dict, NamedTuple, Optional, Tuple, Union
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .file_utils import (
|
from .file_utils import (
|
||||||
|
ExplicitEnum,
|
||||||
is_sagemaker_distributed_available,
|
is_sagemaker_distributed_available,
|
||||||
is_tf_available,
|
is_tf_available,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_torch_cuda_available,
|
is_torch_cuda_available,
|
||||||
is_torch_tpu_available,
|
is_torch_tpu_available,
|
||||||
)
|
)
|
||||||
from .tokenization_utils_base import ExplicitEnum
|
|
||||||
|
|
||||||
|
|
||||||
def set_seed(seed: int):
|
def set_seed(seed: int):
|
||||||
|
|||||||
284
tests/test_feature_extraction_common.py
Normal file
284
tests/test_feature_extraction_common.py
Normal file
@@ -0,0 +1,284 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from transformers import BatchFeature
|
||||||
|
from transformers.testing_utils import require_tf, require_torch
|
||||||
|
|
||||||
|
|
||||||
|
class FeatureExtractionMixin:
    """Shared test cases for feature extractor classes.

    Concrete test classes mix this in next to ``unittest.TestCase`` and set
    ``feature_extraction_class`` plus ``feat_extract_tester``. The tester object is
    expected to expose ``prepare_feat_extract_dict()``, ``prepare_inputs_for_common()``
    and the attributes ``batch_size``, ``feature_size``, ``min_seq_length``,
    ``max_seq_length`` and ``seq_length_diff``, all of which are read below.
    """

    # to overwrite in feature-extractor-specific tests
    feat_extract_tester = None
    feature_extraction_class = None

    @property
    def feat_extract_dict(self):
        """Keyword arguments used to build the feature extractor under test."""
        return self.feat_extract_tester.prepare_feat_extract_dict()

    def test_feat_extract_common_properties(self):
        """The extractor exposes ``feature_size``, ``sampling_rate`` and ``padding_value``."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        self.assertTrue(hasattr(feat_extract, "feature_size"))
        self.assertTrue(hasattr(feat_extract, "sampling_rate"))
        self.assertTrue(hasattr(feat_extract, "padding_value"))

    def test_feat_extract_to_json_string(self):
        """Every init argument shows up unchanged in the JSON string dump."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        obj = json.loads(feat_extract.to_json_string())
        for key, value in self.feat_extract_dict.items():
            self.assertEqual(obj[key], value)

    def test_feat_extract_to_json_file(self):
        """Writing to a JSON file and reading it back yields an equal ``to_dict()``."""
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
            feat_extract_first.to_json_file(json_file_path)
            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)

        self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict())

    def test_feat_extract_from_and_save_pretrained(self):
        """``save_pretrained`` followed by ``from_pretrained`` yields an equal ``to_dict()``."""
        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)

        with tempfile.TemporaryDirectory() as tmpdirname:
            feat_extract_first.save_pretrained(tmpdirname)
            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)

        self.assertEqual(feat_extract_second.to_dict(), feat_extract_first.to_dict())

    def test_init_without_params(self):
        """The extractor class can be instantiated without any argument."""
        feat_extract = self.feature_extraction_class()
        self.assertIsNotNone(feat_extract)

    def test_batch_feature(self):
        """``BatchFeature`` keeps per-example lengths and converts equal-length input to numpy."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))

        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")

        batch_features_input = processed_features[input_name]

        # add a trailing feature axis when the converted batch has fewer than 3 dims
        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
        )

    @require_torch
    def test_batch_feature_pt(self):
        """Same shape check as ``test_batch_feature`` with ``tensor_type="pt"``."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")

        batch_features_input = processed_features[input_name]

        # add a trailing feature axis when the converted batch has fewer than 3 dims
        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
        )

    @require_tf
    def test_batch_feature_tf(self):
        """Same shape check as ``test_batch_feature`` with ``tensor_type="tf"``."""
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(equal_length=True)
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="tf")

        batch_features_input = processed_features[input_name]

        # add a trailing feature axis when the converted batch has fewer than 3 dims
        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.feature_size)
        )

    def _check_padding(self, numpify=False):
        """Exercise ``pad`` with every padding mode, on lists or numpy arrays.

        Args:
            numpify: forwarded to ``prepare_inputs_for_common``; the two public tests
                below call this helper with ``False`` and ``True`` respectively.
        """

        def _inputs_have_equal_length(inputs):
            # True when every element has the same length as the first one
            length = len(inputs[0])
            for input_slice in inputs[1:]:
                if len(input_slice) != length:
                    return False
            return True

        def _inputs_are_equal(input_1, input_2):
            # element-wise comparison of two batches with an absolute tolerance of 1e-3
            if len(input_1) != len(input_2):
                return False

            for input_slice_1, input_slice_2 in zip(input_1, input_2):
                if not np.allclose(np.asarray(input_slice_1), np.asarray(input_slice_2), atol=1e-3):
                    return False
            return True

        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common(numpify=numpify)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        pad_diff = self.feat_extract_tester.seq_length_diff
        pad_max_length = self.feat_extract_tester.max_seq_length + pad_diff
        pad_min_length = self.feat_extract_tester.min_seq_length
        batch_size = self.feat_extract_tester.batch_size
        feature_size = self.feat_extract_tester.feature_size

        # test padding for List[int] + numpy
        input_1 = feat_extract.pad(processed_features, padding=False)[input_name]
        input_2 = feat_extract.pad(processed_features, padding="longest")[input_name]
        input_3 = feat_extract.pad(processed_features, padding="max_length", max_length=len(speech_inputs[-1]))[
            input_name
        ]
        input_4 = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]

        # max_length parameter has to be provided when setting `padding="max_length"`
        with self.assertRaises(ValueError):
            feat_extract.pad(processed_features, padding="max_length")[input_name]

        input_5 = feat_extract.pad(
            processed_features, padding="max_length", max_length=pad_max_length, return_tensors="np"
        )[input_name]

        self.assertFalse(_inputs_have_equal_length(input_1))
        self.assertTrue(_inputs_have_equal_length(input_2))
        self.assertTrue(_inputs_have_equal_length(input_3))
        self.assertTrue(_inputs_are_equal(input_2, input_3))
        self.assertTrue(len(input_1[0]) == pad_min_length)
        self.assertTrue(len(input_1[1]) == pad_min_length + pad_diff)
        self.assertTrue(input_4.shape[:2] == (batch_size, len(input_3[0])))
        self.assertTrue(input_5.shape[:2] == (batch_size, pad_max_length))

        if feature_size > 1:
            self.assertTrue(input_4.shape[2] == input_5.shape[2] == feature_size)

        # test padding for `pad_to_multiple_of` for List[int] + numpy
        input_6 = feat_extract.pad(processed_features, pad_to_multiple_of=10)[input_name]
        input_7 = feat_extract.pad(processed_features, padding="longest", pad_to_multiple_of=10)[input_name]
        input_8 = feat_extract.pad(
            processed_features, padding="max_length", pad_to_multiple_of=10, max_length=pad_max_length
        )[input_name]
        input_9 = feat_extract.pad(
            processed_features,
            padding="max_length",
            pad_to_multiple_of=10,
            max_length=pad_max_length,
            return_tensors="np",
        )[input_name]

        self.assertTrue(all(len(x) % 10 == 0 for x in input_6))
        self.assertTrue(_inputs_are_equal(input_6, input_7))

        # round pad_max_length up to the next multiple of 10
        expected_mult_pad_length = pad_max_length if pad_max_length % 10 == 0 else (pad_max_length // 10 + 1) * 10
        self.assertTrue(all(len(x) == expected_mult_pad_length for x in input_8))
        # assertEqual, not assertTrue(a, b): with assertTrue the second argument is only
        # the failure message, so the shape would never actually be compared
        self.assertEqual(input_9.shape[:2], (batch_size, expected_mult_pad_length))

        if feature_size > 1:
            self.assertTrue(input_9.shape[2] == feature_size)

        # Check padding value is correct
        # NOTE(review): the expected counts below use pad_max_length even for the
        # "longest"-padded input_2; that looks exact only when the padding vector sums
        # to 0 or the longest input already has pad_max_length frames - confirm.
        padding_vector_sum = (np.ones(self.feat_extract_tester.feature_size) * feat_extract.padding_value).sum()
        self.assertTrue(
            abs(np.asarray(input_2[0])[pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length))
            < 1e-3
        )
        self.assertTrue(
            abs(
                np.asarray(input_2[1])[pad_min_length + pad_diff :].sum()
                - padding_vector_sum * (pad_max_length - pad_min_length - pad_diff)
            )
            < 1e-3
        )
        self.assertTrue(
            abs(
                np.asarray(input_2[2])[pad_min_length + 2 * pad_diff :].sum()
                - padding_vector_sum * (pad_max_length - pad_min_length - 2 * pad_diff)
            )
            < 1e-3
        )
        self.assertTrue(
            abs(input_5[0, pad_min_length:].sum() - padding_vector_sum * (pad_max_length - pad_min_length)) < 1e-3
        )
        self.assertTrue(
            abs(input_9[0, pad_min_length:].sum() - padding_vector_sum * (expected_mult_pad_length - pad_min_length))
            < 1e-3
        )

    def test_padding_from_list(self):
        """Run the padding checks on plain Python list inputs."""
        self._check_padding(numpify=False)

    def test_padding_from_array(self):
        """Run the padding checks on numpy array inputs."""
        self._check_padding(numpify=True)

    @require_torch
    def test_padding_accepts_tensors_pt(self):
        """Padding to PyTorch tensors gives the same total sum as padding to numpy."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
        input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]

        self.assertTrue(abs(input_np.sum() - input_pt.numpy().sum()) < 1e-2)

    @require_tf
    def test_padding_accepts_tensors_tf(self):
        """Padding to TensorFlow tensors gives the same total sum as padding to numpy."""
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
        input_tf = feat_extract.pad(processed_features, padding="longest", return_tensors="tf")[input_name]

        self.assertTrue(abs(input_np.sum() - input_tf.numpy().sum()) < 1e-2)

    def test_attention_mask(self):
        """With ``return_attention_mask=True`` the mask matches the padded shape and input lengths."""
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_common()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})

        processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
        self.assertIn("attention_mask", processed)
        self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
        # the mask sums per row are compared with the original, unpadded lengths
        self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths)
|
||||||
147
tests/test_feature_extraction_wav2vec2.py
Normal file
147
tests/test_feature_extraction_wav2vec2.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2021 HuggingFace Inc.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
import random
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from transformers import WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST, Wav2Vec2Config, Wav2Vec2FeatureExtractor
|
||||||
|
from transformers.testing_utils import slow
|
||||||
|
|
||||||
|
from .test_feature_extraction_common import FeatureExtractionMixin
|
||||||
|
|
||||||
|
|
||||||
|
global_rng = random.Random()


def floats_list(shape, scale=1.0, rng=None, name=None):
    """Return a nested list of random floats.

    The result has ``shape[0]`` rows of ``shape[1]`` values each; every value is
    ``rng.random() * scale``. When ``rng`` is ``None`` the module-level
    ``global_rng`` is used. ``name`` is accepted but not used.
    """
    generator = global_rng if rng is None else rng
    return [[generator.random() * scale for _ in range(shape[1])] for _ in range(shape[0])]
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2FeatureExtractionTester(unittest.TestCase):
    """Holds the configuration and builds the inputs used by the Wav2Vec2 feature extraction tests."""

    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=1,
        padding_value=0.0,
        sampling_rate=16000,
        return_attention_mask=True,
        do_normalize=True,
    ):
        self.parent = parent

        # batch geometry
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        # step between the lengths of consecutive examples in an unequal-length batch
        length_span = self.max_seq_length - self.min_seq_length
        self.seq_length_diff = length_span // (self.batch_size - 1)

        # feature extractor settings
        self.feature_size = feature_size
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.return_attention_mask = return_attention_mask
        self.do_normalize = do_normalize

    def prepare_feat_extract_dict(self):
        """Return the feature extractor settings stored on this tester as a dict."""
        feat_extract_kwargs = {}
        feat_extract_kwargs["feature_size"] = self.feature_size
        feat_extract_kwargs["padding_value"] = self.padding_value
        feat_extract_kwargs["sampling_rate"] = self.sampling_rate
        feat_extract_kwargs["return_attention_mask"] = self.return_attention_mask
        feat_extract_kwargs["do_normalize"] = self.do_normalize
        return feat_extract_kwargs

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        """Build a batch of random inputs.

        With ``equal_length=True`` every example has ``max_seq_length`` values;
        otherwise lengths run from ``min_seq_length`` towards ``max_seq_length`` in
        steps of ``seq_length_diff``. With ``numpify=True`` each example is converted
        with ``np.asarray``.
        """
        if equal_length:
            batch = floats_list((self.batch_size, self.max_seq_length))
        else:
            batch = []
            for n_frames in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff):
                frames = floats_list((n_frames, self.feature_size))
                # concatenate the per-frame rows into one flat list
                batch.append(list(itertools.chain(*frames)))

        if not numpify:
            return batch

        return [np.asarray(example) for example in batch]
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2FeatureExtractionTest(FeatureExtractionMixin, unittest.TestCase):
    """Wav2Vec2-specific feature extraction tests, on top of the shared mixin tests."""

    feature_extraction_class = Wav2Vec2FeatureExtractor

    def setUp(self):
        self.feat_extract_tester = Wav2Vec2FeatureExtractionTester(self)

    def test_call(self):
        """Calling the extractor on lists and on numpy arrays gives matching ``input_values``."""
        extractor_kwargs = self.feat_extract_tester.prepare_feat_extract_dict()
        feat_extract = self.feature_extraction_class(**extractor_kwargs)
        # create three inputs of length 800, 1000, and 1200
        list_inputs = [floats_list((1, length))[0] for length in range(800, 1400, 200)]
        array_inputs = [np.asarray(sample) for sample in list_inputs]

        # Test not batched input
        from_list = feat_extract(list_inputs[0], return_tensors="np").input_values
        from_array = feat_extract(array_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(from_list, from_array, atol=1e-3))

        # Test batched
        from_list = feat_extract(list_inputs, return_tensors="np").input_values
        from_array = feat_extract(array_inputs, return_tensors="np").input_values
        for list_row, array_row in zip(from_list, from_array):
            self.assertTrue(np.allclose(list_row, array_row, atol=1e-3))

    def test_zero_mean_unit_variance_normalization(self):
        """The unpadded part of each example has mean ~0 and variance ~1 after extraction."""
        extractor_kwargs = self.feat_extract_tester.prepare_feat_extract_dict()
        feat_extract = self.feature_extraction_class(**extractor_kwargs)
        speech_inputs = [floats_list((1, length))[0] for length in range(800, 1400, 200)]
        input_values = feat_extract(speech_inputs, padding="longest").input_values

        # rows 0 and 1 are cut back to their original 800 / 1000 values; row 2 is the longest
        unpadded_rows = [input_values[0, :800], input_values[1, :1000], input_values[2]]
        for row in unpadded_rows:
            self.assertTrue(np.abs(np.mean(row)) < 1e-3)
            self.assertTrue(np.abs(np.var(row) - 1) < 1e-3)

    @slow
    def test_pretrained_checkpoints_are_set_correctly(self):
        """Checkpoints return an attention mask exactly when ``feat_extract_norm`` is ``"layer"``."""
        # this test makes sure that models that are using
        # group norm don't have their feature extractor return the
        # attention_mask
        for checkpoint in WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST:
            model_config = Wav2Vec2Config.from_pretrained(checkpoint)
            feat_extract = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

            # only "layer" feature extraction norm should make use of
            # attention_mask
            uses_layer_norm = model_config.feat_extract_norm == "layer"
            self.assertEqual(feat_extract.return_attention_mask, uses_layer_norm)
|
||||||
@@ -29,7 +29,7 @@ from .test_modeling_common import ModelTesterMixin, _config_zero_init
|
|||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2ForMaskedLM, Wav2Vec2Model, Wav2Vec2Tokenizer
|
from transformers import Wav2Vec2Config, Wav2Vec2ForCTC, Wav2Vec2ForMaskedLM, Wav2Vec2Model, Wav2Vec2Processor
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2ModelTester:
|
class Wav2Vec2ModelTester:
|
||||||
@@ -324,17 +324,16 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
|
|||||||
def test_inference_ctc_normal(self):
|
def test_inference_ctc_normal(self):
|
||||||
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
|
||||||
|
|
||||||
input_speech = self._load_datasamples(1)
|
input_speech = self._load_datasamples(1)
|
||||||
|
|
||||||
input_values = tokenizer(input_speech, return_tensors="pt").input_values.to(torch_device)
|
input_values = processor(input_speech, return_tensors="pt").input_values.to(torch_device)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
logits = model(input_values).logits
|
logits = model(input_values).logits
|
||||||
|
|
||||||
predicted_ids = torch.argmax(logits, dim=-1)
|
predicted_ids = torch.argmax(logits, dim=-1)
|
||||||
predicted_trans = tokenizer.batch_decode(predicted_ids)
|
predicted_trans = processor.batch_decode(predicted_ids)
|
||||||
|
|
||||||
EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
|
EXPECTED_TRANSCRIPTIONS = ["a man said to the universe sir i exist"]
|
||||||
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
|
self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
|
||||||
@@ -342,11 +341,11 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
|
|||||||
def test_inference_ctc_normal_batched(self):
|
def test_inference_ctc_normal_batched(self):
|
||||||
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
|
||||||
model.to(torch_device)
|
model.to(torch_device)
|
||||||
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", do_lower_case=True)
|
||||||
|
|
||||||
input_speech = self._load_datasamples(2)
|
input_speech = self._load_datasamples(2)
|
||||||
|
|
||||||
inputs = tokenizer(input_speech, return_tensors="pt", padding=True, truncation=True)
|
inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True)
|
||||||
|
|
||||||
input_values = inputs.input_values.to(torch_device)
|
input_values = inputs.input_values.to(torch_device)
|
||||||
|
|
||||||
@@ -354,7 +353,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
|
|||||||
logits = model(input_values).logits
|
logits = model(input_values).logits
|
||||||
|
|
||||||
predicted_ids = torch.argmax(logits, dim=-1)
|
predicted_ids = torch.argmax(logits, dim=-1)
|
||||||
predicted_trans = tokenizer.batch_decode(predicted_ids)
|
predicted_trans = processor.batch_decode(predicted_ids)
|
||||||
|
|
||||||
EXPECTED_TRANSCRIPTIONS = [
|
EXPECTED_TRANSCRIPTIONS = [
|
||||||
"a man said to the universe sir i exist",
|
"a man said to the universe sir i exist",
|
||||||
@@ -364,11 +363,11 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
|
|||||||
|
|
||||||
def test_inference_ctc_robust_batched(self):
|
def test_inference_ctc_robust_batched(self):
|
||||||
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(torch_device)
|
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self").to(torch_device)
|
||||||
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)
|
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", do_lower_case=True)
|
||||||
|
|
||||||
input_speech = self._load_datasamples(4)
|
input_speech = self._load_datasamples(4)
|
||||||
|
|
||||||
inputs = tokenizer(input_speech, return_tensors="pt", padding=True, truncation=True)
|
inputs = processor(input_speech, return_tensors="pt", padding=True, truncation=True)
|
||||||
|
|
||||||
input_values = inputs.input_values.to(torch_device)
|
input_values = inputs.input_values.to(torch_device)
|
||||||
attention_mask = inputs.attention_mask.to(torch_device)
|
attention_mask = inputs.attention_mask.to(torch_device)
|
||||||
@@ -377,7 +376,7 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase):
|
|||||||
logits = model(input_values, attention_mask=attention_mask).logits
|
logits = model(input_values, attention_mask=attention_mask).logits
|
||||||
|
|
||||||
predicted_ids = torch.argmax(logits, dim=-1)
|
predicted_ids = torch.argmax(logits, dim=-1)
|
||||||
predicted_trans = tokenizer.batch_decode(predicted_ids)
|
predicted_trans = processor.batch_decode(predicted_ids)
|
||||||
|
|
||||||
EXPECTED_TRANSCRIPTIONS = [
|
EXPECTED_TRANSCRIPTIONS = [
|
||||||
"a man said to the universe sir i exist",
|
"a man said to the universe sir i exist",
|
||||||
|
|||||||
@@ -16,9 +16,9 @@ from typing import List, Optional
|
|||||||
from unittest import mock
|
from unittest import mock
|
||||||
|
|
||||||
from transformers import is_tf_available, is_torch_available, pipeline
|
from transformers import is_tf_available, is_torch_available, pipeline
|
||||||
|
from transformers.file_utils import to_py_obj
|
||||||
from transformers.pipelines import Pipeline
|
from transformers.pipelines import Pipeline
|
||||||
from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow
|
from transformers.testing_utils import _run_slow_tests, is_pipeline_test, require_tf, require_torch, slow
|
||||||
from transformers.tokenization_utils_base import to_py_obj
|
|
||||||
|
|
||||||
|
|
||||||
VALID_INPUTS = ["A simple string", ["list of strings"]]
|
VALID_INPUTS = ["A simple string", ["list of strings"]]
|
||||||
|
|||||||
139
tests/test_processor_wav2vec2.py
Normal file
139
tests/test_processor_wav2vec2.py
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers.file_utils import FEATURE_EXTRACTOR_NAME
|
||||||
|
from transformers.models.wav2vec2 import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
|
||||||
|
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
|
||||||
|
|
||||||
|
from .test_feature_extraction_wav2vec2 import floats_list
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2ProcessorTest(unittest.TestCase):
    """Checks that ``Wav2Vec2Processor`` behaves like the tokenizer and feature extractor it is built from.

    Each test works against a temporary directory that ``setUp`` populates with a small vocabulary file and a
    feature extractor configuration file, and that ``tearDown`` removes again.
    """

    def setUp(self):
        """Create a temporary directory holding a vocab file and a feature extractor config."""
        vocab = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        vocab_tokens = dict(zip(vocab, range(len(vocab))))

        # Default special tokens handed to every tokenizer built by ``get_tokenizer``.
        self.add_kwargs_tokens_map = {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        }
        feature_extractor_map = {
            "feature_size": 1,
            "padding_value": 0.0,
            "sampling_rate": 16000,
            "return_attention_mask": False,
            "do_normalize": True,
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")

        with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(feature_extractor_map) + "\n")

    def get_tokenizer(self, **kwargs):
        """Load a ``Wav2Vec2CTCTokenizer`` from the temporary directory.

        The default special tokens from ``self.add_kwargs_tokens_map`` are applied first, so that any keyword
        argument passed by the caller (e.g. ``bos_token="(BOS)"``) overrides the default instead of being
        silently replaced by it.
        """
        tokenizer_kwargs = {**self.add_kwargs_tokens_map, **kwargs}
        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **tokenizer_kwargs)

    def get_feature_extractor(self, **kwargs):
        """Load a ``Wav2Vec2FeatureExtractor`` from the temporary directory, forwarding ``kwargs``."""
        return Wav2Vec2FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)

    def tearDown(self):
        """Remove the temporary directory created in ``setUp``."""
        shutil.rmtree(self.tmpdirname)

    def test_save_load_pretrained_default(self):
        """A saved and reloaded processor exposes the same tokenizer vocab and feature extractor config."""
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()

        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        processor.save_pretrained(self.tmpdirname)
        processor = Wav2Vec2Processor.from_pretrained(self.tmpdirname)

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
        self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
        self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor)

    def test_save_load_pretrained_additional_features(self):
        """Extra kwargs given to ``Wav2Vec2Processor.from_pretrained`` match components loaded with the same kwargs."""
        processor = Wav2Vec2Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
        processor.save_pretrained(self.tmpdirname)

        # Reference components loaded directly with the same overrides the processor receives below.
        tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
        feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)

        processor = Wav2Vec2Processor.from_pretrained(
            self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
        )

        self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
        self.assertIsInstance(processor.tokenizer, Wav2Vec2CTCTokenizer)

        self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
        self.assertIsInstance(processor.feature_extractor, Wav2Vec2FeatureExtractor)

    def test_feature_extractor(self):
        """Calling the processor on raw speech gives the same result as calling the feature extractor."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        raw_speech = floats_list((3, 1000))

        input_feat_extract = feature_extractor(raw_speech, return_tensors="np")
        input_processor = processor(raw_speech, return_tensors="np")

        # Compare the per-key sums within a small tolerance rather than element by element.
        for key in input_feat_extract.keys():
            self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)

    def test_tokenizer(self):
        """Inside ``as_target_processor`` the processor encodes text exactly like the tokenizer."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        input_str = "This is a test string"

        with processor.as_target_processor():
            encoded_processor = processor(input_str)

        encoded_tok = tokenizer(input_str)

        for key in encoded_tok.keys():
            self.assertListEqual(encoded_tok[key], encoded_processor[key])

    def test_tokenizer_decode(self):
        """``processor.batch_decode`` returns the same strings as ``tokenizer.batch_decode``."""
        feature_extractor = self.get_feature_extractor()
        tokenizer = self.get_tokenizer()

        processor = Wav2Vec2Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)

        predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]

        decoded_processor = processor.batch_decode(predicted_ids)
        decoded_tok = tokenizer.batch_decode(predicted_ids)

        self.assertListEqual(decoded_tok, decoded_processor)
|
||||||
@@ -23,11 +23,17 @@ import unittest
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from transformers import WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST
|
from transformers import (
|
||||||
from transformers.models.wav2vec2 import Wav2Vec2Config, Wav2Vec2Tokenizer
|
WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||||
|
Wav2Vec2Config,
|
||||||
|
Wav2Vec2CTCTokenizer,
|
||||||
|
Wav2Vec2Tokenizer,
|
||||||
|
)
|
||||||
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
|
from transformers.models.wav2vec2.tokenization_wav2vec2 import VOCAB_FILES_NAMES
|
||||||
from transformers.testing_utils import slow
|
from transformers.testing_utils import slow
|
||||||
|
|
||||||
|
from .test_tokenization_common import TokenizerTesterMixin
|
||||||
|
|
||||||
|
|
||||||
global_rng = random.Random()
|
global_rng = random.Random()
|
||||||
|
|
||||||
@@ -345,3 +351,101 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
|
|||||||
# only "layer" feature extraction norm should make use of
|
# only "layer" feature extraction norm should make use of
|
||||||
# attention_mask
|
# attention_mask
|
||||||
self.assertEqual(tokenizer.return_attention_mask, config.feat_extract_norm == "layer")
|
self.assertEqual(tokenizer.return_attention_mask, config.feat_extract_norm == "layer")
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    """Runs the shared tokenizer tests plus decoding checks for ``Wav2Vec2CTCTokenizer``."""

    tokenizer_class = Wav2Vec2CTCTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        """Write a small vocabulary file into a fresh temporary directory."""
        super().setUp()

        symbols = "<pad> <s> </s> <unk> | E T A O N I H S R D L U M W C F G Y P B V K ' X J Q Z".split(" ")
        token_to_id = {symbol: index for index, symbol in enumerate(symbols)}

        self.special_tokens_map = {
            "pad_token": "<pad>",
            "unk_token": "<unk>",
            "bos_token": "<s>",
            "eos_token": "</s>",
        }

        self.tmpdirname = tempfile.mkdtemp()
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        serialized_vocab = json.dumps(token_to_id)
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_handle:
            vocab_handle.write(serialized_vocab + "\n")

    def get_tokenizer(self, **kwargs):
        """Load the tokenizer from the temporary directory.

        Entries of ``self.special_tokens_map`` take precedence over same-named keyword arguments from the caller.
        """
        init_kwargs = {**kwargs, **self.special_tokens_map}
        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **init_kwargs)

    def test_tokenizer_decode(self):
        """``decode`` on one sequence agrees with the matching ``batch_decode`` entry."""
        tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")

        first_ids = [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98]
        second_ids = [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77]
        sample_ids = [first_ids, second_ids]

        single_decoded = tokenizer.decode(sample_ids[0])
        batch_decoded = tokenizer.batch_decode(sample_ids)

        self.assertEqual(single_decoded, batch_decoded[0])
        self.assertEqual(batch_decoded, ["HELLO<unk>", "BYE BYE<unk>"])

    def test_tokenizer_decode_special(self):
        """Sequences with repeated ids and extra pad/delimiter tokens decode to the same text."""
        tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")

        sample_ids = [
            [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98],
            [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77],
        ]
        # Same content as above, but with consecutive duplicates, several pad tokens in a row
        # and a trailing word delimiter.
        sample_ids_2 = [
            [11, 5, 5, 5, 5, 5, 15, 15, 15, tokenizer.pad_token_id, 15, 8, 98],
            [24, 22, 5]
            + [tokenizer.pad_token_id, tokenizer.pad_token_id, tokenizer.pad_token_id]
            + [tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.word_delimiter_token_id],
        ]

        decoded = tokenizer.batch_decode(sample_ids)
        decoded_2 = tokenizer.batch_decode(sample_ids_2)

        self.assertEqual(decoded, decoded_2)
        self.assertEqual(decoded, ["HELLO<unk>", "BYE BYE<unk>"])

    def test_tokenizer_decode_added_tokens(self):
        """Tokens added after loading appear in the decoded output."""
        tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h")
        tokenizer.add_tokens(["!", "?"])
        tokenizer.add_special_tokens({"cls_token": "$$$"})

        # NOTE(review): ids 32-34 are presumably the ids assigned to the tokens added above;
        # confirm against the pretrained vocabulary size.
        first_ids = [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98, 32, 32, 33]
        first_ids += [tokenizer.word_delimiter_token_id, 32, 32, 33, 34, 34]
        second_ids = [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34]

        decoded = tokenizer.batch_decode([first_ids, second_ids])

        self.assertEqual(decoded, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"])

    def test_pretrained_model_lists(self):
        # Wav2Vec2Model has no max model length => nothing to test here.
        pass
|
||||||
|
|||||||
@@ -372,6 +372,7 @@ DEPRECATED_OBJECTS = [
|
|||||||
"TextDataset",
|
"TextDataset",
|
||||||
"TextDatasetForNextSentencePrediction",
|
"TextDatasetForNextSentencePrediction",
|
||||||
"Wav2Vec2ForMaskedLM",
|
"Wav2Vec2ForMaskedLM",
|
||||||
|
"Wav2Vec2Tokenizer",
|
||||||
"glue_compute_metrics",
|
"glue_compute_metrics",
|
||||||
"glue_convert_examples_to_features",
|
"glue_convert_examples_to_features",
|
||||||
"glue_output_modes",
|
"glue_output_modes",
|
||||||
|
|||||||
Reference in New Issue
Block a user